Compare commits


58 Commits

Author SHA1 Message Date
Daniel Su
393df3e05c [Ex CI] hipSPARSELt monorepo enablement (#5033) 2025-07-11 16:40:18 -04:00
Daniel Su
aa3cdcb3c3 [Ex CI] increase hipSPARSELt test timeout (#5028) 2025-07-10 12:04:06 -04:00
Pratik Basyal
e8bb027c20 HIP 7.0 upcoming changes blog link updated (#5021) 2025-07-10 09:53:44 -04:00
Pratik Basyal
544186aef8 ROCm for HPC table update for Develop (#5015) (#5016) (#5019)
* ROCm for HPC table update for 6.4.0 (#5015) (#5016)

* 6.4.0 updates synced

* Minor change

* Link update
2025-07-09 14:57:53 -04:00
Peter Park
22524eeaa5 fix xrefs in vllm-0.9.0.1-20250605.rst (#5017) 2025-07-09 14:38:24 -04:00
Peter Park
d471b04cd5 Update vLLM Docker doc for 07/02 2025-07-09 11:38:27 -04:00
Di Nguyen
1c7cff8a47 Merge pull request #5011 from ROCm/zenguyen/disable-device-merge-inplace-rocprim
[rocPRIM] Disable device_merge_inplace unit test for rocPRIM
2025-07-09 09:12:08 -06:00
Daniel Su
84c664074f [Ex CI] add OS to copyHIP filenames (#5012) 2025-07-09 10:37:23 -04:00
NguyenNhuDi
7c6083d840 disabled device_merge_inplace 2025-07-08 14:08:53 -06:00
Daniel Su
94099b1398 [Ex CI] rocPyDecode: fix test running (#5002) 2025-07-08 14:32:30 -04:00
Peter Park
3b3fc4894b Fix xrefs and Sphinx warnings in documentation
2025-07-08 13:22:53 -04:00
Daniel Su
8aba1d2318 [Ex CI] fix printed artifact download links (#4998) 2025-07-04 14:41:33 -04:00
Mirza Halilčević
e9e75cfc46 Merge pull request #4963 from ROCm/pybind11
Add pybind11 as a pip module requirement for azure
2025-07-04 13:35:24 +02:00
Peter Park
58b3ad0509 Fix Docker run commands in Megatron-LM Docker doc (#4996)
* fix megatron-lm docker run commands

* update --shm-size option
2025-07-02 14:19:27 -04:00
Daniel Su
523d8520f3 [Ex CI] rocBLAS: increase test timeout to 2 hours (#4995) 2025-07-02 12:16:50 -04:00
Peter Park
d0c8ba0805 Add Wan2.1 to PyTorch inference Docker documentation (#4984)
* add wan2.1 to pyt inference models

* update group name

* fix container tag

* fix group name

* change documented data type to bfloat16

* fix col width
2025-07-02 09:58:37 -04:00
ammallya
73de8a3e46 Removing failing checkout step 2025-07-01 11:25:17 -07:00
Daniel Su
1fc312f90f [Ex CI] fix hardcoded gfx in MIOpen CK script (#4993) 2025-06-30 15:34:54 -04:00
Daniel Su
fde2647ccd [Ex CI] migrate rocBLAS to monorepo (#4987) 2025-06-30 15:16:58 -04:00
Daniel Su
798c8debb5 [Ex CI] consolidate artifact extraction and deletion in deps-rocm (#4961) 2025-06-30 14:12:52 -04:00
dependabot[bot]
393ba600c2 Build(deps): Bump sphinx-sitemap from 2.6.0 to 2.7.2 in /docs/sphinx (#4985)
Bumps [sphinx-sitemap](https://github.com/jdillard/sphinx-sitemap) from 2.6.0 to 2.7.2.
- [Release notes](https://github.com/jdillard/sphinx-sitemap/releases)
- [Changelog](https://github.com/jdillard/sphinx-sitemap/blob/master/CHANGELOG.rst)
- [Commits](https://github.com/jdillard/sphinx-sitemap/compare/v2.6.0...v2.7.2)

---
updated-dependencies:
- dependency-name: sphinx-sitemap
  dependency-version: 2.7.2
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-30 09:33:28 -06:00
Daniel Su
c64c545b52 [Ex CI] hipBLASLt: build some archs on medium pool (#4986) 2025-06-30 11:32:35 -04:00
Daniel Su
76ee1d720f [Ex CI] rocAL: switch to medium pool (#4983) 2025-06-27 13:41:07 -04:00
Daniel Su
5adc040367 [Ex CI] migrate hipBLAS-common & hipBLASLt pipeline IDs (#4982) 2025-06-27 12:09:58 -04:00
Daniel Su
061da8f306 [Ex CI] enable almalinux8 and gfx1100 builds for hipBLASLt, rocBLAS, rocSOLVER (#4955) 2025-06-27 10:39:30 -04:00
Daniel Su
e26767bca6 [Ex CI] Tensile: add boost filesystem (#4980) 2025-06-27 10:38:31 -04:00
Daniel Su
7b6f1800d4 [Ex CI] fix miopen-get-ck for new artifact naming scheme (#4979) 2025-06-26 15:49:13 -04:00
Pratik Basyal
a6221937f2 KMD UMD support footnote update ROCm 640 (#4973) (#4976)
* KMD UMD support footnote update ROCm 640

* Historical footnote
2025-06-26 15:34:21 -04:00
Daniel Su
ac2df2961d [Ex CI] add component name to artifact download filter (#4974) 2025-06-26 13:55:03 -04:00
Mirza Halilcevic
9b102061f4 Add pybind11 as a pip module requirement for azure. 2025-06-24 08:06:52 -05:00
Daniel Su
f20e8dec8b [Ex CI] revert PRIM default branch to develop (#4960) 2025-06-23 16:35:02 -04:00
Daniel Su
10e9157f39 [Ex CI] allow rerun jobs to upload artifacts (#4959) 2025-06-23 15:37:52 -04:00
Daniel Su
a2ce6021cb [Ex CI] add more OSs to nightly build (#4958) 2025-06-23 15:13:11 -04:00
Peter Park
2196fc9a2f Fix pytorch training 25.6 doc (#4956)
* fix pytorch-training history

* fix pytorch-training

fix
2025-06-23 13:45:50 -04:00
Daniel Su
925689f89e [Ex CI] enable gfx1100 builds (#4954) 2025-06-23 11:26:35 -04:00
Peter Park
91a541f8b9 Update PyTorch training benchmark doc for v25.6 (#4950)
* update pytorch-training docker details

* add previous version

* add models data

* update models data id

* add models picker

* update data

* update fmt

fmt

* update data yaml

* update template

* update data

* fix

* fix vllm-0.6.4 broken link

* fix vllm history
2025-06-23 09:26:15 -04:00
Peter Park
34f8d57ece Organize version histories in ROCm for AI benchmark Docker docs (#4948)
* add vllm 0.8.3 20250415

update prev versions table

* add vllm previous versions page

* move index to vllm-history

* add standalone megatron-lm version history

* add pytorch training version history

* fix

* add vllm-0.4.3

* add vllm-0.6.4

* update vllm-history

* add vllm-0.7.3

* add vllm-0.6.6

* add notes

* fix vllm readme links

fix main page link

* add latest version to previous versions list

* add jax-maxtext history

* fix jax-maxtext history

* add pytorch-training history

* add link in jax-maxtext 25.4

* add megatron-lm history

* fix datatemplate path for vllm 0.8.3

* fix jax-maxtext history link

* update note about performance measurements

* add vllm 0.8.5_20250521 previous version

* consistency fixes
2025-06-20 15:01:38 -04:00
yugang-amd
55f95adc7c Update for vllm -06/10 (#4943) 2025-06-20 08:41:37 -04:00
Daniel Su
e05b1702d8 [Ex CI] fix experimental HIP to CLR triggers (#4946) 2025-06-19 12:56:53 -04:00
Daniel Su
4179042cf7 [Ex CI] add multi-OS support to copyHIP (#4945) 2025-06-19 12:15:22 -04:00
dependabot[bot]
ae2de81b79 Build(deps): Bump urllib3 from 2.4.0 to 2.5.0 in /docs/sphinx (#4942)
Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.4.0 to 2.5.0.
- [Release notes](https://github.com/urllib3/urllib3/releases)
- [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst)
- [Commits](https://github.com/urllib3/urllib3/compare/2.4.0...2.5.0)

---
updated-dependencies:
- dependency-name: urllib3
  dependency-version: 2.5.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-19 09:03:29 -06:00
Daniel Su
efd6cec4a4 [Ex CI] disable downstream triggers for mathlibs not yet migrated (#4936) 2025-06-18 14:10:58 -04:00
Daniel Su
b65996587f [Ex CI] remove ALLOWED_PARTIAL_SUCCEED_BUILDS library variable (#4937) 2025-06-18 12:10:04 -04:00
yugang-amd
7b7eaf69f2 remove broken xref (#4939) 2025-06-18 10:15:53 -04:00
Daniel Su
4cfc8ddad2 [Ex CI] MIVisionX: add hipBLASLt to build deps (#4931) 2025-06-17 13:40:35 -04:00
Daniel Su
97ebbb227d [Ex CI] rocprof-sdk: add cmake, libsqlite3-dev (#4935) 2025-06-17 13:40:15 -04:00
Daniel Su
8c6a1726fe [Ex CI] remove old aqlprofile param in Pytorch (#4927) 2025-06-16 15:17:23 -04:00
Daniel Su
2656143c9e [Ex CI] fix ROCm versions (#4930) 2025-06-16 11:42:51 -04:00
Daniel Su
7910841c94 [Ex CI] rccl: use vendored gtest, use GPU_TARGETS flag (#4929) 2025-06-16 11:35:20 -04:00
Daniel Su
30fec8f74a [Ex CI] update ROCm versioning (#4928) 2025-06-16 11:31:19 -04:00
Daniel Su
1923f801e0 [Ex CI] fix hipRAND multi-OS tests, Tensile sparse dir (#4923) 2025-06-13 16:21:13 -04:00
Peter Park
d69037bfcc Fix Sphinx issue in vllm-benchmark 0.8.5-20250513 previous version (#4924)
* fix sphinx issue in vllm-benchmark 0.8.5-20250513 previous version

* update article_info in conf.py

* update rocm/vllm
2025-06-13 15:03:51 -04:00
Daniel Su
7ac6aa4084 [Ex CI] add OS support to monorepo downstream triggers (#4920) 2025-06-13 12:26:05 -04:00
Daniel Su
14f3c42320 [Ex CI] Tensile almalinux8 builds (#4915) 2025-06-12 16:43:55 -04:00
Daniel Su
67be6f6249 [Ex CI] migrate roc/hipRAND pipelines, change migrated mathlibs' default branch to rocm-rel-7.0 (#4918)
* [Ex CI] migrate roc/hipRAND pipeline IDs to monorepo

* [Ex CI] change migrated mathlibs' default branch to rocm-rel-7.0
2025-06-12 15:39:41 -04:00
powderluv
2502fc5bcf Update README.md to point to TheRock (#4907)
* Update README.md to point to TheRock

Point to TheRock build system to build ROCm

* Update README.md

---------

Co-authored-by: David Galiffi <dgaliffi@amd.com>
Co-authored-by: alexxu-amd <159800977+alexxu-amd@users.noreply.github.com>
2025-06-12 10:44:34 -04:00
Pratik Basyal
61c6749a10 Link to 6.4.1 updated from internal to public (#4913) 2025-06-10 16:59:52 -04:00
Daniel Su
8e8104c811 [Ex CI] add new rocprof-compute pip packages (#4905) 2025-06-10 16:06:51 -04:00
96 changed files with 7983 additions and 1373 deletions

View File

@@ -51,7 +51,7 @@ parameters:
# HIP with AMD backend
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hip_clr_combined_amd_${{ job.os }}
- job: hip_clr_combined_${{ job.os }}_amd
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
@@ -121,7 +121,7 @@ jobs:
# HIP with Nvidia backend
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hip_clr_combined_nvidia_${{ job.os }}
- job: hip_clr_combined_${{ job.os }}_nvidia
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
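Azure Pipelines expands ${{ each }} loops when the template is compiled, so this rename changes the concrete job IDs that any dependsOn entry must match. A minimal sketch of the expansion, with an illustrative two-entry matrix rather than the pipeline's real one:

parameters:
- name: jobMatrix
  type: object
  default:
    buildJobs:
    - { os: ubuntu2204 }
    - { os: almalinux8 }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  # expands to hip_clr_combined_ubuntu2204_amd and hip_clr_combined_almalinux8_amd
  - job: hip_clr_combined_${{ job.os }}_amd
    steps:
    - script: echo "AMD backend build on ${{ job.os }}"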

View File

@@ -43,18 +43,20 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- AMDMIGraphX
- clr
- half
- hipBLAS-common
- hipBLASLt
- llvm-project
- MIOpen
- rocBLAS
- rocDecode
- rocm-cmake
- rocminfo
- rocprofiler-register
- half
- rocBLAS
- MIOpen
- AMDMIGraphX
- ROCR-Runtime
- rpp
- rocDecode
- name: rocmTestDependencies
type: object
default:
@@ -90,8 +92,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -138,7 +138,6 @@ jobs:
runRocminfo: false
- task: Bash@3
displayName: Build kfdtest
continueOnError: true
inputs:
targetType: 'inline'
workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
@@ -158,7 +157,6 @@ jobs:
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocrtst
continueOnError: true
inputs:
targetType: 'inline'
workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common

View File

@@ -55,8 +55,6 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml

View File

@@ -86,8 +86,7 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
- name: HIP_INC_DIR
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: Tensile
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -13,10 +32,10 @@ parameters:
- name: aptPackages
type: object
default:
- python3-pip
- cmake
- libmsgpack-dev
- libboost-filesystem-dev
- libboost-program-options-dev
- libmsgpack-dev
- name: pipModules
type: object
default:
@@ -38,75 +57,97 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: Tensile_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- task: Bash@3
displayName: Create wheel file
inputs:
targetType: inline
script: python3 setup.py bdist_wheel
workingDirectory: $(Build.SourcesDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/dist
contentsString: '*.whl'
targetDir: $(Build.ArtifactStagingDirectory)
clean: false
- task: PublishPipelineArtifact@1
displayName: 'wheel file Publish'
retryCountOnTaskFailure: 3
inputs:
targetPath: $(Build.ArtifactStagingDirectory)
- task: Bash@3
displayName: Save pipeline artifact file names
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
if [ -n "$whlFile" ]; then
echo $(basename "$whlFile") >> pipelineArtifacts.txt
fi
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- task: Bash@3
displayName: Create wheel file
inputs:
targetType: inline
script: python3 setup.py bdist_wheel
workingDirectory: $(Agent.BuildDirectory)/s
- task: Bash@3
displayName: Rename wheel file with job OS
inputs:
targetType: inline
workingDirectory: $(Agent.BuildDirectory)/s
script: |
wheelFile=$(find "$(Agent.BuildDirectory)/s/dist" -type f -name "*.whl" | head -n 1)
newWheelFile="$(basename "$wheelFile" .whl)-${{ job.os }}.whl"
mv "$wheelFile" "$(dirname "$wheelFile")/$newWheelFile"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Agent.BuildDirectory)/s/dist
contentsString: '*.whl'
targetDir: $(Build.ArtifactStagingDirectory)
clean: false
- task: PublishPipelineArtifact@1
displayName: 'wheel file Publish'
retryCountOnTaskFailure: 3
inputs:
targetPath: $(Build.ArtifactStagingDirectory)
- task: Bash@3
displayName: Save pipeline artifact file names
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
if [ -n "$whlFile" ]; then
echo $(basename "$whlFile") >> pipelineArtifacts.txt
fi
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: Tensile_test_${{ job.target }}
- job: Tensile_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 180
dependsOn: Tensile_build
dependsOn: Tensile_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -126,20 +167,23 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
inputs:
itemPattern: '**/*.whl'
itemPattern: '**/*${{ job.os }}*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: pip install
@@ -164,7 +208,7 @@ jobs:
inputs:
targetType: inline
script: tox run -v -e ci -- -m pre_checkin
workingDirectory: $(Build.SourcesDirectory)
workingDirectory: $(Agent.BuildDirectory)/s
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -104,7 +104,7 @@ jobs:
parameters:
componentName: amdsmi
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/share/amd_smi/tests/amdsmitst'
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml

View File

@@ -78,8 +78,6 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml

View File

@@ -1,36 +1,44 @@
parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: jobMatrix
type: object
default:
copyJobs:
- { os: ubuntu2204, backend: amd }
- { os: almalinux8, backend: amd }
- { os: ubuntu2204, backend: nvidia }
- { os: almalinux8, backend: nvidia }
# hip and clr are tightly-coupled
# run this same template for both repos
# any changes for clr should just trigger HIP pipeline
jobs:
- job: hip_clr_combined
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
# checkout nothing, just copy artifacts from triggering HIP job
# and then publish for this clr job or for this hipother job to maintain latest
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
parameters:
componentName: HIP
pipelineId: $(HIP_PIPELINE_ID)
- task: Bash@3
displayName: Copy HIP artifacts
inputs:
targetType: inline
script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.copyJobs }}:
- job: hip_clr_combined_${{ job.os }}_${{ job.backend }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
# checkout nothing, just copy artifacts from triggering HIP job
# and then publish for this clr job or for this hipother job to maintain latest
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
parameters:
componentName: HIP
pipelineId: $(HIP_PIPELINE_ID)
fileFilter: ${{ job.os }}*${{ job.backend }}
- task: Bash@3
displayName: Copy HIP artifacts
inputs:
targetType: inline
script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
inputs:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
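The fileFilter passed above narrows which HIP artifacts each copy job republishes. Assuming the artifact-download template feeds that filter into a DownloadPipelineArtifact glob (an assumption; the template body is not shown in this diff), the ubuntu2204/amd job would resolve to something like:

- task: DownloadPipelineArtifact@2
  displayName: Download HIP artifacts for this OS/backend
  inputs:
    # hypothetical glob derived from fileFilter: ${{ job.os }}*${{ job.backend }}
    itemPattern: '**/ubuntu2204*amd*'
    targetPath: $(Agent.BuildDirectory)/rocm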

View File

@@ -59,16 +59,15 @@ parameters:
sparseCheckoutDir: projects/hipblaslt
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- hipBLAS_common
gfx90a:
- hipBLAS_common
- hipBLAS_common_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipBLAS_common_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
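The new form above treats buildDependsOn as a flat list and appends the job's OS at expansion time, instead of indexing a per-target map. A minimal sketch of the mechanism (component and job names are illustrative):

parameters:
- name: buildDependsOn
  type: object
  default:
  - hipBLAS_common_build

jobs:
- job: hipBLASLt_build_ubuntu2204
  dependsOn:
  - ${{ each build in parameters.buildDependsOn }}:
    # expands to hipBLAS_common_build_ubuntu2204
    - ${{ build }}_ubuntu2204
  steps:
  - script: echo "runs after all listed builds for this OS"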

View File

@@ -32,6 +32,8 @@ parameters:
- name: aptPackages
type: object
default:
- ccache
- gfortran
- git
- libdrm-dev
- libmsgpack-dev
@@ -39,9 +41,6 @@ parameters:
- ninja-build
- python3-pip
- python3-venv
- gfortran
- libblas-dev
- ccache
- name: pipModules
type: object
default:
@@ -78,15 +77,19 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { pool: rocm-ci_ultra_build_pool, os: ubuntu2204, packageManager: apt, target: gfx942 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx90a }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { pool: rocm-ci_ultra_build_pool, os: almalinux8, packageManager: dnf, target: gfx942 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx90a }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1201 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1100 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
@@ -95,17 +98,16 @@ parameters:
sparseCheckoutDir: projects/rocblas
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- hipBLASLt_build_gfx942
gfx90a:
- hipBLASLt_build_gfx90a
- hipBLASLt_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 300
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -119,7 +121,11 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
- name: DAY_STRING
value: $[format('{0:ddMMyyyy}', pipeline.startTime)]
pool: ${{ variables.ULTRA_BUILD_POOL }}
pool: ${{ job.pool }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -127,16 +133,22 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
@@ -148,22 +160,17 @@ jobs:
script: |
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
# Build and install gtest, lapack, hipBLAS-common
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
- script: mkdir $(Pipeline.Workspace)/deps
displayName: Create temp folder for external dependencies
# hipBLASLt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
- script: cmake $(Pipeline.Workspace)/s/deps
displayName: Configure hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: make
displayName: Build hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: sudo make install
displayName: Install hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- task: Bash@3
displayName: Build and install LAPACK
inputs:
targetType: inline
script: |
mkdir -p $(Agent.BuildDirectory)/temp-deps
cd $(Agent.BuildDirectory)/temp-deps
# position-independent LAPACK is required for almalinux8 builds
cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
make
sudo make install
- script: |
mkdir -p $(CCACHE_DIR)
echo "##vso[task.prependpath]/usr/lib/ccache"
@@ -171,58 +178,58 @@ jobs:
- task: Cache@2
displayName: Ccache caching
inputs:
key: hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
key: hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
path: $(CCACHE_DIR)
restoreKeys: |
hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING)
hipBLASLt | $(Agent.OS) | ${{ job.target }}
hipBLASLt | $(Agent.OS)
hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
hipBLASLt | ${{ job.os }} | ${{ job.target }}
hipBLASLt | ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-DCMAKE_C_COMPILER_LAUNCHER=ccache
-DAMDGPU_TARGETS=${{ job.target }}
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_LIBRARY_FORMAT=msgpack
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DBUILD_CLIENTS_TESTS=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
installLatestCMake: true
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
extraCopyDirectories:
- deps
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
installLatestCMake: true
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
extraCopyDirectories:
- deps
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 300
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -238,6 +245,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -246,12 +254,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -259,6 +271,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipblaslt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
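On the ccache key change higher up in this file: Cache@2 tries restoreKeys in order after an exact key miss, so keying on ${{ job.os }} instead of $(Agent.OS) keeps ubuntu2204 and almalinux8 caches distinct while still allowing partial fallback. A simplified sketch (the real key also includes the compiler path):

- task: Cache@2
  displayName: Ccache caching
  inputs:
    # most specific first: component | OS | GPU target | day stamp
    key: hipBLASLt | ubuntu2204 | gfx942 | $(DAY_STRING)
    restoreKeys: |
      hipBLASLt | ubuntu2204 | gfx942
      hipBLASLt | ubuntu2204
    path: $(CCACHE_DIR)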

View File

@@ -61,12 +61,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
@@ -76,7 +76,9 @@ jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml

View File

@@ -82,7 +82,9 @@ jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.target }} # todo: add OS
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -154,6 +156,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -72,24 +72,23 @@ parameters:
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
- rocFFT:
name: rocFFT
sparseCheckoutDir: projects/rocfft
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- hipRAND_build_gfx942
gfx90a:
- hipRAND_build_gfx90a
# - name: downstreamComponentMatrix
# type: object
# default:
# - rocFFT:
# name: rocFFT
# sparseCheckoutDir: projects/rocfft
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - hipRAND_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -184,6 +183,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -206,14 +206,14 @@ jobs:
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -70,8 +70,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipSPARSELt
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -64,7 +83,11 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipSPARSELt_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -91,12 +114,15 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
# Build and install gtest and lapack
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
@@ -131,8 +157,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: ${{ job.target }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -150,44 +178,49 @@ jobs:
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
installLatestCMake: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipSPARSELt_test_${{ job.target }}
dependsOn: hipSPARSELt_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipSPARSELt
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipsparselt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_ubuntu2204_${{ job.target }}
timeoutInMinutes: 120
dependsOn: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipsparselt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
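The ${{ if eq(parameters.unifiedBuild, False) }} guard wrapping the test jobs above is resolved at template-expansion time, so unified builds never materialize the test jobs at all, as opposed to skipping them with a runtime condition. A minimal sketch of the same gating (job body is illustrative):

parameters:
- name: unifiedBuild
  type: boolean
  default: false

jobs:
- ${{ if eq(parameters.unifiedBuild, False) }}:
  - job: component_test_gfx942
    steps:
    - script: echo "present only when unifiedBuild is false"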

View File

@@ -67,7 +67,6 @@ jobs:
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
skipLlvmSymlink: true
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml

View File

@@ -15,7 +15,6 @@ parameters:
default:
- cmake
- git
- googletest
- libboost-program-options-dev
- libdrm-dev
- libfftw3-dev
@@ -90,6 +89,10 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
submoduleBehaviour: recursive
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -101,12 +104,11 @@ jobs:
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
-DCMAKE_BUILD_TYPE=Release
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TESTS=ON
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake;$(Agent.BuildDirectory)/rocm/libexec/hipify
-DAMDGPU_TARGETS=${{ job.target }}
-DGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:

View File

@@ -86,8 +86,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -73,8 +73,7 @@ jobs:
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -33,17 +33,15 @@ parameters:
type: object
default:
- cmake
- ninja-build
- python3-venv
- git
- libmsgpack-dev
- gfortran
- libopenblas-dev
- googletest
- libgtest-dev
- wget
- python3-pip
- libdrm-dev
- libmsgpack-dev
- libopenblas-dev
- ninja-build
- python3-pip
- python3-venv
- wget
- name: pipModules
type: object
default:
@@ -52,18 +50,17 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- rocprofiler-register
- rocm_smi_lib
- rocm-core
- aomp
- aomp-extras
- clr
- hipBLAS-common
- hipBLASLt
- llvm-project
- rocm-cmake
- rocm-core
- rocm_smi_lib
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- roctracer
- name: rocmTestDependencies
type: object
@@ -83,44 +80,51 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
- rocSOLVER:
name: rocSOLVER
sparseCheckoutDir: projects/rocsolver
# technically hipSPARSELt is a downstream component of hipSPARSE
# since hipSPARSE is not yet enabled, we will trigger it from rocBLAS in the interim
- hipSPARSELt:
name: hipSPARSELt
sparseCheckoutDir: projects/hipsparselt
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocBLAS_build_gfx942
gfx90a:
- rocBLAS_build_gfx90a
unifiedBuild:
downstreamAggregateNames: rocBLAS+rocPRIM
buildDependsOn:
gfx942:
- rocBLAS_build_gfx942
- rocPRIM_build_gfx942
gfx90a:
- rocBLAS_build_gfx90a
- rocPRIM_build_gfx90a
- rocBLAS_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocBLAS_build
# unifiedBuild:
# downstreamAggregateNames: rocBLAS+rocPRIM
# buildDependsOn:
# - rocBLAS_build
# - rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -133,6 +137,10 @@ jobs:
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -140,6 +148,7 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -147,59 +156,62 @@ jobs:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_TOOLCHAIN_FILE=toolchain-linux.cmake
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
-DGPU_TARGETS=${{ job.target }}
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LOGIC=asm_full
-DTensile_SEPARATE_ARCHITECTURES=ON
-DTensile_LAZY_LIBRARY_LOADING=ON
-DTensile_LIBRARY_FORMAT=msgpack
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
installAOCL: true
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
installAOCL: true
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -213,6 +225,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -221,12 +234,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -234,6 +251,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocblas-test'
testParameters: '--yaml rocblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
@@ -251,11 +269,11 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
${{ if parameters.unifiedBuild }}:
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
${{ else }}:
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
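The closing hunk selects a downstream template's parameters with compile-time ${{ if }}/${{ else }} blocks, so unified and standalone builds hand different dependsOn lists to the same component template. A minimal sketch of that construct, with illustrative component names (the real entries come from the downstreamComponentMatrix earlier in the file):

- template: /.azuredevops/components/hipSPARSELt.yml@pipelines_repo
  parameters:
    ${{ if parameters.unifiedBuild }}:
      buildDependsOn:
      - rocBLAS_build
      - rocPRIM_build
    ${{ else }}:
      buildDependsOn:
      - rocBLAS_build
    triggerDownstreamJobs: true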

View File

@@ -78,24 +78,23 @@ parameters:
target: gfx942
- gfx90a:
target: gfx90a
- name: downstreamComponentMatrix
type: object
default:
- hipFFT:
name: hipFFT
sparseCheckoutDir: projects/hipfft
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocFFT_build_gfx942
gfx90a:
- rocFFT_build_gfx90a
# - name: downstreamComponentMatrix
# type: object
# default:
# - hipFFT:
# name: hipFFT
# sparseCheckoutDir: projects/hipfft
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocFFT_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }} # todo: un-hardcode OS
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -167,6 +166,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -196,14 +196,14 @@ jobs:
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -27,6 +27,7 @@ parameters:
- numpy
- tomli
- scipy
- pybind11
- name: rocmDependencies
type: object
default:

View File

@@ -60,12 +60,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 1, shardCount: 3 }
@@ -82,36 +82,29 @@ parameters:
sparseCheckoutDir: projects/rocthrust
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocPRIM_build_gfx942
gfx90a:
- rocPRIM_build_gfx90a
- rocPRIM_build
- hipCUB:
name: hipCUB
sparseCheckoutDir: projects/hipcub
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocPRIM_build_gfx942
gfx90a:
- rocPRIM_build_gfx90a
- rocPRIM_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
- rocSOLVER:
name: rocSOLVER
sparseCheckoutDir: projects/rocsolver
skipUnifiedBuild: 'true'
buildDependsOn:
gfx942:
- rocPRIM_build_gfx942
gfx90a:
- rocPRIM_build_gfx90a
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'true'
# buildDependsOn:
# - rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -177,7 +170,7 @@ jobs:
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_${{ job.shard }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_shard_${{ job.shard }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
@@ -217,7 +210,7 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }} -E device_merge_inplace'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
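
The `-I ${{ job.shard }},,${{ job.shardCount }}` parameter above maps onto ctest's `Start,End,Stride` test selection, and `-E` excludes tests by name regex. A minimal sketch of the rendered command, assuming shard 1 of 3:

```bash
# -I Start,End,Stride runs every 3rd test beginning at test 1 (shard 1 of 3);
# -E skips any test whose name matches the regex (here, the disabled
# device_merge_inplace unit test).
ctest -I 1,,3 -E device_merge_inplace --output-on-failure
```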

View File

@@ -36,6 +36,7 @@ parameters:
- clr
- llvm-project
- rocDecode
- rocJPEG
- rocm-cmake
- rocm-core
- rocminfo
@@ -192,9 +193,9 @@ jobs:
inputs:
itemPattern: '**/*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
@@ -221,25 +222,17 @@ jobs:
- task: CMake@1
displayName: 'rocPyDecode Test CMake Flags'
inputs:
workingDirectory: $(Agent.BuildDirectory)/rocm/share/rocpydecode/tests
cmakeArgs: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(PYTHON_USER_SITE)/pybind11;$(PYTHON_DIST_PACKAGES)/pybind11;$(PYBIND11_PATH)
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=${{ job.target }}
..
.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocPyDecode
testDir: $(Build.SourcesDirectory)/build
# sudo is required for pip install but breaks permissions for the next pipeline run
- task: Bash@3
displayName: Clean up test environment
condition: always()
inputs:
targetType: inline
script: |
pip uninstall -y rocPyDecode
pip uninstall -y hip-python
testDir: $(Agent.BuildDirectory)/rocm/share/rocpydecode/tests
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -78,16 +78,15 @@ parameters:
sparseCheckoutDir: projects/hiprand
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocRAND_build_gfx942
gfx90a:
- rocRAND_build_gfx90a
- rocRAND_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml

View File

@@ -33,13 +33,11 @@ parameters:
type: object
default:
- cmake
- ninja-build
- libsuitesparse-dev
- gfortran
- libfmt-dev
- git
- googletest
- libgtest-dev
- libfmt-dev
- libsuitesparse-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
@@ -72,31 +70,42 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -108,10 +117,15 @@ jobs:
targetType: inline
script: git clone --depth 1 --branch v3.9.1 https://github.com/Reference-LAPACK/lapack
workingDirectory: '$(Build.SourcesDirectory)'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
@@ -119,8 +133,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: lapack
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls
-DBUILD_TESTING=OFF
-DCBLAS=ON
@@ -131,8 +147,9 @@ jobs:
installDir: '$(Pipeline.Workspace)/deps-install'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=${{ job.target }}
@@ -144,23 +161,26 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps-install
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps-install
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -174,6 +194,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -181,12 +202,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -194,6 +219,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocsolver-test'
testParameters: '--gtest_filter="*checkin*" --gtest_output=xml:./test_output.xml --gtest_color=yes'

View File

@@ -64,12 +64,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
@@ -79,7 +79,9 @@ jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml

View File

@@ -105,7 +105,7 @@ jobs:
parameters:
componentName: rocm_smi_lib
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml

View File

@@ -67,7 +67,6 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
skipLlvmSymlink: true
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:

View File

@@ -24,24 +24,28 @@ parameters:
default:
- astunparse==1.6.2
- colorlover
- "dash>=1.12.0"
- dash-bootstrap-components
- dash-svg
- "dash>=3.0.0"
- kaleido==0.2.1
- matplotlib
- "numpy>=1.17.5"
- "pandas>=1.4.3"
- plotext
- plotille
- pymongo
- pyyaml
- tabulate
- tqdm
- dash-svg
- dash-bootstrap-components
- kaleido
- setuptools
- plotille
- tabulate
- textual
- textual_plotext
- textual-fspicker
- tqdm
- mock
- pytest
- pytest-cov
- pytest-xdist
- name: rocmDependencies
- name: rocmTestDependencies
type: object
default:
- amdsmi
@@ -114,14 +118,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: ${{ job.dependencySource }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -165,14 +161,6 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -184,9 +172,17 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencyList: ${{ parameters.rocmTestDependencies }}
dependencySource: ${{ job.dependencySource }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- task: Bash@3
displayName: Add ROCm binaries to PATH
inputs:

View File

@@ -14,10 +14,12 @@ parameters:
type: object
default:
- build-essential
- cmake
- libdrm-amdgpu-dev
- libdrm-dev
- libdw-dev
- libelf-dev
- libsqlite3-dev
- libva-dev
- ninja-build
- pkg-config
@@ -74,8 +76,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -402,14 +402,11 @@ jobs:
itemPattern: '**/*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
dependencySource: staging
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: $(JOB_GPU_TARGET)
dependencySource: staging
skipLlvmSymlink: true
# get sources to run test scripts
- task: Bash@3
displayName: git clone upstream pytorch

View File

@@ -3,12 +3,21 @@ parameters:
- name: jobList
type: object
default:
- gfx942-staging:
target: gfx942
source: staging
- gfx90a-staging:
target: gfx90a
source: staging
- { os: ubuntu2204, target: gfx942, source: staging }
- { os: ubuntu2204, target: gfx90a, source: staging }
- { os: ubuntu2204, target: gfx1201, source: staging }
- { os: ubuntu2204, target: gfx1100, source: staging }
- { os: ubuntu2204, target: gfx1030, source: staging }
- { os: ubuntu2404, target: gfx942, source: staging }
- { os: ubuntu2404, target: gfx90a, source: staging }
- { os: ubuntu2404, target: gfx1201, source: staging }
- { os: ubuntu2404, target: gfx1100, source: staging }
- { os: ubuntu2404, target: gfx1030, source: staging }
- { os: almalinux8, target: gfx942, source: staging }
- { os: almalinux8, target: gfx90a, source: staging }
- { os: almalinux8, target: gfx1201, source: staging }
- { os: almalinux8, target: gfx1100, source: staging }
- { os: almalinux8, target: gfx1030, source: staging }
- name: rocmDependencies
type: object
default:
@@ -16,9 +25,9 @@ parameters:
- amdsmi
- aomp-extras
- aomp
- clr
- composable_kernel
- half
- HIP
- hip-tests
- hipBLAS
- hipBLAS-common
@@ -83,7 +92,7 @@ schedules:
jobs:
- ${{ each job in parameters.jobList }}:
- job: rocm_nightly_${{ job.target }}_${{ job.source }}
- job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -108,9 +117,8 @@ jobs:
parameters:
dependencySource: ${{ job.source }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
skipLibraryLinking: true
skipLlvmSymlink: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh $(Agent.BuildDirectory)/rocm

View File

@@ -1,147 +0,0 @@
import os
import yaml
from graphviz import Digraph
# Set DEBUG to False for normal output, True for debug output
DEBUG = False
def debug_print(message):
if DEBUG:
print(message)
def extract_dependencies(exclude_nodes=[]):
dependencies = {}
debug_print("Extracting dependencies from YAML files...")
# Define a mapping of specific filenames to component names
component_name_mapping = {
'HIP.yml': 'clr', # Remap HIP.yml to clr in graph
}
script_directory = os.path.dirname(os.path.abspath(__file__))
yaml_directory = os.path.join(script_directory, '..', 'components')
for filename in os.listdir(yaml_directory):
if filename.endswith(".yaml") or filename.endswith(".yml"):
debug_print(f"Processing file: {filename}")
try:
with open(os.path.join(yaml_directory, filename), 'r') as file:
data = yaml.safe_load(file) or {}
parameters = data.get('parameters', [])
# Check for both 'rocmDependencies' and 'rocmDependenciesAMD'
rocm_dependencies = next((param['default'] for param in parameters if param['name'] == 'rocmDependencies' or param['name'] == 'rocmDependenciesAMD'), [])
test_dependencies = next((param['default'] for param in parameters if param['name'] == 'rocmTestDependencies'), [])
unique_dependencies = list(set(rocm_dependencies + test_dependencies))
unique_dependencies = [dep for dep in unique_dependencies if dep not in exclude_nodes]
# Use the mapped component name if it exists
component_name = component_name_mapping.get(filename, os.path.splitext(filename)[0])
if component_name == 'MIOpen':
unique_dependencies.append('composable_kernel')
dependencies[component_name] = {
'dependencies': unique_dependencies
}
debug_print(f"Found unique dependencies for {component_name}: {unique_dependencies}")
except Exception as e:
print(f"Error processing {filename}: {e}")
return dependencies
def simplify_dependencies(graph):
simplified_graph = {}
for component, deps in graph.items():
if component not in simplified_graph:
simplified_graph[component] = set(deps) # Use a set for uniqueness
for dep in deps:
if dep in graph: # If the dependency has its own dependencies
for sub_dep in graph[dep]:
simplified_graph[component].discard(sub_dep) # Remove transitive dependencies
# Convert sets back to lists
for component in simplified_graph:
simplified_graph[component] = list(simplified_graph[component])
return simplified_graph
def build_dependency_graph(dependencies, exclude_nodes=None):
if exclude_nodes is None:
exclude_nodes = []
graph = {}
debug_print("Building dependency graph...")
for component, deps in dependencies.items():
if component in exclude_nodes:
continue # Skip excluded components
# Ensure uniqueness and prevent self-dependency
all_deps = [dep for dep in set(deps['dependencies']) if dep != component and dep not in exclude_nodes]
graph[component] = all_deps
debug_print(f"{component} -> {all_deps}")
# Simplify the dependencies to remove transitive dependencies
simplified_graph = simplify_dependencies(graph)
return simplified_graph
def build_full_dependency_tree(graph):
tree = {}
debug_print("Building full dependency tree...")
def dfs(component, visited):
if component in visited:
return
visited.add(component)
for dep in graph.get(component, []):
# Prevent self-dependency in the tree
if dep != component:
if dep not in tree:
tree[dep] = []
if component not in tree[dep]: # Prevent duplicates
tree[dep].append(component)
dfs(dep, visited)
for component in graph.keys():
dfs(component, set())
return tree
def visualize_graph(graph):
dot = Digraph()
for component, deps in graph.items():
for dep in deps:
dot.edge(component, dep)
script_directory = os.path.dirname(os.path.abspath(__file__))
dot.render(os.path.join(script_directory, 'dependency_graph'), format='png', cleanup=True) # Save as PNG
def main():
exclude_deps = ['rocm-examples']
dependencies = extract_dependencies(exclude_nodes=exclude_deps)
if not dependencies:
debug_print("No dependencies found.")
return
graph = build_dependency_graph(dependencies, exclude_nodes=exclude_deps)
full_tree = build_full_dependency_tree(graph)
print("Dependency tree:")
print(full_tree)
# Call this function after building the graph
visualize_graph(full_tree)
if __name__ == "__main__":
main()

Binary file not shown.


View File

@@ -28,12 +28,22 @@ resources:
endpoint: ROCm
name: ROCm/hipother
ref: ${{ parameters.checkoutRef }}
pipelines:
- pipeline: hip_pipeline
source: \experimental\HIP
trigger: true
- pipeline: hipother_pipeline
source: \experimental\hipother
trigger: true
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}
- ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}:
- template: ${{ variables.CI_COMPONENT_PATH }}/copyHIP.yml@pipelines_repo
- ${{ if ne(variables['Build.Reason'], 'ResourceTrigger') }}:
- template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml@pipelines_repo
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}

View File

@@ -12,6 +12,9 @@ parameters:
- name: fileFilter
type: string
default: ''
- name: extractAndDeleteFiles
type: boolean
default: true
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -19,46 +22,35 @@ parameters:
default: false
steps:
- task: Bash@3
displayName: Set allowPartiallySucceededBuilds
inputs:
targetType: inline
script: |
if [[ ",$ALLOWED_PARTIAL_SUCCEED_BUILDS," == *",${{ parameters.componentName }},"* ]]; then
echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]true"
else
echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]false"
fi
- task: DownloadPipelineArtifact@2
displayName: Download ${{ parameters.componentName }}
inputs:
${{ if eq(parameters.aggregatePipeline, false) }}:
itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
targetPath: '$(Pipeline.Workspace)/d'
allowPartiallySucceededBuilds: true
${{ if parameters.aggregatePipeline }}:
buildType: 'current'
${{ else }}:
buildType: 'specific'
project: ROCm-CI
definition: ${{ parameters.pipelineId }}
specificBuildWithTriggering: true
itemPattern: '**/*${{ parameters.fileFilter }}*'
# aomp is a special case, since the trigger file is under ROCm/ROCm instead of the component repo
${{ if notIn(parameters.componentName, 'aomp') }}:
buildVersionToDownload: latestFromBranch # default is 'latest'
definition: ${{ parameters.pipelineId }}
branchName: refs/heads/${{ parameters.branchName }}
allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
targetPath: '$(Pipeline.Workspace)/d'
${{ else }}:
buildType: 'current'
itemPattern: '**/${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
targetPath: '$(Pipeline.Workspace)/d'
- task: ExtractFiles@1
displayName: Extract ${{ parameters.componentName }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Cleanup Compressed ${{ parameters.componentName }}
inputs:
SourceFolder: '$(Pipeline.Workspace)/d'
Contents: '**/*.tar.gz'
RemoveDotFiles: true
${{ if eq(parameters.componentName, 'aomp') }}:
buildVersionToDownload: latest # aomp trigger lives in ROCm/ROCm, so cannot use ROCm/aomp branch names
${{ else }}:
buildVersionToDownload: latestFromBranch
- ${{ if eq(parameters.extractAndDeleteFiles, true) }}:
- task: ExtractFiles@1
displayName: Extract ${{ parameters.componentName }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up Compressed ${{ parameters.componentName }}
inputs:
SourceFolder: '$(Pipeline.Workspace)/d'
Contents: '**/*.tar.gz'
RemoveDotFiles: true
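
For reference, the `Set allowPartiallySucceededBuilds` step above uses a comma-delimited membership test; a standalone sketch with an assumed list value:

```bash
# Wrapping both the list and the candidate in commas prevents partial-name
# matches (e.g. "hipBLAS" will not match inside "hipBLASLt").
ALLOWED_PARTIAL_SUCCEED_BUILDS="amdsmi,aomp,hipBLASLt"   # assumed example value
componentName="hipBLAS"
if [[ ",$ALLOWED_PARTIAL_SUCCEED_BUILDS," == *",$componentName,"* ]]; then
  echo "allowPartiallySucceededBuilds=true"
else
  echo "allowPartiallySucceededBuilds=false"   # printed here: hipBLAS is not in the list
fi
```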

View File

@@ -15,8 +15,8 @@ steps:
URL_BEGIN="https://artprodcus3.artifacts.visualstudio.com/"
URL_MIDDLE="/_apis/artifact/"
URL_END="/content?format=file&subPath=%2F"
FORMATTED_JOB_NAME=$(echo $(Agent.JobName) | sed 's/ /./g; s/[-_]//g')
ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/$(DOWNLOAD_PROJECT_ID)/buildId/$(Build.BuildId)/artifactName/${FORMATTED_JOB_NAME}"
ARTIFACT_NAME="$(Agent.JobName)_$(System.JobAttempt)"
ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/$(DOWNLOAD_PROJECT_ID)/buildId/$(Build.BuildId)/artifactName/${ARTIFACT_NAME}"
ENCODED_STRING=$(echo -n "${ARTIFACT_STRING}" | base64 -w 0)
PADDING_COUNT=$(echo -n "${ENCODED_STRING}" | awk -F= '{print NF-1}')
if [ "$PADDING_COUNT" -gt 0 ]; then

View File

@@ -26,7 +26,7 @@ steps:
includeRootFolder: false
archiveType: 'tar'
tarCompression: 'gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
@@ -38,7 +38,7 @@ steps:
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz" >> pipelineArtifacts.txt
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz" >> pipelineArtifacts.txt
# then publish it
- ${{ if parameters.publish }}:
- task: PublishPipelineArtifact@1
@@ -46,4 +46,5 @@ steps:
displayName: '${{ parameters.artifactName }} Publish'
retryCountOnTaskFailure: 3
inputs:
artifactName: $(Agent.JobName)_$(System.JobAttempt)
targetPath: '$(Build.ArtifactStagingDirectory)'
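
With `$(System.JobAttempt)` appended, a retried job now publishes under a distinct name instead of colliding with the first attempt's upload. Under assumed values, the archive name renders as:

```bash
# componentName=rocBLAS, BuildId=12345, BuildNumber=20250711.1,
# os=ubuntu2204, gpuTarget=gfx942, artifactName=binaries, JobAttempt=2
# (all values assumed for illustration)
echo "rocBLAS_12345_20250711.1_ubuntu2204_gfx942_binaries_2.tar.gz"
```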

View File

@@ -1,10 +1,15 @@
parameters:
- name: os
type: string
default: ubuntu2204
- name: repositoryUrl
type: string
default: https://download.amd.com/developer/eula/aocl/aocl-4-2
- name: packageName
type: string
default: aocl-linux-gcc-4.2.0_1_amd64.deb
type: object
default:
ubuntu2204: aocl-linux-gcc-4.2.0_1_amd64.deb
almalinux8: aocl-linux-gcc-4.2.0-1.x86_64.rpm
steps:
- task: Bash@3
@@ -12,16 +17,19 @@ steps:
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: wget -nv ${{ parameters.repositoryUrl }}/${{ parameters.packageName }}
script: wget -nv ${{ parameters.repositoryUrl }}/${{ parameters.packageName[parameters.os] }}
- task: Bash@3
displayName: Install AOCL
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: sudo apt install -y ./${{ parameters.packageName }}
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: sudo apt install -y ./${{ parameters.packageName[parameters.os] }}
${{ elseif eq(parameters.os, 'almalinux8') }}:
script: sudo dnf install -y ./${{ parameters.packageName[parameters.os] }}
- task: Bash@3
displayName: Clean up AOCL
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: rm -f ${{ parameters.packageName }}
script: rm -f ${{ parameters.packageName[parameters.os] }}
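
Since `packageName` is now an object keyed by OS, `parameters.packageName[parameters.os]` resolves to the matching installer. The rendered steps for `os: almalinux8` would be roughly:

```bash
# Rendered form of the three tasks above for os=almalinux8
wget -nv https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-gcc-4.2.0-1.x86_64.rpm
sudo dnf install -y ./aocl-linux-gcc-4.2.0-1.x86_64.rpm
rm -f aocl-linux-gcc-4.2.0-1.x86_64.rpm
```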

View File

@@ -52,9 +52,11 @@ parameters:
libexpat-dev: expat-devel
libffi-dev: libffi-devel
libfftw3-dev: fftw-devel
libfmt-dev: fmt-devel
libgmp-dev: gmp-devel
liblzma-dev: xz-devel
libmpfr-dev: mpfr-devel
libmsgpack-dev: msgpack-devel
libncurses5-dev: ncurses-devel
libnuma-dev: numactl-devel
libopenmpi-dev: openmpi-devel

View File

@@ -19,16 +19,6 @@ parameters:
- name: gpuTarget
type: string
default: ''
# set to true if you're calling this template file multiple times in the same pipeline
# only leave the last call false to optimize the sequence
- name: skipLibraryLinking
type: boolean
default: false
# set to true if llvm-project is not downloaded in a particular call
# or if you just don't want the symlink
- name: skipLlvmSymlink
type: boolean
default: false
# set to true if dlopen calls for HIP libraries are causing failures
# because they do not follow shared library symlink convention
- name: setupHIPLibrarySymlinks
@@ -130,7 +120,7 @@ parameters:
hipRAND:
pipelineId: $(HIPRAND_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
hipSOLVER:
pipelineId: $(HIPSOLVER_PIPELINE_ID)
@@ -305,7 +295,7 @@ parameters:
rocRAND:
pipelineId: $(ROCRAND_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
rocr_debug_agent:
pipelineId: $(ROCR_DEBUG_AGENT_PIPELINE_ID)
@@ -367,6 +357,7 @@ steps:
componentName: ${{ split(dependency, ':')[0] }}
pipelineId: ${{ parameters.componentVarList[split(dependency, ':')[0]].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
extractAndDeleteFiles: false
${{ if parameters.componentVarList[split(dependency, ':')[0]].hasGpuTarget }}:
fileFilter: "${{ split(dependency, ':')[1] }}*_${{ parameters.os }}_${{ parameters.gpuTarget }}"
# dependencySource = staging
@@ -397,6 +388,7 @@ steps:
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
gpuTarget: ${{ parameters.gpuTarget }}
preTargetFilter: ${{ dependency }}
os: ${{ parameters.os }}
buildType: current
- ${{ else }}:
- template: artifact-download.yml
@@ -404,6 +396,7 @@ steps:
componentName: ${{ dependency }}
pipelineId: ${{ parameters.componentVarList[dependency].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
extractAndDeleteFiles: false
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
fileFilter: ${{ parameters.os }}_${{ parameters.gpuTarget }}
${{ else }}:
@@ -429,23 +422,43 @@ steps:
# default = staging
${{ else }}:
branchName: ${{ parameters.componentVarList[dependency].stagingBranch }}
# Set link to redirect llvm folder
- ${{ if eq(parameters.skipLlvmSymlink, false) }}:
- task: ExtractFiles@1
displayName: Extract ROCm artifacts
inputs:
archiveFilePatterns: $(Pipeline.Workspace)/d/**/*.tar.gz
destinationFolder: $(Agent.BuildDirectory)/rocm
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up ROCm artifacts
inputs:
SourceFolder: $(Pipeline.Workspace)/d
Contents: '**/*.tar.gz'
RemoveDotFiles: true
- ${{ if containsValue(parameters.dependencyList, 'llvm-project') }}:
- task: Bash@3
displayName: Symlink from rocm/llvm to rocm/lib/llvm
inputs:
targetType: inline
script: |
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
echo "Created symlink from rocm/llvm to rocm/lib/llvm"
- task: Bash@3
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
inputs:
targetType: inline
script: |
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
echo "Created symlink from rocm/llvm/bin/$file to rocm/bin/$file"
done
- ${{ if containsValue(parameters.dependencyList, 'rocm-core') }}:
- task: Bash@3
displayName: Print rocm/.info/version
inputs:
targetType: inline
script: cat $(Agent.BuildDirectory)/rocm/.info/version
# dlopen calls within a ctest or pytest sequence run into issues when the shared library symlink convention is not followed
# the convention is as follows:
# unversioned .so is a symlink to major version .so
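
A short sketch of that convention with a hypothetical library `libfoo`:

```bash
# unversioned name -> major-version name -> fully versioned file
# (libfoo and its version numbers are hypothetical)
ln -s libfoo.so.1.2.3 libfoo.so.1   # major version .so -> real file
ln -s libfoo.so.1     libfoo.so     # unversioned .so -> major version .so
```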
@@ -482,17 +495,16 @@ steps:
inputs:
targetType: inline
script: ls -la1R $(Agent.BuildDirectory)/rocm
- ${{ if eq(parameters.skipLibraryLinking, false) }}:
- task: Bash@3
displayName: 'Link ROCm shared libraries'
inputs:
targetType: inline
# the loader ignores duplicate entries if the ROCm lib folder shows up more than once
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p
- task: Bash@3
displayName: 'Link ROCm shared libraries'
inputs:
targetType: inline
# the loader ignores duplicate entries if the ROCm lib folder shows up more than once
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p

View File

@@ -23,13 +23,14 @@ steps:
inputs:
targetType: inline
script: |
sudo apt-get install -y jq
${{ iif(or(eq(parameters.os, 'ubuntu2204'), eq(parameters.os, 'ubuntu2404')), 'sudo apt-get install -y jq', '') }}
# RESOURCES_REPOSITORIES is a runtime variable (not an env var!) that contains quotation marks and newlines
# So we need to save it to a file to properly preserve its formatting and contents
cat <<EOF > resources.repositories
$(RESOURCES_REPOSITORIES)
EOF
echo "Value of resources.repositories:"
cat resources.repositories
IS_TAG_BUILD=$(jq 'has("release_repo")' resources.repositories)
@@ -66,8 +67,6 @@ steps:
)
' resources.repositories)
manifest_json=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
dependencies=()
for manifest_file in $(Pipeline.Workspace)/d/**/manifest_*.json; do
echo "Processing $manifest_file"
@@ -78,6 +77,10 @@ steps:
done
dependencies_json=$(printf '%s\n' "${dependencies[@]}" | jq -s '.')
manifest_filename="manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}"
echo "##vso[task.setvariable variable=manifest_filename]$manifest_filename"
manifest_json=$(Build.ArtifactStagingDirectory)/$manifest_filename.json
jq -n \
--argjson current "$current" \
--argjson dependencies "$dependencies_json" \
@@ -111,8 +114,14 @@ steps:
')
dependencies_rows=$(echo $dependencies_rows)
echo "##vso[task.setvariable variable=dependencies_rows;]$dependencies_rows"
cat $manifest_json
- task: Bash@3
displayName: Print manifest.json
condition: always()
continueOnError: true
inputs:
targetType: inline
script: |
cat $(Build.ArtifactStagingDirectory)/$(manifest_filename).json
- task: Bash@3
displayName: Create manifest.html
condition: always()
@@ -120,10 +129,10 @@ steps:
inputs:
targetType: inline
script: |
manifest_html=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
manifest_html="$(Build.ArtifactStagingDirectory)/$(manifest_filename).html"
cat <<EOF > $manifest_html
<html>
<h1>Manifest</h1>
<h1>$(manifest_filename)</h1>
<h2>Current</h2>
<table border="1">
<tr>
@@ -163,7 +172,7 @@ steps:
continueOnError: true
inputs:
tabName: Manifest
reportDir: $(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
reportDir: $(Build.ArtifactStagingDirectory)/$(manifest_filename).html
- task: Bash@3
displayName: Save manifest artifact file name
condition: always()
@@ -172,5 +181,5 @@ steps:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt
echo "$(manifest_filename).html" >> pipelineArtifacts.txt
echo "$(manifest_filename).json" >> pipelineArtifacts.txt

View File

@@ -17,7 +17,6 @@ steps:
script: |
AZ_API="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis"
GH_API="https://api.github.com/repos/ROCm"
ARTIFACT_NAME="composablekernelbuild${{ parameters.gpuTarget }}"
EXIT_CODE=0
# Try to find an Azure build for the specific CK commit called out in MIOpen's requirements.txt
@@ -39,8 +38,15 @@ steps:
echo "Found specific CK build ID: $CK_BUILD_ID"
fi
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?artifactName=$ARTIFACT_NAME&api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | jq '.resource.downloadUrl' | tr -d '"')
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | \
jq --arg os "ubuntu2204" --arg gfx "${{ parameters.gpuTarget }}" '
.value
| map(select(.name | test($os) and test($gfx)))
| max_by(.name | capture("drop_(?<dropNumber>\\d+)").dropNumber | tonumber)
| .resource.downloadUrl
' | \
tr -d '"')
# If using the specific CK commit and it doesn't have any valid artifacts, use latest successful CK build instead
if { [[ -z "$ARTIFACT_URL" ]] || [[ "$ARTIFACT_URL" == "null" ]]; } && [[ $EXIT_CODE -eq 0 ]]; then
@@ -48,8 +54,15 @@ steps:
LATEST_BUILD_URL="$AZ_API/build/builds?definitions=$(COMPOSABLE_KERNEL_PIPELINE_ID)&statusFilter=completed&resultFilter=succeeded&\$top=1&api-version=7.1"
CK_BUILD_ID=$(curl -s $LATEST_BUILD_URL | jq '.value[0].id')
echo "Found latest CK build ID: $CK_BUILD_ID"
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?artifactName=$ARTIFACT_NAME&api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | jq '.resource.downloadUrl' | tr -d '"')
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | \
jq --arg os "ubuntu2204" --arg gfx "${{ parameters.gpuTarget }}" '
.value
| map(select(.name | test($os) and test($gfx)))
| max_by(.name | capture("drop_(?<dropNumber>\\d+)").dropNumber | tonumber)
| .resource.downloadUrl
' | \
tr -d '"')
EXIT_CODE=2
fi
@@ -57,8 +70,8 @@ steps:
wget --tries=5 --waitretry=10 --retry-connrefused -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip
unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory)
mkdir -p $(Agent.BuildDirectory)/rocm
tar -zxvf $(System.ArtifactsDirectory)/$ARTIFACT_NAME/*.tar.gz -C $(Agent.BuildDirectory)/rocm
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/$ARTIFACT_NAME
tar -zxvf $(System.ArtifactsDirectory)/composable_kernel*/*.tar.gz -C $(Agent.BuildDirectory)/rocm
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/composable_kernel*
if [[ $EXIT_CODE -ne 0 ]]; then
BUILD_COMMIT=$(curl -s $AZ_API/build/builds/$CK_BUILD_ID | jq '.sourceVersion' | tr -d '"')
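
The reworked jq filter no longer requests a fixed artifact name; it lists all artifacts, keeps those whose names mention the OS and GPU target, and takes the one with the highest `drop_<n>` suffix. A standalone sketch with assumed artifact names:

```bash
# Standalone sketch of the artifact-selection filter (artifact names assumed)
ARTIFACTS='{"value":[
  {"name":"composable_kernel_ubuntu2204_gfx942_drop_3","resource":{"downloadUrl":"https://example/3"}},
  {"name":"composable_kernel_ubuntu2204_gfx942_drop_7","resource":{"downloadUrl":"https://example/7"}},
  {"name":"composable_kernel_almalinux8_gfx942_drop_9","resource":{"downloadUrl":"https://example/9"}}
]}'
echo "$ARTIFACTS" | jq -r --arg os "ubuntu2204" --arg gfx "gfx942" '
  .value
  | map(select(.name | test($os) and test($gfx)))
  | max_by(.name | capture("drop_(?<dropNumber>\\d+)").dropNumber | tonumber)
  | .resource.downloadUrl'
# -> https://example/7  (highest drop number among the ubuntu2204/gfx942 artifacts)
```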

View File

@@ -1,19 +1,19 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: componentName
type: string
default: ''
- name: os
type: string
default: ubuntu2204
- name: testDir
type: string
default: 'build'
default: build
- name: testExecutable
type: string
default: 'ctest'
default: ctest
- name: testParameters
type: string
default: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
- name: extraTestParameters
type: string
default: ''
@@ -22,7 +22,7 @@ parameters:
default: test_output.xml
- name: testOutputFormat
type: string
default: 'JUnit'
default: JUnit
values:
- JUnit
- NUnit
@@ -32,31 +32,28 @@ parameters:
- name: testPublishResults
type: boolean
default: true
- name: allowPartiallySucceededBuilds
- name: allowComponentTestFailure
type: object
default:
- amdsmi
- aomp
- HIPIFY
- MIVisionX
- rocm_smi_lib
- rocprofiler-sdk
- roctracer
# the following do not use this template but allow test failures, included for completeness
- aomp
- ROCgdb
steps:
# run test, continue on failure to publish results
# and to publish build artifacts
- task: Bash@3
displayName: '${{ parameters.componentName }} Test'
continueOnError: ${{ containsValue(parameters.allowPartiallySucceededBuilds, parameters.componentName) }}
continueOnError: ${{ containsValue(parameters.allowComponentTestFailure, parameters.componentName) }}
inputs:
targetType: inline
${{ if ne(parameters.os, 'almalinux8') }}:
script: ${{ parameters.testExecutable }} ${{ parameters.testParameters }} ${{ parameters.extraTestParameters }}
${{ else }}:
script: |
source /opt/rh/gcc-toolset-14/enable
${{ parameters.testExecutable }} ${{ parameters.testParameters }} ${{ parameters.extraTestParameters }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
${{ parameters.testExecutable }} ${{ parameters.testParameters }} ${{ parameters.extraTestParameters }}
workingDirectory: ${{ parameters.testDir }}
- ${{ if parameters.testPublishResults }}:
- task: PublishTestResults@2
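
The `iif()` rewrite collapses the two script branches into one; rendered with the parameter defaults above and `os: almalinux8`, the inline script is simply:

```bash
source /opt/rh/gcc-toolset-14/enable   # emitted only when os == almalinux8
ctest --output-on-failure --force-new-ctest-process --output-junit test_output.xml
# for any other os the iif() renders an empty line, leaving just the test call
```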

View File

@@ -32,13 +32,13 @@ variables:
- name: GFX90A_TEST_POOL
value: gfx90a_test_pool
- name: LATEST_RELEASE_VERSION
value: 6.4.0
value: 6.4.1
- name: REPO_RADEON_VERSION
value: 6.4
value: 6.4.1
- name: NEXT_RELEASE_VERSION
value: 6.5.0
value: 7.0.0
- name: LATEST_RELEASE_TAG
value: rocm-6.4.0
value: rocm-6.4.1
- name: DOCKER_SKIP_GFX
value: gfx90a
- name: AMDMIGRAPHX_PIPELINE_ID
@@ -66,11 +66,11 @@ variables:
- name: HIP_TESTS_PIPELINE_ID
value: 233
- name: HIPBLAS_COMMON_PIPELINE_ID
value: 223
value: 300
- name: HIPBLAS_PIPELINE_ID
value: 87
- name: HIPBLASLT_PIPELINE_ID
value: 112
value: 301
- name: HIPCUB_PIPELINE_ID
value: 277
- name: HIPFFT_PIPELINE_ID
@@ -80,7 +80,7 @@ variables:
- name: HIPIFY_PIPELINE_ID
value: 92
- name: HIPRAND_PIPELINE_ID
value: 90
value: 275
- name: HIPSOLVER_PIPELINE_ID
value: 84
- name: HIPSPARSE_PIPELINE_ID
@@ -104,7 +104,7 @@ variables:
- name: ROCALUTION_PIPELINE_ID
value: 89
- name: ROCBLAS_PIPELINE_ID
value: 85
value: 302
- name: ROCDBGAPI_PIPELINE_ID
value: 135
- name: ROCDECODE_PIPELINE_ID
@@ -150,7 +150,7 @@ variables:
- name: ROCR_RUNTIME_PIPELINE_ID
value: 10
- name: ROCRAND_PIPELINE_ID
value: 95
value: 274
- name: ROCSOLVER_PIPELINE_ID
value: 81
- name: ROCSPARSE_PIPELINE_ID

View File

@@ -6,7 +6,7 @@ different versions of the ROCm software stack and its components.
## ROCm 6.4.1
See the [ROCm 6.4.1 release notes](https://rocm-stg.amd.com/en/latest/about/release-notes.html)
See the [ROCm 6.4.1 release notes](https://rocm.docs.amd.com/en/docs-6.4.1/about/release-notes.html)
for a complete overview of this release.
### **AMD SMI** (25.4.2)

README.md
View File

@@ -27,142 +27,9 @@ source software compilers, debuggers, and libraries. ROCm is fully integrated in
> The instructions below describe the prior process for building from source
> which will be replaced once TheRock is mature enough.
## Getting the ROCm Source Code
## Getting and Building ROCm from Source
AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.
### Installing the repo tool
The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:
```bash
mkdir -p ~/bin/
curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
chmod a+x ~/bin/repo
```
**Note:** The ```~/bin/``` folder is used as an example. You can specify a different folder to install the repo tool into if you desire.
### Installing git-lfs
Some ROCm projects use the Git Large File Storage (LFS) format, which may require you to install git-lfs. Refer to [Git Large File Storage](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md) for more information. For example, to install git-lfs for Ubuntu, use the following command:
```bash
sudo apt-get install git-lfs
```
### Downloading the ROCm source code
The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:
```bash
mkdir -p ~/ROCm/
cd ~/ROCm/
export ROCM_VERSION=6.4.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
```
**Note:** Using this sample code causes the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have SSH keys configured on your machine for your GitHub account before downloading, as explained in [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).
## Building the ROCm source code
Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.
Each release of the ROCm software supports specific hardware and software configurations. Refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for the current supported hardware and OS.
## Build ROCm from source
The build runs in parallel on as many processors as it can find. Some compiles can consume as much as 10 GB of RAM, so make sure you have plenty of swap space!
By default, the ROCm build compiles for all supported GPU architectures and takes approximately 500 CPU hours.
Build time drops significantly if you limit the GPU architectures to build against using the GPU_ARCHS environment variable, as shown below.
```bash
# --------------------------------------
# Step 1: Clone source code
# --------------------------------------
mkdir -p ~/WORKSPACE/ # Or any other folder name of your choice
cd ~/WORKSPACE/
export ROCM_VERSION=6.4.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
# --------------------------------------
# Step 2: Prepare build environment
# --------------------------------------
# Option 1: Start a docker container
# Pulling required base docker images:
# Ubuntu22.04 built from ROCm/tools/rocm-build/docker/ubuntu22/Dockerfile
docker pull rocm/rocm-build-ubuntu-22.04:6.4
# Ubuntu24.04 built from ROCm/tools/rocm-build/docker/ubuntu24/Dockerfile
docker pull rocm/rocm-build-ubuntu-24.04:6.4
# Start docker container and mount the source code folder:
docker run -ti \
-e ROCM_VERSION=${ROCM_VERSION} \
-e CCACHE_DIR=$HOME/.ccache \
-e CCACHE_ENABLED=true \
-e DOCK_WORK_FOLD=/src \
-w /src \
-v $PWD:/src \
-v /etc/passwd:/etc/passwd \
-v /etc/shadow:/etc/shadow \
-v ${HOME}/.ccache:${HOME}/.ccache \
-u $(id -u):$(id -g) \
<replace_with_required_ubuntu_base_docker_image> bash
# Option 2: Install required packages into the host machine
# For ubuntu22.04 system
cd ROCm/tools/rocm-build/docker/ubuntu22
cp * /tmp && cd /tmp
bash install-prerequisites.sh
# For ubuntu24.04 system
cd ROCm/tools/rocm-build/docker/ubuntu24
cp * /tmp && cd /tmp
bash install-prerequisites.sh
# --------------------------------------
# Step 3: Run build command line
# --------------------------------------
# Select GPU targets before building:
# When GPU_ARCHS is not set, the default GPU targets supported by ROCm 6.1 are used.
# To build against a subset of GFX architectures, set the GPU_ARCHS environment variable.
# For example, to support MI300 (gfx940, gfx941, gfx942):
export GPU_ARCHS="gfx942" # Example: single target
export GPU_ARCHS="gfx940;gfx941;gfx942" # Example: multiple targets
# Pick and run build commands in the docker container:
# Build rocm-dev packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
# Build all ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
# List all ROCm components to find the ones you need
make -f ROCm/tools/rocm-build/ROCm.mk list_components
# Build a single ROCm package
make -f ROCm/tools/rocm-build/ROCm.mk T_rocblas
# Find built packages in ubuntu22.04:
out/ubuntu-22.04/22.04/deb/
# Find built packages in ubuntu24.04:
out/ubuntu-24.04/24.04/deb/
# Find built logs in ubuntu22.04:
out/ubuntu-22.04/22.04/logs/
# Find built logs in ubuntu24.04:
out/ubuntu-24.04/24.04/logs/
# Logs for failed components end with the .errors extension.
out/ubuntu-22.04/22.04/logs/rocblas.errors # Example
# Logs for components still building end with the .inprogress extension.
out/ubuntu-22.04/22.04/logs/rocblas.inprogress # Example
# Logs for successfully built components use the plain component name.
out/ubuntu-22.04/22.04/logs/rocblas # Example
```
Note: [Overview for ROCm.mk](tools/rocm-build/README.md)
Please use the [TheRock](https://github.com/ROCm/TheRock) build system to build ROCm from source.
## ROCm documentation

View File

@@ -654,4 +654,4 @@ There are a number of upcoming changes planned for HIP runtime API in an upcomin
that are not backward compatible with prior releases. Most of these changes increase
alignment between HIP and CUDA APIs or behavior. Some of the upcoming changes are to
clean up header files, remove namespace collision, and have a clear separation between
`hipRTC` and HIP runtime.
`hipRTC` and HIP runtime. For more information, see [HIP 7.0 Is Coming: What You Need to Know to Stay Ahead](https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0-blog/README.html).

View File

@@ -155,7 +155,7 @@ compatibility and system requirements.
.. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
.. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
.. [#kfd_support] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
@@ -235,6 +235,6 @@ Expand for full historical view of:
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#kfd_support-past-60] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
.. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.

View File

@@ -8,7 +8,7 @@ MI300 and MI200 series performance counters and metrics
This document lists and describes the hardware performance counters and derived metrics available
for the AMD Instinct™ MI300 and MI200 GPU. You can also access this information using the
:doc:`ROCProfiler tool <rocprofiler:rocprofv1>`.
:doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.
MI300 and MI200 series performance counters
===============================================================

View File

@@ -71,8 +71,9 @@ article_pages = [
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/pytorch-inference-benchmark", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
@@ -128,6 +129,7 @@ html_theme_options = {"link_main_doc": False}
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
numfig = False
suppress_warnings = ["autosectionlabel.*"]
html_context = {
"project_path" : {project_path},

View File

@@ -0,0 +1,159 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640
rocm_version: 6.3.1
vllm_version: 0.7.3
pytorch_version: 2.7.0 (dev nightly)
hipblaslt_version: 0.13
model_groups:
- group: Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 3.2 11B Vision
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- group: JAIS
tag: jais
models:
- model: JAIS 13B
mad_tag: pyt_vllm_jais-13b
model_repo: core42/jais-13b-chat
url: https://huggingface.co/core42/jais-13b-chat
precision: float16
- model: JAIS 30B
mad_tag: pyt_vllm_jais-30b
model_repo: core42/jais-30b-chat-v3
url: https://huggingface.co/core42/jais-30b-chat-v3
precision: float16
- group: DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16

View File

@@ -0,0 +1,152 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
rocm_version: 6.3.1
vllm_version: 0.8.3
pytorch_version: 2.7.0 (dev nightly)
hipblaslt_version: 0.13
model_groups:
- group: Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 3.2 11B Vision
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16

View File

@@ -0,0 +1,167 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
rocm_version: 6.3.1
vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 3.2 11B Vision
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16

View File

@@ -0,0 +1,162 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
rocm_version: 6.4.1
vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16

View File

@@ -31,3 +31,11 @@ pytorch_inference_benchmark:
model_repo: genmo/mochi-1-preview
url: https://huggingface.co/genmo/mochi-1-preview
precision: float16
- group: Wan2.1
tag: wan
models:
- model: Wan2.1
mad_tag: pyt_wan2.1_inference
model_repo: Wan-AI/Wan2.1-T2V-14B
url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
precision: bfloat16

View File

@@ -1,10 +1,11 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
rocm_version: 6.3.1
vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
rocm_version: 6.4.1
vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
@@ -26,11 +27,6 @@ vllm_benchmark:
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 3.2 11B Vision
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf

View File

@@ -0,0 +1,120 @@
unified_docker:
latest:
pull_tag: rocm/pytorch-training:v25.6
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
rocm_version: 6.4.1
pytorch_version: 2.8.0a0+git7d205b2
python_version: 3.10.17
transformer_engine_version: 1.14.0+2f85f5f2
flash_attention_version: 3.0.0.post1
hipblaslt_version: 0.15.0-8c6919d
triton_version: 3.3.0
model_groups:
- group: Pre-training
tag: pre-training
models:
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [pretrain]
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [pretrain]
- group: Fine-tuning
tag: fine-tuning
models:
- model: Llama 4 Scout 17B-16E
mad_tag: pyt_train_llama-4-scout-17b-16e
model_repo: Llama-4-17B_16E
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.3 70B
mad_tag: pyt_train_llama-3.3-70b
model_repo: Llama-3.3-70B
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.2 1B
mad_tag: pyt_train_llama-3.2-1b
model_repo: Llama-3.2-1B
url: https://huggingface.co/meta-llama/Llama-3.2-1B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 3B
mad_tag: pyt_train_llama-3.2-3b
model_repo: Llama-3.2-3B
url: https://huggingface.co/meta-llama/Llama-3.2-3B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 Vision 11B
mad_tag: pyt_train_llama-3.2-vision-11b
model_repo: Llama-3.2-Vision-11B
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.2 Vision 90B
mad_tag: pyt_train_llama-3.2-vision-90b
model_repo: Llama-3.2-Vision-90B
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.1 405B
mad_tag: pyt_train_llama-3.1-405b
model_repo: Llama-3.1-405B
url: https://huggingface.co/meta-llama/Llama-3.1-405B
precision: BF16
training_modes: [finetune_qlora, HF_finetune_lora]
- model: Llama 3 8B
mad_tag: pyt_train_llama-3-8b
model_repo: Llama-3-8B
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3 70B
mad_tag: pyt_train_llama-3-70b
model_repo: Llama-3-70B
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 7B
mad_tag: pyt_train_llama-2-7b
model_repo: Llama-2-7B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 2 13B
mad_tag: pyt_train_llama-2-13b
model_repo: Llama-2-13B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 70B
mad_tag: pyt_train_llama-2-70b
model_repo: Llama-2-70B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]

View File

@@ -7,21 +7,21 @@ AMD Instinct MI300X performance guides
**************************************
The following performance guides provide essential guidance on the necessary
steps to properly :doc:`configure your system for AMD Instinct™ MI300X
accelerators <../system-optimization/mi300x>`. They include detailed
instructions on system settings and application :doc:`workload tuning
<../rocm-for-ai/inference-optimization/workload>` to help you
leverage the maximum capabilities of these accelerators and achieve superior
performance.
steps to properly `configure your system for AMD Instinct™ MI300X accelerators
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
They include detailed instructions on system settings and application
:doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
help you leverage the maximum capabilities of these accelerators and achieve
superior performance.
* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
covers essential system settings and system management practices to configure
your AMD Instinct MI300X system for performance.
* :doc:`../rocm-for-ai/inference-optimization/workload` covers steps to
* :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
optimize the performance of AMD Instinct MI300X series accelerators for HPC
and deep learning operations.
* :doc:`../rocm-for-ai/inference/vllm-benchmark` introduces a preconfigured
* :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
environment for LLM inference, designed to help you test performance with
popular models on AMD Instinct MI300X series accelerators.

View File

@@ -24,5 +24,3 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
- :doc:`Fine-tuning and inference <fine-tuning-and-inference>` using a
:doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` or
:doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` system.

View File

@@ -6,7 +6,7 @@
Use ROCm for AI
**************************
ROCm is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.
You can use ROCm to perform distributed training, which enables you to train models across multiple GPUs or nodes simultaneously. Additionally, ROCm supports mixed-precision training, which can help reduce the memory and compute requirements of training workloads. For fine-tuning, ROCm provides access to various algorithms and optimization techniques. In terms of inference, ROCm provides several techniques that can help you optimize your models for deployment, such as quantization, GEMM tuning, and optimization with the Composable Kernel library.

View File

@@ -151,8 +151,8 @@ desired effect. Continuous iteration helps refine the performance gains and
address any new bottlenecks that may emerge.
ROCm provides a prebuilt optimized Docker image that has everything required to implement
the tips in this section. It includes ROCm, vLLM, PyTorch, and tuning files in the CSV
format. For more information, see :doc:`../inference/vllm-benchmark`.
the LLM inference tips in this section. It includes ROCm, PyTorch, and vLLM.
For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _mi300x-profiling-tools:
@@ -343,9 +343,10 @@ The following performance tips are not *specific* to vLLM -- they are general
but relevant in this context. You can tune the following vLLM parameters to
achieve optimal request latency and throughput performance.
* As described in :ref:`mi300x-env-vars`, the environment
variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM performance. Set it to
``export HIP_FORCE_DEV_KERNARG=1``.
* As described in `Environment variables (MI300X)
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
to ``112`` to increase the number of channels on MI300X to potentially improve
@@ -410,9 +411,9 @@ for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
usage with ROCm.
ROCm provides a prebuilt optimized Docker image for validating the performance
of LLM inference with vLLM on the MI300X accelerator. The Docker image includes
ROCm, vLLM, PyTorch, and tuning files in the CSV format. For more information,
see :doc:`../inference/vllm-benchmark`.
of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
ROCm, vLLM, and PyTorch. For more information, see
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _mi300x-vllm-throughput-measurement:
@@ -1477,8 +1478,9 @@ following command: ``cat /proc/sys/kernel/numa_balancing`` and
checking whether the output is ``0``.
If the output is ``1``, you can disable NUMA auto-balancing by running the
following command: ``sudo sysctl kernel.numa_balancing=0``. For more
details, see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
following command: ``sudo sysctl kernel.numa_balancing=0``. For more details,
see `AMD Instinct MI300X system optimization
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_.
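Taken together, a minimal check-and-disable sketch, using only the two commands cited above and run on the host, looks like this:
.. code-block:: shell
   # check the current setting -- an output of 1 means auto-balancing is active
   cat /proc/sys/kernel/numa_balancing
   # disable NUMA auto-balancing (requires root privileges)
   sudo sysctl kernel.numa_balancing=0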
.. _mi300x-rccl-disable-acs:

View File

@@ -0,0 +1,346 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
ROCm Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment designed for validating large language model
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
MI300X accelerator and includes the following components:
* `ROCm 6.2.0 <https://github.com/ROCm/ROCm>`_
* `vLLM 0.4.3 <https://docs.vllm.ai/en/latest>`_
* `PyTorch 2.4.0 <https://github.com/pytorch/pytorch>`_
* Tuning files (in CSV format)
With this Docker image, you can quickly validate the expected inference
performance numbers on the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models.
.. _vllm-benchmark-vllm:
.. note::
vLLM is a toolkit and library for LLM inference and
serving. It deploys the PagedAttention algorithm, which reduces memory
consumption and increases throughput by leveraging dynamic key and value
allocation in GPU memory. vLLM also incorporates many LLM acceleration
and quantization algorithms. In addition, AMD implements high-performance
custom kernels and modules in vLLM to enhance performance further. See
:ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for more
information.
Getting started
===============
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
.. _vllm-benchmark-get-started:
1. Disable NUMA auto-balancing.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
Once setup is complete, you can choose between two options to reproduce the
benchmark results:
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
.. _vllm-benchmark-mad:
MAD-integrated benchmarking
===========================
Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with the ``float16`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.
Although the following eight models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
Available models
----------------
.. hlist::
:columns: 3
* ``pyt_vllm_llama-3.1-8b``
* ``pyt_vllm_llama-3.1-70b``
* ``pyt_vllm_llama-3.1-405b``
* ``pyt_vllm_llama-2-7b``
* ``pyt_vllm_mistral-7b``
* ``pyt_vllm_qwen2-7b``
* ``pyt_vllm_jais-13b``
* ``pyt_vllm_jais-30b``
.. _vllm-benchmark-standalone:
Standalone benchmarking
=======================
You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.
.. code-block::
docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
docker run -it \
    --device=/dev/kfd \
    --device=/dev/dri \
    --group-add video \
    --shm-size 128G \
    --security-opt seccomp=unconfined \
    --security-opt apparmor=unconfined \
    --cap-add=SYS_PTRACE \
    -v $(pwd):/workspace \
    --env HUGGINGFACE_HUB_CACHE=/workspace \
    --name unified_docker_vllm \
    rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
Multiprocessing distributed executor
--------------------------------------
To optimize vLLM performance, add the multiprocessing API server argument ``--distributed-executor-backend mp``.
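For example, a server launch with this backend might look like the following sketch; the model name and the invocation itself are illustrative placeholders, not part of the benchmark script:
.. code-block:: shell
   # start the vLLM OpenAI-compatible API server with the
   # multiprocessing distributed executor backend
   python3 -m vllm.entrypoints.openai.api_server \
       --model meta-llama/Meta-Llama-3.1-8B-Instruct \
       --distributed-executor-backend mp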
Command
^^^^^^^^^^^^^^^^^^^^^^^^^
To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.
.. code-block:: shell
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass a Hugging Face token that is
authorized to access the gated models.
.. code-block:: shell
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. _vllm-benchmark-standalone-options:
Options
^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$model_repo``
- ``meta-llama/Meta-Llama-3.1-8B-Instruct``
- Llama 3.1 8B
* - (``float16``)
- ``meta-llama/Meta-Llama-3.1-70B-Instruct``
- Llama 3.1 70B
* -
- ``meta-llama/Meta-Llama-3.1-405B-Instruct``
- Llama 3.1 405B
* -
- ``meta-llama/Llama-2-7b-chat-hf``
- Llama 2 7B
* -
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
- Mixtral 8x7B
* -
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
- Mixtral 8x22B
* -
- ``mistralai/Mistral-7B-Instruct-v0.3``
- Mistral 7B
* -
- ``Qwen/Qwen2-7B-Instruct``
- Qwen2 7B
* -
- ``core42/jais-13b-chat``
- JAIS 13B
* -
- ``core42/jais-30b-chat-v3``
- JAIS 30B
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16``
- Data type
.. _vllm-benchmark-run-benchmark:
Running the benchmark on the MI300X accelerator
-----------------------------------------------
Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.
Latency benchmark example
^^^^^^^^^^^^^^^^^^^^^^^^^
Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
.. code-block::
./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
Find the latency report at:
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
Throughput benchmark example
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
Find the throughput report at:
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
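As a purely illustrative example with assumed numbers (16 requests, input and output lengths of 128 tokens each, and an elapsed time of 10 seconds):
- .. math:: throughput\_tot = 16 \times (128 + 128) / 10 = 409.6 \text{ tokens/s}
- .. math:: throughput\_gen = 16 \times 128 / 10 = 204.8 \text{ tokens/s}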
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,416 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
ROCm Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment designed for validating large language model
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
MI300X accelerator and includes the following components:
* `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_
* `vLLM 0.6.4 <https://docs.vllm.ai/en/latest>`_
* `PyTorch 2.5.0 <https://github.com/pytorch/pytorch>`_
* Tuning files (in CSV format)
With this Docker image, you can quickly validate the expected inference
performance numbers on the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models.
.. hlist::
:columns: 6
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 3.1 405B
* Llama 2 7B
* Llama 2 70B
* Mixtral 8x7B
* Mixtral 8x22B
* Mistral 7B
* Qwen2 7B
* Qwen2 72B
* JAIS 13B
* JAIS 30B
.. _vllm-benchmark-vllm:
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
Getting started
===============
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
.. _vllm-benchmark-get-started:
1. Disable NUMA auto-balancing.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
Once setup is complete, you can choose between two options to reproduce the
benchmark results:
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
.. _vllm-benchmark-mad:
MAD-integrated benchmarking
===========================
Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with the ``float16`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.
Although the following models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
Available models
----------------
.. hlist::
:columns: 3
* ``pyt_vllm_llama-3.1-8b``
* ``pyt_vllm_llama-3.1-70b``
* ``pyt_vllm_llama-3.1-405b``
* ``pyt_vllm_llama-2-7b``
* ``pyt_vllm_llama-2-70b``
* ``pyt_vllm_mixtral-8x7b``
* ``pyt_vllm_mixtral-8x22b``
* ``pyt_vllm_mistral-7b``
* ``pyt_vllm_qwen2-7b``
* ``pyt_vllm_qwen2-72b``
* ``pyt_vllm_jais-13b``
* ``pyt_vllm_jais-30b``
* ``pyt_vllm_llama-3.1-8b_fp8``
* ``pyt_vllm_llama-3.1-70b_fp8``
* ``pyt_vllm_llama-3.1-405b_fp8``
* ``pyt_vllm_mixtral-8x7b_fp8``
* ``pyt_vllm_mixtral-8x22b_fp8``
.. _vllm-benchmark-standalone:
Standalone benchmarking
=======================
You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.
.. code-block::
docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
docker run -it \
    --device=/dev/kfd \
    --device=/dev/dri \
    --group-add video \
    --shm-size 128G \
    --security-opt seccomp=unconfined \
    --security-opt apparmor=unconfined \
    --cap-add=SYS_PTRACE \
    -v $(pwd):/workspace \
    --env HUGGINGFACE_HUB_CACHE=/workspace \
    --name vllm_v0.6.4 \
    rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
Command
-------
To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.
.. code-block:: shell
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass a Hugging Face token that is
authorized to access the gated models.
.. code-block:: shell
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. _vllm-benchmark-standalone-options:
Options
-------
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$model_repo``
- ``meta-llama/Meta-Llama-3.1-8B-Instruct``
- Llama 3.1 8B
* - (``float16``)
- ``meta-llama/Meta-Llama-3.1-70B-Instruct``
- Llama 3.1 70B
* -
- ``meta-llama/Meta-Llama-3.1-405B-Instruct``
- Llama 3.1 405B
* -
- ``meta-llama/Llama-2-7b-chat-hf``
- Llama 2 7B
* -
- ``meta-llama/Llama-2-70b-chat-hf``
- Llama 2 70B
* -
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
- Mixtral 8x7B
* -
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
- Mixtral 8x22B
* -
- ``mistralai/Mistral-7B-Instruct-v0.3``
- Mistral 7B
* -
- ``Qwen/Qwen2-7B-Instruct``
- Qwen2 7B
* -
- ``Qwen/Qwen2-72B-Instruct``
- Qwen2 72B
* -
- ``core42/jais-13b-chat``
- JAIS 13B
* -
- ``core42/jais-30b-chat-v3``
- JAIS 30B
* - ``$model_repo``
- ``amd/Meta-Llama-3.1-8B-Instruct-FP8-KV``
- Llama 3.1 8B
* - (``float8``)
- ``amd/Meta-Llama-3.1-70B-Instruct-FP8-KV``
- Llama 3.1 70B
* -
- ``amd/Meta-Llama-3.1-405B-Instruct-FP8-KV``
- Llama 3.1 405B
* -
- ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
- Mixtral 8x7B
* -
- ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
- Mixtral 8x22B
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. _vllm-benchmark-run-benchmark:
Running the benchmark on the MI300X accelerator
-----------------------------------------------
Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.
Example 1: latency benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
.. code-block::
./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
./vllm_benchmark_report.sh -s latency -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
Find the latency reports at:
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_latency_report.csv``
Example 2: throughput benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
./vllm_benchmark_report.sh -s throughput -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
Find the throughput reports at:
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_throughput_report.csv``
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,461 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
***********************************************************
LLM inference performance validation on AMD Instinct MI300X
***********************************************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
accelerator and includes the following components:
* `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_
* `vLLM 0.6.6 <https://docs.vllm.ai/en/latest>`_
* `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_
With this Docker image, you can quickly validate the expected inference
performance numbers for the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models. For more information, see the lists of
:ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-models>`
and :ref:`standalone benchmarking <vllm-benchmark-standalone-options>`.
.. _vllm-benchmark-vllm:
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
Getting started
===============
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
.. _vllm-benchmark-get-started:
1. Disable NUMA auto-balancing.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
Once the setup is complete, choose between two options to reproduce the
benchmark results:
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
.. _vllm-benchmark-mad:
MAD-integrated benchmarking
===========================
Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
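Optionally, to keep the MAD requirements isolated from system-wide Python packages, install them into a virtual environment first. This is a standard Python workflow, not a MAD requirement:
.. code-block:: shell
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt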
Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with the ``float16`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.
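For a quick look at the collected results, you can inspect the CSV summaries directly from the shell. The report file name below is illustrative; actual names depend on the model tag:
.. code-block:: shell
# list the generated report files
ls ~/MAD/reports_float16/
# render a CSV summary as an aligned table in the terminal
column -s, -t < ~/MAD/reports_float16/summary/Llama-3.1-8B_latency_report.csv | less -S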
Although the following models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
.. _vllm-benchmark-mad-models:
Available models
----------------
.. list-table::
:header-rows: 1
:widths: 2, 3
* - Model name
- Tag
* - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
- ``pyt_vllm_llama-3.1-8b``
* - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
- ``pyt_vllm_llama-3.1-70b``
* - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
- ``pyt_vllm_llama-3.1-405b``
* - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
- ``pyt_vllm_llama-3.2-11b-vision-instruct``
* - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
- ``pyt_vllm_llama-2-7b``
* - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
- ``pyt_vllm_llama-2-70b``
* - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
- ``pyt_vllm_mixtral-8x7b``
* - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
- ``pyt_vllm_mixtral-8x22b``
* - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
- ``pyt_vllm_mistral-7b``
* - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
- ``pyt_vllm_qwen2-7b``
* - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
- ``pyt_vllm_qwen2-72b``
* - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
- ``pyt_vllm_jais-13b``
* - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
- ``pyt_vllm_jais-30b``
* - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
- ``pyt_vllm_dbrx-instruct``
* - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
- ``pyt_vllm_gemma-2-27b``
* - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
- ``pyt_vllm_c4ai-command-r-plus-08-2024``
* - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
- ``pyt_vllm_deepseek-moe-16b-chat``
* - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
- ``pyt_vllm_llama-3.1-70b_fp8``
* - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
- ``pyt_vllm_llama-3.1-405b_fp8``
* - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
- ``pyt_vllm_mixtral-8x7b_fp8``
* - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
- ``pyt_vllm_mixtral-8x22b_fp8``
* - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
- ``pyt_vllm_mistral-7b_fp8``
* - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
- ``pyt_vllm_dbrx_fp8``
* - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
- ``pyt_vllm_command-r-plus_fp8``
.. _vllm-benchmark-standalone:
Standalone benchmarking
=======================
You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.
.. code-block:: shell
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
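The flags in this command matter on ROCm systems. A functionally identical, line-wrapped form with comments may be easier to audit:
.. code-block:: shell
# --device=/dev/kfd and --device=/dev/dri expose the AMD compute driver and GPU render nodes
# --group-add video grants the container user access to the GPU device group
# --shm-size 16G enlarges shared memory used for inter-process tensor exchange
# --cap-add=SYS_PTRACE lets debuggers and profilers attach inside the container
# -v $(pwd):/workspace mounts the current directory; HUGGINGFACE_HUB_CACHE keeps model downloads there
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video \
--shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
--cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace \
--name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6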
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
Command
-------
To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.
.. code-block:: shell
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
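For instance, a single invocation that collects both latency and throughput for Llama 3.1 8B on one GPU looks like this:
.. code-block:: shell
./vllm_benchmark_report.sh -s all -m meta-llama/Llama-3.1-8B-Instruct -g 1 -d float16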
.. note::
The input sequence length, output sequence length, and tensor parallelism (TP)
degree are already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, provide a Hugging Face token that has
been granted access to the gated model.
.. code-block:: shell
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
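If the token is exported but access still fails, confirm that your Hugging Face account has been granted access on the model's page. You can also verify that the token is picked up; this is an illustrative check that assumes the ``huggingface_hub`` Python package is present in the container:
.. code-block:: shell
python3 -c "from huggingface_hub import whoami; print(whoami()['name'])"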
.. _vllm-benchmark-standalone-options:
Options and available models
----------------------------
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$model_repo``
- ``meta-llama/Llama-3.1-8B-Instruct``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
* - (``float16``)
- ``meta-llama/Llama-3.1-70B-Instruct``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* -
- ``meta-llama/Llama-3.1-405B-Instruct``
- `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
* -
- ``meta-llama/Llama-3.2-11B-Vision-Instruct``
- `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
* -
- ``meta-llama/Llama-2-7b-chat-hf``
- `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
* -
- ``meta-llama/Llama-2-70b-chat-hf``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
* -
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
- `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
* -
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
- `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
* -
- ``mistralai/Mistral-7B-Instruct-v0.3``
- `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
* -
- ``Qwen/Qwen2-7B-Instruct``
- `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
* -
- ``Qwen/Qwen2-72B-Instruct``
- `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
* -
- ``core42/jais-13b-chat``
- `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
* -
- ``core42/jais-30b-chat-v3``
- `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
* -
- ``databricks/dbrx-instruct``
- `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
* -
- ``google/gemma-2-27b``
- `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
* -
- ``CohereForAI/c4ai-command-r-plus-08-2024``
- `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
* -
- ``deepseek-ai/deepseek-moe-16b-chat``
- `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
* - ``$model_repo``
- ``amd/Llama-3.1-70B-Instruct-FP8-KV``
- `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
* - (``float8``)
- ``amd/Llama-3.1-405B-Instruct-FP8-KV``
- `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
* -
- ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
- `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
* -
- ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
- `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
* -
- ``amd/Mistral-7B-v0.1-FP8-KV``
- `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
* -
- ``amd/dbrx-instruct-FP8-KV``
- `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
* -
- ``amd/c4ai-command-r-plus-FP8-KV``
- `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. _vllm-benchmark-run-benchmark:
Running the benchmark on the MI300X accelerator
-----------------------------------------------
Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.
Example 1: latency benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use this command to benchmark the latency of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
.. code-block:: shell
./vllm_benchmark_report.sh -s latency -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
./vllm_benchmark_report.sh -s latency -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
Find the latency reports at:
- ``./reports_float16/summary/Llama-3.1-70B-Instruct_latency_report.csv``
- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_latency_report.csv``
Example 2: throughput benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use this command to benchmark the throughput of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
./vllm_benchmark_report.sh -s throughput -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
Find the throughput reports at:
- ``./reports_float16/summary/Llama-3.1-70B-Instruct_throughput_report.csv``
- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_throughput_report.csv``
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
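As a concrete illustration with made-up numbers, 100 requests with an input length of 128 tokens, an output length of 128 tokens, and an elapsed time of 10 seconds yield:
- .. math:: throughput\_tot = 100 \times (128 + 128) / 10 = 2560 \text{ tokens/s}
- .. math:: throughput\_gen = 100 \times 128 / 10 = 1280 \text{ tokens/s}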
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,329 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models:
Available models
================
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/25070a1841df0dca585b7ddcb967c42aaec4b7c5/docs/dev-docker>`__.
Getting started
===============
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
.. _vllm-benchmark-get-started:
1. Disable NUMA auto-balancing.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallelism (TP)
degree are already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, provide a Hugging Face token that has
been granted access to the gated model.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,345 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models:
Supported models
================
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7a9f58aae0e7215a5f3dccde60e35072c41656c2/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
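If you prefer not to edit the file by hand, a one-line substitution can flip the flag. This is a sketch that changes every occurrence in ``models.json``, so review the result (for example with ``git diff``) if you only want to enable it for a single model:
.. code-block:: shell
# illustrative: turn TunableOp on wherever it is currently off
sed -i 's/--tunableop off/--tunableop on/' models.json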
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallelism (TP)
degree are already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, provide a Hugging Face token that has
been granted access to the gated model.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -1,3 +1,5 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
@@ -7,9 +9,15 @@
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
@@ -101,18 +109,18 @@ vLLM inference performance testing
page provides reference throughput and latency measurements for inferencing
popular AI models.
.. note::
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
see the developer's guide at `<https://github.com/ROCm/vllm/tree/16d2b92ebcf90fe55cf73fa0b9329a6c9d3dede8/docs/dev-docker>`__.
System validation
=================
@@ -125,11 +133,13 @@ vLLM inference performance testing
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
@@ -141,7 +151,9 @@ vLLM inference performance testing
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
@@ -163,15 +175,19 @@ vLLM inference performance testing
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
@@ -206,18 +222,24 @@ vLLM inference performance testing
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
@@ -257,9 +279,12 @@ vLLM inference performance testing
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
@@ -267,7 +292,9 @@ vLLM inference performance testing
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
@@ -275,7 +302,9 @@ vLLM inference performance testing
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
@@ -304,16 +333,22 @@ Further reading
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../../hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,355 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallelism (TP)
degree are already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, provide a Hugging Face token that has
been granted access to the gated model.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,353 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.0.1_20250605-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
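After a run completes, you can inspect the collected CSV reports directly. For example (exact filenames depend on the model and run options):
.. code-block:: shell
# List the generated latency and throughput reports
ls ~/MAD/reports_{{model.precision}}/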
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
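As a sketch, assuming MAD is cloned to the current directory, you could flip the flag with a one-line edit. Note that this replaces every occurrence, so edit ``models.json`` manually if you only want to change a single model's entry.
.. code-block:: shell
# Hypothetical one-liner: change "--tunableop off" to "--tunableop on" in models.json
sed -i 's/--tunableop off/--tunableop on/g' models.json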
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
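For instance, with hypothetical numbers -- 100 requests, input and output lengths of 128 tokens each, and an elapsed time of 10 seconds -- the total throughput works out to:
.. math:: throughput\_tot = 100 \times (128 + 128) / 10 = 2560 \text{ tokens/s}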
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,82 @@
:orphan:
**************************************************
vLLM inference performance testing version history
**************************************************
This table lists previous versions of the ROCm vLLM inference Docker image for
inference performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation. You can find tagged
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`_.
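For example, to pull one of the tagged releases listed below:
.. code-block:: shell
docker pull rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702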
.. list-table::
:header-rows: 1
:stub-columns: 1
* - ROCm version
- vLLM version
- PyTorch version
- Resources
* - 6.4.1
- 0.9.1
- 2.7.0
-
* :doc:`Documentation <../vllm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab>`_
* - 6.4.1
- 0.9.0.1
- 2.7.0
-
* :doc:`Documentation <vllm-0.9.0.1-20250605>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c>`_
* - 6.3.1
- 0.8.5 (0.8.6.dev)
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250521>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`__
* - 6.3.1
- 0.8.5
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250513>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`__
* - 6.3.1
- 0.8.3
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.3-20250415>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`__
* - 6.3.1
- 0.7.3
- 2.7.0
-
* :doc:`Documentation <vllm-0.7.3-20250325>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`__
* - 6.3.1
- 0.6.6
- 2.7.0
-
* :doc:`Documentation <vllm-0.6.6>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`__
* - 6.2.1
- 0.6.4
- 2.5.0
-
* :doc:`Documentation <vllm-0.6.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`__
* - 6.2.0
- 0.4.3
- 2.4.0
-
* :doc:`Documentation <vllm-0.4.3>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`__

View File

@@ -32,10 +32,10 @@ PyTorch inference performance testing
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
@@ -103,7 +103,7 @@ PyTorch inference performance testing
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.

View File

@@ -99,21 +99,20 @@ vLLM inference performance testing
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.
page provides reference throughput and latency measurements for inferencing popular AI models.
.. note::
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
see the developer's guide at `<https://github.com/ROCm/vllm/tree/5486e7bc8523be0324ccd68f221959445b56cc2a/docs/dev-docker>`__.
System validation
=================
@@ -326,74 +325,22 @@ Further reading
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
This table lists previous versions of the ROCm vLLM inference Docker image for
inference performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - ROCm version
- vLLM version
- PyTorch version
- Resources
* - 6.3.1
- 0.8.5
- 2.7.0
-
* :doc:`Documentation <previous-versions/vllm-0.8.5-20250513>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`_
* - 6.3.1
- 0.8.3
- 2.7.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.4.0/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`_
* - 6.3.1
- 0.7.3
- 2.7.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_
* - 6.3.1
- 0.6.6
- 2.7.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
* - 6.2.1
- 0.6.4
- 2.5.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
* - 6.2.0
- 0.4.3
- 2.4.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
See :doc:`previous-versions/vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -14,14 +14,14 @@ Throughout the following topics, this section provides a comprehensive guide to
The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
training, fine-tuning, and inference. It leverages popular machine learning frameworks on AMD GPUs.
- :doc:`Installing ROCm and machine learning frameworks <install>`
- :doc:`Installing ROCm and machine learning frameworks <../install>`
- :doc:`Running models from Hugging Face <hugging-face-models>`
- :doc:`LLM inference frameworks <llm-inference-frameworks>`
- :doc:`vLLM inference performance testing <vllm-benchmark>`
- :doc:`vLLM inference performance testing <benchmark-docker/vllm>`
- :doc:`PyTorch inference performance testing <pytorch-inference-benchmark>`
- :doc:`PyTorch inference performance testing <benchmark-docker/pytorch-inference>`
- :doc:`Deploying your model <deploy-your-model>`

View File

@@ -141,7 +141,7 @@ Installing vLLM
ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
on the MI300X accelerator. The Docker image includes ROCm, vLLM, and PyTorch.
For more information, see :doc:`vllm-benchmark`.
For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _fine-tuning-llms-tgi:

View File

@@ -28,7 +28,7 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install
* :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/install-methods/amdgpu-installer-index>`
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`

View File

@@ -374,22 +374,5 @@ own cluster setup.
Previous versions
=================
This table lists previous versions of the ROCm JAX MaxText Docker image for training
performance testing. For detailed information about available models for
benchmarking, see the version-specific documentation.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Image version
- ROCm version
- JAX version
- Resources
* - 25.4
- 6.3.0
- 0.4.31
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_
See :doc:`previous-versions/jax-maxtext-history` to find documentation for previous releases
of the ``ROCm/jax-training`` Docker image.

View File

@@ -119,12 +119,12 @@ To evaluate performance, the
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this training benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
@@ -160,12 +160,14 @@ Download the Docker image
.. tab-set::
.. tab-item:: Ubuntu 24.04 + Python 3.12
:sync: py312
.. code-block:: shell
docker pull rocm/megatron-lm:v25.5_py312
.. tab-item:: Ubuntu 22.04 + Python 3.10
:sync: py310
.. code-block:: shell
@@ -173,9 +175,22 @@ Download the Docker image
2. Launch the Docker container.
.. code-block:: shell
.. tab-set::
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.5
.. tab-item:: Ubuntu 24.04 + Python 3.12
:sync: py312
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py312
.. tab-item:: Ubuntu 22.04 + Python 3.10
:sync: py310
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py310
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
@@ -749,36 +764,5 @@ The benchmark tests support the following sets of variables.
Previous versions
=================
This table lists previous versions of the ROCm Megatron-LM Docker image for training
performance testing. For detailed information about available models for
benchmarking, see the version-specific documentation.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Image version
- ROCm version
- PyTorch version
- Resources
* - 25.4
- 6.3.0
- 2.7.0a0+git637433
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`_
* - 25.3
- 6.3.0
- 2.7.0a0+git637433
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`_
* - 24.12-dev
- 6.1.0
- 2.4.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/rocm-for-ai/train-a-model.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.

View File

@@ -0,0 +1,34 @@
:orphan:
********************************************************
JAX MaxText training performance testing version history
********************************************************
This table lists previous versions of the ROCm JAX MaxText Docker image for training
performance testing. For detailed information about available models for
benchmarking, see the version-specific documentation.
You can find tagged
previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/jax-training/tags>`_.
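For example, a specific tagged image can be pulled directly:
.. code-block:: shell
docker pull rocm/jax-training:maxtext-v25.4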
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Image version
- ROCm version
- JAX version
- Resources
* - 25.5
- 6.3.4
- 0.4.35
-
* :doc:`Documentation <../jax-maxtext>`
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`_
* - 25.4
- 6.3.0
- 0.4.31
-
* :doc:`Documentation <jax-maxtext-v25.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_

View File

@@ -0,0 +1,358 @@
:orphan:
.. meta::
:description: How to train a model using JAX MaxText for ROCm.
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
**************************************
Training a model with MaxText for ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of ROCm JAX MaxText
training performance documentation. See :doc:`../jax-maxtext` for the latest version.
MaxText is a high-performance, open-source framework built on the Google JAX
machine learning library to train LLMs at scale. The MaxText framework for
ROCm is an optimized fork of the upstream
`<https://github.com/AI-Hypercomputer/maxtext>`__ repository, enabling efficient AI workloads
on AMD MI300X series accelerators.
The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.4``) image
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
It includes the following software components:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.0 |
+--------------------------+--------------------------------+
| JAX | 0.4.31 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+f81a3eb |
+--------------------------+--------------------------------+
| hipBLASLt | git78ec8622 |
+--------------------------+--------------------------------+
Supported features and models
=============================
MaxText provides the following key features to train large language models efficiently:
- Transformer Engine (TE)
- Flash Attention (FA) 3
- GEMM tuning
- Multi-node support
.. _amd-maxtext-model-support:
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 3 8B
* Llama 3 70B
* Llama 2 7B
* Llama 2 70B
* DeepSeek-V2-Lite
.. note::
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
Unsupported features
--------------------
Currently, MaxText's default packed input format is not supported. Using this format
with the current Docker image results in incorrect attention calculations
across different input sequences. Support for packed input format is planned for a future release.
System validation
=================
If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.
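As a quick sanity check -- the same check used in the validation steps -- confirm that NUMA auto-balancing is disabled:
.. code-block:: shell
# Returns 0 when NUMA auto-balancing is disabled
cat /proc/sys/kernel/numa_balancing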
Environment setup
=================
This Docker image is optimized for specific model configurations outlined
as follows. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
.. _amd-maxtext-multi-node-setup:
Multi-node setup
----------------
For multi-node environments, ensure you have all the necessary packages for
your network device, such as RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
1. Install the following packages to build and install the RDMA driver.
.. code-block:: shell
sudo apt install iproute2 -y
sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
Refer to your NIC manufacturer's documentation for further steps on
compiling and installing the RoCE driver. For example, for Broadcom,
see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
2. Set the following environment variables.
a. Master address
Change ``localhost`` to the master node's resolvable hostname or IP address:
.. code-block:: bash
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
b. Number of nodes
Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
.. code-block:: bash
export NNODES="${NNODES:-1}"
c. Node ranks
Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
Node ranks should be unique across all nodes in the cluster.
.. code-block:: bash
export NODE_RANK="${NODE_RANK:-0}"
d. Network interface
Update the network interface in the script to match your system's network interface. To
find your network interface, run the following (outside of any Docker container):
.. code-block:: bash
ip a
Look for an active interface with an IP address in the same subnet as
your other nodes. Then, update the following variable in the script, for
example:
.. code-block:: bash
export NCCL_SOCKET_IFNAME=ens50f0np0
This variable specifies which network interface to use for inter-node communication.
Setting this variable to the incorrect interface can result in communication failures
or significantly reduced performance.
e. RDMA interface
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
Then, set the RDMA interfaces to use for communication.
.. code-block:: bash
# If using Broadcom NIC
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
# If using Mellanox NIC
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
.. _amd-maxtext-download-docker:
Download the Docker image
-------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/jax-training:maxtext-v25.4
2. Run the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.4
.. _amd-maxtext-get-started:
Getting started
===============
The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
.. important::
The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.
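For example (the cache path shown is a placeholder -- substitute your own):
.. code-block:: shell
export HF_HOME=/path/to/your/huggingface/cache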
Single node training benchmarking examples
------------------------------------------
* Example 1: Single node training with Llama 2 7B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_7b.sh
* Example 2: Single node training with Llama 2 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_70b.sh
* Example 3: Single node training with Llama 3 8B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_8b.sh
* Example 4: Single node training with Llama 3 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_70b.sh
* Example 5: Single node training with DeepSeek V2 16B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
Run the single node training benchmark:
.. code-block:: shell
IMAGE="rocm/jax-training:maxtext-v25.4" bash ./deepseek_v2_16b.sh
.. note::
The TFLOP/s value reported by MaxText for DeepSeek is not accurate. Use
tokens/s as the performance indicator instead.
Multi-node training benchmarking examples
-----------------------------------------
The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
own cluster setup.
* Example 1: Multi-node training with Llama 2 7B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama2_7b_multinode.sh
* Example 2: Multi-node training with Llama 2 70B
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama2_70b_multinode.sh
* Example 3: Multi-node training with Llama 3 8B model
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama3_8b_multinode.sh
* Example 4: Multi-node training with Llama 3 70B model
Download the benchmarking script:
.. code-block:: shell
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
Run the multi-node training benchmark. For example:
.. code-block:: shell
sbatch -N <num_nodes> llama3_70b_multinode.sh
Previous versions
=================
See :doc:`jax-maxtext-history` to find documentation for previous releases
of the ``ROCm/jax-training`` Docker image.

View File

@@ -0,0 +1,47 @@
:orphan:
********************************************************
Megatron-LM training performance testing version history
********************************************************
This table lists previous versions of the ROCm Megatron-LM training Docker image for
training performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation. You can find tagged
previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/megatron-lm/tags>`_.
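For example, to fetch a tagged release from the table:
.. code-block:: shell
docker pull rocm/megatron-lm:v25.5_py312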
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Image version
- ROCm version
- PyTorch version
- Resources
* - v25.5
- 6.3.4
- 2.8.0a0+gite2f9759
-
* :doc:`Documentation <../megatron-lm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`_
* - v25.4
- 6.3.0
- 2.7.0a0+git637433
-
* :doc:`Documentation <megatron-lm-v25.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`_
* - v25.3
- 6.3.0
- 2.7.0a0+git637433
-
* :doc:`Documentation <megatron-lm-v25.3>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`_
* - v24.12-dev
- 6.1.0
- 2.4.0
-
* :doc:`Documentation <megatron-lm-v24.12-dev>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_

View File

@@ -0,0 +1,516 @@
:orphan:
.. meta::
:description: How to train a model using ROCm Megatron-LM
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
**************************************
Training a model with ROCm Megatron-LM
**************************************
.. caution::
This documentation does not reflect the latest version of ROCm Megatron-LM
training performance documentation. See :doc:`../megatron-lm` for the latest version.
.. _amd-megatron-lm:
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
For ease of use, AMD provides a ready-to-use Docker image for MI300X accelerators containing essential
components, including PyTorch, PyTorch Lightning, ROCm libraries, and Megatron-LM utilities. It contains the
following software to accelerate training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.1 |
+--------------------------+--------------------------------+
| PyTorch | 2.4.0 |
+--------------------------+--------------------------------+
| PyTorch Lightning | 2.4.0 |
+--------------------------+--------------------------------+
| Megatron Core | 0.9.0 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.5.0 |
+--------------------------+--------------------------------+
| Flash Attention | v2.6 |
+--------------------------+--------------------------------+
| Transformers | 4.44.0 |
+--------------------------+--------------------------------+
Supported features and models
=============================
Megatron-LM provides the following key features to train large language models efficiently:
- Transformer Engine (TE)
- APEX
- GEMM tuning
- Torch.compile
- 3D parallelism: TP + SP + CP
- Distributed optimizer
- Flash Attention (FA) 2
- Fused kernels
- Pre-training
.. _amd-megatron-lm-model-support:
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
* Llama 2 7B
* Llama 2 70B
* Llama 3 8B
* Llama 3 70B
* Llama 3.1 8B
* Llama 3.1 70B
Prerequisite system validation steps
====================================
Complete the following system validation and optimization steps to set up your system before starting training.
Disable NUMA auto-balancing
---------------------------
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.
.. code-block:: shell
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
for more information.
Hardware verification with ROCm
-------------------------------
Use the command ``rocm-smi --setperfdeterminism 1900`` to cap the maximum clock speed at 1900 MHz
instead of the default 2100 MHz. This reduces the chance of a PCC event lowering the attainable
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
Run the command:
.. code-block:: shell
rocm-smi --setperfdeterminism 1900
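You can restore this setting to its default value at any time:
.. code-block:: shell
rocm-smi -r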
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.
RCCL Bandwidth Test
-------------------
ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
pre-training, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
for efficient distributed training.
Running the RCCL bandwidth test helps verify that:
- The GPUs can communicate across nodes or within a single node.
- The interconnect (such as InfiniBand, Ethernet, or Infinity Fabric) is functioning as expected and
provides adequate bandwidth for communication.
- There are no hardware setup or cabling issues that could affect communication between GPUs.
Tuning and optimizing hyperparameters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In distributed training, specific hyperparameters related to distributed communication can be tuned based on
the results of the RCCL bandwidth test. These variables are already set in the Docker image:
.. code-block:: shell
# force all RCCL streams to be high priority
export TORCH_NCCL_HIGH_PRIORITY=1
# specify which RDMA interfaces to use for communication
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
# define the Global ID index used in RoCE mode
export NCCL_IB_GID_INDEX=3
# avoid data corruption/mismatch issue that existed in past releases
export RCCL_MSCCL_ENABLE=0
Running the RCCL Bandwidth Test
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
It's recommended that you run the RCCL bandwidth test before launching training to confirm that
system performance is sufficient. The RCCL tests are not included in the AMD Megatron-LM Docker
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
See :ref:`mi300x-rccl` for more information.
Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
.. code-block:: shell
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
.. image:: /data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
:width: 800
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node
setups is recommended. So, a run on 8 GPUs looks something like:
.. code-block:: shell
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
.. image:: /data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
:width: 800
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
PyTorch and TensorFlow.
Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
.. code-block::
/home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8,tw015:8 \
--mca pml ucx \
--mca btl ^openib \
-x NCCL_SOCKET_IFNAME=ens50f0np0 \
-x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
-x NCCL_IB_GID_INDEX=3 \
-x NCCL_MIN_NCHANNELS=40 \
-x NCCL_DEBUG=version \
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
.. image:: /data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
:width: 800
.. _mi300x-amd-megatron-lm-training:
Start training on MI300X accelerators
=====================================
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 2 and Llama 3.1.
Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
image.
.. _amd-megatron-lm-requirements:
Download the Docker image and required packages
-----------------------------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/megatron-lm:24.12-dev
2. Launch the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $CACHE_DIR:/root/.cache --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash
3. Clone the ROCm Megatron-LM repository to a local directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/Megatron-LM
cd Megatron-LM
.. note::
This release is validated with ``ROCm/Megatron-LM`` commit `bb93ccb <https://github.com/ROCm/Megatron-LM/tree/bb93ccbfeae6363c67b361a97a27c74ab86e7e92>`_.
Checking out this specific commit is recommended for a stable and reproducible environment.
.. code-block:: shell
git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
Prepare training datasets
-------------------------
If you already have the preprocessed data, you can skip this section.
Use the following command to preprocess datasets. This example uses GPT data. You can change the merge
table, append an end-of-document token, remove sentence splitting, and set the tokenizer type.
.. code-block:: shell
python tools/preprocess_data.py \
--input my-corpus.json \
--output-prefix my-gpt2 \
--vocab-file gpt2-vocab.json \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod
In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
``my-gpt2_text_document.idx``.
.. image:: /data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
:width: 800
.. _amd-megatron-lm-environment-setup:
Environment setup
-----------------
In the ``examples/llama`` directory of Megatron-LM, if you're working with Llama 2 7B or Llama 2 70B, use the
``train_llama2.sh`` configuration script. Likewise, if you're working with Llama 3 or Llama 3.1, then use
``train_llama3.sh`` and update the configuration script accordingly.
Network interface
^^^^^^^^^^^^^^^^^
To avoid connectivity issues, ensure the correct network interface is set in your training scripts.
1. Run the following command to find the active network interface on your system.
.. code-block:: shell
ip a
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system's network interface. For
example:
.. code-block:: shell
export NCCL_SOCKET_IFNAME=ens50f0np0
export GLOO_SOCKET_IFNAME=ens50f0np0
Dataset options
^^^^^^^^^^^^^^^
You can use either mock data or real data for training.
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
.. code-block:: shell
DATA_DIR="/root/.cache/data" # Change to where your dataset is stored
DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence
.. code-block:: shell
--data-path $DATA_PATH
Ensure that the files are accessible inside the Docker container.
* Mock data can be useful for testing and validation. If you're using mock data, replace ``--data-path $DATA_PATH`` with the ``--mock-data`` option.
.. code-block:: shell
--mock-data
Tokenizer
^^^^^^^^^
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.
To train any of the Llama 2 models that this Docker image supports, use the ``Llama2Tokenizer``.
To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
For example, if you're using the Llama 3.1 8B model:
.. code-block:: shell
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
Run benchmark tests
-------------------
.. note::
If you're running **multi-node training**, update the following environment variables. They can
also be passed as command line arguments.
* Change ``localhost`` to the master node's hostname:
.. code-block:: shell
MASTER_ADDR="${MASTER_ADDR:-localhost}"
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
.. code-block:: shell
NNODES="${NNODES:-1}"
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
.. code-block:: shell
NODE_RANK="${NODE_RANK:-0}"
* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
.. code-block:: shell
{variables} bash examples/llama/train_llama2.sh
* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
.. code-block:: shell
{variables} bash examples/llama/train_llama3.sh
.. _amd-megatron-lm-benchmark-test-vars:
The benchmark tests support the same set of variables:
+--------------------------+-----------------------+-----------------------+
| Name | Options | Description |
+==========================+=======================+=======================+
| ``TEE_OUTPUT`` | 0 or 1 | 0: disable training |
| | | log |
| | | |
| | | 1: enable training |
| | | log |
+--------------------------+-----------------------+-----------------------+
| ``MBS`` | | Micro batch size |
+--------------------------+-----------------------+-----------------------+
| ``BS`` | | Batch size |
+--------------------------+-----------------------+-----------------------+
| ``TP`` | 1, 2, 4, 8 | Tensor parallel |
+--------------------------+-----------------------+-----------------------+
| ``TE_FP8`` | 0 or 1 | Datatype. |
| | | If it is set to 1, |
| | | FP8. |
| | | |
| | | If it is set to 0, |
| | | BF16. |
+--------------------------+-----------------------+-----------------------+
| ``NO_TORCH_COMPILE`` | 0 or 1 | If it is set to 1, |
| | | enable torch.compile. |
| | | |
| | | If it is set to 0, |
| | | Disable torch.compile |
| | | (default) |
+--------------------------+-----------------------+-----------------------+
| ``SEQ_LENGTH`` | | Input sequence length |
+--------------------------+-----------------------+-----------------------+
| ``GEMM_TUNING`` | 0 or 1 | If it is set to 1, |
| | | enable gemm tuning. |
| | | |
| | | If it is set to 0, |
| | | disable gemm tuning |
+--------------------------+-----------------------+-----------------------+
| ``USE_FLASH_ATTN`` | 0 or 1 | 0: disable flash |
| | | attention |
| | | |
| | | 1: enable flash |
| | | attention |
+--------------------------+-----------------------+-----------------------+
| ``ENABLE_PROFILING`` | 0 or 1 | 0: disable torch |
| | | profiling |
| | | |
| | | 1: enable torch |
| | | profiling |
+--------------------------+-----------------------+-----------------------+
| ``MODEL_SIZE`` | | The size of the model: |
| | | 7B/70B, etc. |
+--------------------------+-----------------------+-----------------------+
| ``TOTAL_ITERS`` | | Total number of |
| | | iterations |
+--------------------------+-----------------------+-----------------------+
| ``transformer-impl`` | transformer_engine or | Enable transformer |
| | local | engine by default |
+--------------------------+-----------------------+-----------------------+
Benchmarking examples
^^^^^^^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Single node training
:sync: single
Use this command to run training with the Llama 2 7B model on a single node. You can specify MBS, BS, TP,
datatype, and so on.
.. code-block:: bash
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
See the sample output:
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
:width: 800
.. tab-item:: Multi node training
:sync: multi
Launch the Docker container on each node.
In this example, run training with the Llama 2 7B model on 2 nodes with specific MBS, BS, TP, datatype, and
so on.
On the master node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
On the worker node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
Sample output for 2-node training:
Master node:
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
:width: 800
Worker node:
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
:width: 800
Previous versions
=================
See :doc:`megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.

View File

@@ -0,0 +1,536 @@
:orphan:
.. meta::
:description: How to train a model using Megatron-LM for ROCm.
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
******************************************
Training a model with Megatron-LM for ROCm
******************************************
.. caution::
This documentation does not reflect the latest version of ROCm Megatron-LM
training performance documentation. See :doc:`../megatron-lm` for the latest version.
The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
designed to enable efficient training of large-scale language models on AMD
GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
enhanced scalability, performance, and resource utilization for AI workloads.
It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
DeepSeek, enabling developers to train next-generation AI models more
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
AMD provides a ready-to-use Docker image for MI300X accelerators containing
essential components, including PyTorch, ROCm libraries, and Megatron-LM
utilities. It contains the following software components to accelerate training
workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.0 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.11 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git258a2162 |
+--------------------------+--------------------------------+
| Triton | 3.1 |
+--------------------------+--------------------------------+
Supported features and models
=============================
Megatron-LM provides the following key features to train large language models efficiently:
- Transformer Engine (TE)
- APEX
- GEMM tuning
- Torch.compile
- 3D parallelism: TP + SP + CP
- Distributed optimizer
- Flash Attention (FA) 3
- Fused kernels
- Pre-training
.. _amd-megatron-lm-model-support:
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
* Llama 2 7B
* Llama 2 70B
* Llama 3 8B
* Llama 3 70B
* Llama 3.1 8B
* Llama 3.1 70B
* DeepSeek-V2-Lite
.. note::
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
System validation
=================
If you have already validated your system settings, skip this step. Otherwise,
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
to set up your system before starting training.
Disable NUMA auto-balancing
---------------------------
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.
.. code-block:: shell
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
for more information.
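You can also combine the check and the change in one small, hedged snippet that only writes when auto-balancing is still enabled:
.. code-block:: shell
if [ "$(cat /proc/sys/kernel/numa_balancing)" != "0" ]; then
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
fi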
.. _mi300x-amd-megatron-lm-training:
Environment setup
=================
The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
image.
.. _amd-megatron-lm-requirements:
Download the Docker image
-------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/megatron-lm:v25.3
2. Launch the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
.. code-block:: shell
docker start megatron_training_env
docker exec -it megatron_training_env bash
The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
.. _amd-megatron-lm-environment-setup:
Configuration scripts
---------------------
.. tab-set::
.. tab-item:: Llama
:sync: llama
If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
script in the ``examples/llama`` directory of
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
the configuration script accordingly.
.. tab-item:: DeepSeek V2
:sync: deepseek
Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
directory of
`<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
and update the configuration script accordingly.
Network interface
^^^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
To avoid connectivity issues in multi-node deployments, ensure the correct network interface
is set in your training scripts.
1. Run the following command (outside the container) to find the active network interface on your system.
.. code-block:: shell
ip a
2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system's network interface. For
example:
.. code-block:: shell
export NCCL_SOCKET_IFNAME=ens50f0np0
export GLOO_SOCKET_IFNAME=ens50f0np0
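To verify at launch time that NCCL actually selects this interface, you can optionally enable NCCL's standard debug logging (``NCCL_DEBUG`` is a stock NCCL environment variable, not specific to this image):
.. code-block:: shell
export NCCL_DEBUG=INFO # logs the network interfaces NCCL binds to at startup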
Dataset options
^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
You can use either mock data or real data for training.
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
value is ``1`` for enabled.
.. code-block:: bash
MOCK_DATA=1
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
.. code-block:: bash
MOCK_DATA=0
DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"} # Change to where your dataset is stored
Ensure that the files are accessible inside the Docker container.
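For example, a quick sanity check from inside the container, using the example ``DATA_PATH`` above (Megatron-LM preprocessed datasets are stored as ``.bin``/``.idx`` pairs):
.. code-block:: shell
ls -lh /data/bookcorpus_text_sentence.* # expect the .bin and .idx index files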
.. tab-item:: DeepSeek V2
:sync: deepseek
If you don't already have the dataset, download the DeepSeek dataset using the following
commands:
.. code-block:: shell
mkdir deepseek-datasets
cd deepseek-datasets
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
You can use either mock data or real data for training.
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
value is ``1`` for enabled.
.. code-block:: bash
MOCK_DATA=1
* If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
.. code-block:: bash
MOCK_DATA=0
DATA_DIR="/root/data/deepseek-datasets" # Change to where your dataset is stored
Ensure that the files are accessible inside the Docker container.
Tokenizer
^^^^^^^^^
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.
.. tab-set::
.. tab-item:: Llama
:sync: llama
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
For example, if you're using the Llama 3.1 8B model:
.. code-block:: shell
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
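The Llama 3.1 repository on Hugging Face is gated, so if the tokenizer isn't already cached, you may also need to export your personal access token before launching training:
.. code-block:: shell
export HF_TOKEN=$your_personal_hf_token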
.. tab-item:: DeepSeek V2
:sync: deepseek
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
Multi-node training
^^^^^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
If you're running multi-node training, update the following environment variables. They can
also be passed as command line arguments; a combined sketch follows this list.
* Change ``localhost`` to the master node's hostname:
.. code-block:: shell
MASTER_ADDR="${MASTER_ADDR:-localhost}"
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
.. code-block:: shell
NNODES="${NNODES:-1}"
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
.. code-block:: shell
NODE_RANK="${NODE_RANK:-0}"
* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
NFS directory) for multi-node runs:
.. code-block:: shell
DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
inside a Docker container, either install the drivers inside the container or pass the network
drivers from the host while creating the Docker container.
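Putting these together, here is a hedged sketch of the master node's environment for a two-node run (the hostname and cache path are placeholders):
.. code-block:: shell
export MASTER_ADDR=node0-hostname # master node's hostname (placeholder)
export NNODES=2
export NODE_RANK=0 # use 1 on the worker node
export DATA_CACHE_PATH=/root/cache # common directory visible to all nodes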
Start training on AMD Instinct accelerators
===========================================
The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.
Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI300X series
accelerators with the AMD Megatron-LM Docker image.
.. tab-set::
.. tab-item:: Llama
:sync: llama
.. tab-set::
.. tab-item:: Single node training
:sync: single-node
To run training on a single node, navigate to the Megatron-LM folder and use the
following command:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh
.. tab-item:: Multi-node training
:sync: multi-node
To run training on multiple nodes, launch the Docker container on each node. For example, for a two-node setup (``NODE0`` as the master node), use these commands.
* On the master node ``NODE0``:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
* On the worker node ``NODE1``:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
.. tab-item:: DeepSeek V2
:sync: deepseek
To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:
.. code-block:: shell
cd /workspace/Megatron-LM
GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh
Key options
-----------
.. _amd-megatron-lm-benchmark-test-vars:
The benchmark tests support the following sets of variables:
.. tab-set::
.. tab-item:: Llama
:sync: llama
``TEE_OUTPUT``
``1`` to enable training logs or ``0`` to disable.
``TE_FP8``
``0`` for BF16 (default) or ``1`` for FP8 GEMMs.
``GEMM_TUNING``
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
``USE_FLASH_ATTN``
``1`` to enable Flash Attention.
``ENABLE_PROFILING``
``1`` to enable PyTorch profiling for performance analysis.
``transformer-impl``
``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
``MODEL_SIZE``
``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
``TOTAL_ITERS``
The total number of iterations -- ``10`` by default.
``MOCK_DATA``
``1`` to use mock data or ``0`` to use real data provided by you.
``MBS``
Micro batch size.
``BS``
Global batch size.
``TP``
Tensor parallel (``1``, ``2``, ``4``, ``8``).
``SEQ_LENGTH``
Input sequence length.
.. tab-item:: DeepSeek V2
:sync: deepseek
``PR``
Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
``GEMM_TUNING``
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
``TOTAL_ITERS``
The total number of iterations -- ``10`` by default.
``MOCK_DATA``
``1`` to use mock data or ``0`` to use real data provided by you.
``MBS``
Micro batch size.
``GBS``
Global batch size.
Benchmarking examples
---------------------
.. tab-set::
.. tab-item:: Llama
:sync: llama
.. tab-set::
.. tab-item:: Single node training
:sync: single-node
Use this command to run training with the Llama 2 7B model on a single node. You can specify MBS, BS, FP,
datatype, and so on.
.. code-block:: bash
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
See the sample output:
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
:width: 800
.. tab-item:: Multi-node training
:sync: multi-node
Launch the Docker container on each node.
In this example, run training with the Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
so on.
On the master node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
On the worker node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
Sample output for 2-node training:
Master node:
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
:width: 800
Worker node:
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
:width: 800
Previous versions
=================
See :doc:`megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.


@@ -0,0 +1,618 @@
:orphan:
.. meta::
:description: How to train a model using Megatron-LM for ROCm.
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
******************************************
Training a model with Megatron-LM for ROCm
******************************************
.. caution::
This documentation does not reflect the latest version of ROCm Megatron-LM
training performance documentation. See :doc:`../megatron-lm` for the latest version.
The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
designed to enable efficient training of large-scale language models on AMD
GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
enhanced scalability, performance, and resource utilization for AI workloads.
It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
DeepSeek, enabling developers to train next-generation AI models more
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
AMD provides a ready-to-use Docker image for MI300X series accelerators containing
essential components, including PyTorch, ROCm libraries, and Megatron-LM
utilities. It contains the following software components to accelerate training
workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.0 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.11 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git258a2162 |
+--------------------------+--------------------------------+
| Triton | 3.1 |
+--------------------------+--------------------------------+
Supported features and models
=============================
Megatron-LM provides the following key features to train large language models efficiently:
- Transformer Engine (TE)
- APEX
- GEMM tuning
- Torch.compile
- 3D parallelism: TP + SP + CP
- Distributed optimizer
- Flash Attention (FA) 3
- Fused kernels
- Pre-training
.. _amd-megatron-lm-model-support:
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 3 8B
* Llama 3 70B
* Llama 2 7B
* Llama 2 70B
* DeepSeek-V2-Lite
.. note::
Some models, such as Llama, require an external license agreement through
a third party (for example, Meta).
.. _amd-megatron-lm-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the :doc:`latest version of this training benchmarking environment <../megatron-lm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.
.. _mi300x-amd-megatron-lm-training:
Environment setup
=================
The prebuilt ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
image.
.. _amd-megatron-lm-requirements:
Download the Docker image
-------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/megatron-lm:v25.4
2. Launch the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.4
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
.. code-block:: shell
docker start megatron_training_env
docker exec -it megatron_training_env bash
The Docker container includes a pre-installed, verified version of the ROCm Megatron-LM development branch `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__
(commit `fd6f01 <https://github.com/ROCm/Megatron-LM/tree/fd6f0d11d7f9480ace32f22eb7e4dab5314fa350>`_).
.. _amd-megatron-lm-environment-setup:
Configuration scripts
---------------------
.. tab-set::
.. tab-item:: Llama
:sync: llama
If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
script in the ``examples/llama`` directory of
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__.
Likewise, if you're working with Llama 3 or Llama 3.1, use ``train_llama3.sh`` and update
the configuration script accordingly.
.. tab-item:: DeepSeek V2
:sync: deepseek
Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
directory of
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__
and update the configuration script accordingly.
Network interface
^^^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
Update the network interface in the script to match your system's network interface. To
find your network interface, run the following (outside of any Docker container):
.. code-block:: bash
ip a
Look for an active interface that has an IP address in the same subnet as
your other nodes. Then, update the following variables in the script, for
example:
.. code-block:: bash
export NCCL_SOCKET_IFNAME=ens50f0np0
export GLOO_SOCKET_IFNAME=ens50f0np0
Dataset options
^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
You can use either mock data or real data for training.
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
value is ``1`` for enabled.
.. code-block:: bash
MOCK_DATA=1
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
.. code-block:: bash
MOCK_DATA=0
DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored
Ensure that the files are accessible inside the Docker container.
To download the dataset, set the ``DATASET`` variable to the dataset you'd like to use. Two datasets are supported: ``DATASET=wiki`` and ``DATASET=bookcorpus``.
Use the following command to download the dataset.
.. code-block:: shell
DATASET=wiki bash examples/llama/prepare_dataset.sh # For wiki-en dataset
DATASET=bookcorpus bash examples/llama/prepare_dataset.sh # For bookcorpus dataset
.. tab-item:: DeepSeek V2
:sync: deepseek
If you don't already have the dataset, download the DeepSeek dataset using the following
commands:
.. code-block:: shell
mkdir deepseek-datasets
cd deepseek-datasets
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
You can use either mock data or real data for training.
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
value is ``1`` for enabled.
.. code-block:: bash
MOCK_DATA=1
* If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
.. code-block:: bash
MOCK_DATA=0
DATA_DIR="/root/data/deepseek-datasets" # Change to where your dataset is stored
Ensure that the files are accessible inside the Docker container.
Tokenizer
^^^^^^^^^
Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.
You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` variable as shown in the following examples.
If the tokenizer is not found, it'll be downloaded to the default tokenizer model path: ``${DATA_DIR}/tokenizer_llama3``
or ``${DATA_DIR}/tokenizer_llama2``.
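For instance, to reuse a Llama 3 tokenizer already on disk instead of downloading one, you could point ``TOKENIZER_MODEL`` at that default path (illustrative; adjust to your ``DATA_DIR``):
.. code-block:: shell
TOKENIZER_MODEL=${DATA_DIR}/tokenizer_llama3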
.. tab-set::
.. tab-item:: Llama
:sync: llama
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
or the default ``HuggingFaceTokenizer``.
To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
Set the Hugging Face model path in the ``TOKENIZER_MODEL`` variable.
For example, if you're using the Llama 3.1 8B model:
.. code-block:: shell
TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
.. note::
If you don't already have the Llama 3.1 tokenizer locally, set your
personal Hugging Face access token ``HF_TOKEN`` to download the
tokenizer. If you encounter the following error, set ``HF_TOKEN`` to
your access-authorized Hugging Face token.
.. code-block:: shell
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. tab-item:: DeepSeek V2
:sync: deepseek
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
Multi-node training
^^^^^^^^^^^^^^^^^^^
.. tab-set::
.. tab-item:: Llama
:sync: llama
If you're running multi-node training, update the following environment variables. They can
also be passed as command line arguments.
* Change ``localhost`` to the master node's hostname:
.. code-block:: shell
MASTER_ADDR="${MASTER_ADDR:-localhost}"
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
.. code-block:: shell
NNODES="${NNODES:-1}"
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
.. code-block:: shell
NODE_RANK="${NODE_RANK:-0}"
* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
NFS directory) for multi-node runs:
.. code-block:: shell
DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
inside a Docker container, either install the drivers inside the Docker container or pass the network
drivers from the host while creating the Docker container.
.. code-block:: shell
# Specify which RDMA interfaces to use for communication
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
Start training on AMD Instinct accelerators
===========================================
The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.
Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI300X series
accelerators with the AMD Megatron-LM Docker image.
.. tab-set::
.. tab-item:: Llama
:sync: llama
.. tab-set::
.. tab-item:: Single node training
:sync: single-node
To run training on a single node, navigate to the Megatron-LM folder and use one of the
following commands.
- For Llama 3.1 8B FP8:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
- For Llama 3.1 8B BF16:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
- For Llama 2 7B FP8:
.. code-block:: shell
TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
- For Llama 2 7B BF16:
.. code-block:: shell
TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
To run training with FSDP2 enabled, add the ``FSDP=1`` argument. For example:
- For Llama 3 70B BF16:
.. code-block:: shell
TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
- For Llama 2 70B BF16:
.. code-block:: shell
TEE_OUTPUT=1 MBS=3 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
.. note::
It's suggested to use ``TP=1`` when FSDP is enabled for higher throughput. FSDP2 is not supported with pipeline parallelism,
expert parallelism, MCore's distributed optimizer, gradient accumulation fusion, and ``FP16`` precision.
.. tab-item:: Multi-node training
:sync: multi-node
To run training on multiple nodes, launch the Docker container on each node. For example, for a two-node setup (``NODE0`` as the master node), use these commands.
* On the master node ``NODE0``:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
* On the worker node ``NODE1``:
.. code-block:: shell
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
.. tab-item:: DeepSeek V2
:sync: deepseek
To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:
.. code-block:: shell
cd /workspace/Megatron-LM
GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh
Key options
-----------
.. _amd-megatron-lm-benchmark-test-vars:
The benchmark tests support the following sets of variables:
.. tab-set::
.. tab-item:: Llama
:sync: llama
``TEE_OUTPUT``
``1`` to enable training logs or ``0`` to disable.
``TE_FP8``
``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.
``GEMM_TUNING``
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
``USE_FLASH_ATTN``
``1`` to enable Flash Attention.
``FSDP``
``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.
``ENABLE_PROFILING``
``1`` to enable PyTorch profiling for performance analysis.
``transformer-impl``
``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
``MODEL_SIZE``
``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
``TOTAL_ITERS``
The total number of iterations -- ``10`` by default.
``MOCK_DATA``
``1`` to use mock data or ``0`` to use real data you provide.
``MBS``
Micro batch size.
``BS``
Global batch size.
``TP``
Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on.
``SEQ_LENGTH``
Input sequence length.
.. tab-item:: DeepSeek V2
:sync: deepseek
``PR``
Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
``GEMM_TUNING``
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
``TRAIN_ITERS``
The total number of iterations.
``MOCK_DATA``
``1`` to use mock data or ``0`` to use real data you provide.
``MBS``
Micro batch size.
``GBS``
Global batch size.
``SEQ_LEN``
Input sequence length.
``AC``
Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.
Benchmarking examples
---------------------
.. tab-set::
.. tab-item:: Llama
:sync: llama
.. tab-set::
.. tab-item:: Single node training
:sync: single-node
Use this command to run training with the Llama 2 7B model on a single node. You can specify MBS, BS, FP,
datatype, and so on.
.. code-block:: bash
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
See the sample output:
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
:width: 800
.. tab-item:: Multi-node training
:sync: multi-node
Launch the Docker container on each node.
In this example, run training with the Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
so on.
On the master node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
On the worker node:
.. code-block:: bash
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
Sample output for 2-node training:
Master node:
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
:width: 800
Worker node:
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
:width: 800
Previous versions
=================
See :doc:`megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.


@@ -0,0 +1,47 @@
:orphan:
****************************************************
PyTorch training performance testing version history
****************************************************
This table lists previous versions of the ROCm PyTorch training Docker image for
training performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation. You can find tagged
previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/pytorch-training/tags>`_.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Image version
- ROCm version
- PyTorch version
- Resources
* - v25.6
- 6.3.4
- 2.8.0a0+git7d205b2
-
* :doc:`Documentation <../pytorch-training>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
* - v25.5
- 6.3.4
- 2.7.0a0+git637433
-
* :doc:`Documentation <pytorch-training-v25.5>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
* - v25.4
- 6.3.0
- 2.7.0a0+git637433
-
* :doc:`Documentation <pytorch-training-v25.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`_
* - v25.3
- 6.3.0
- 2.7.0a0+git637433
-
* :doc:`Documentation <pytorch-training-v25.3>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`_


@@ -0,0 +1,353 @@
:orphan:
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
Training a model with PyTorch for ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of ROCm PyTorch
training performance documentation. See :doc:`../pytorch-training` for the latest version.
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The PyTorch for ROCm training Docker image (``rocm/pytorch-training:v25.3``)
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.0 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.11 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git258a2162 |
+--------------------------+--------------------------------+
| Triton | 3.1 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
* Llama 3.1 8B
* Llama 3.1 70B
* FLUX.1-dev
.. note::
Only these models are supported in the following steps.
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
System validation
=================
If you have already validated your system settings, skip this step. Otherwise,
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
to set up your system before starting training.
Disable NUMA auto-balancing
---------------------------
Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.
.. code-block:: shell
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
for more information.
Environment setup
=================
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
Download the Docker image
-------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/pytorch-training:v25.3
2. Run the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.3
3. Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__ repository and navigate to the benchmark scripts directory.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch-train
Prepare training datasets and dependencies
------------------------------------------
The following benchmarking examples may require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
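For example, with your personal access token:
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token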
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. list-table::
:header-rows: 1
* - Library
- Benchmark model
- Reference
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
Along with the following datasets:
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
Start training on AMD Instinct accelerators
===========================================
The prebuilt PyTorch with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.
Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI300X series
accelerators with the AMD PyTorch training Docker image.
Once your environment is set up, use the following commands and examples to start benchmarking.
Pretraining
-----------
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
Options and available models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* - ``$datatype``
- FP8 or BF16
- Only Llama 3.1 8B supports FP8 precision.
* - ``$model_repo``
- Llama-3.1-8B
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
* -
- Llama-3.1-70B
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* -
- Flux
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
Fine-tuning
-----------
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
Benchmarking examples
---------------------
Here are some examples of how to use the command.
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerator.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
Previous versions
=================
See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.


@@ -0,0 +1,397 @@
:orphan:
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
Training a model with PyTorch for ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of ROCm PyTorch
training performance documentation. See :doc:`../pytorch-training` for the latest version.
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The PyTorch for ROCm training Docker image (``rocm/pytorch-training:v25.4``)
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.0 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.11 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git258a2162 |
+--------------------------+--------------------------------+
| Triton | 3.1 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 2 70B
* FLUX.1-dev
.. note::
Only these models are supported in the following steps.
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.
Environment setup
=================
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
Download the Docker image
-------------------------
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/pytorch-training:v25.4
2. Run the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.4
3. Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
Prepare training datasets and dependencies
------------------------------------------
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. list-table::
:header-rows: 1
* - Library
- Benchmark model
- Reference
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
Along with the following datasets:
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
Getting started
===============
The prebuilt PyTorch with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.
Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI325X and MI300X
accelerators with the AMD PyTorch training Docker image.
Once your environment is set up, use the following commands and examples to start benchmarking.
Pretraining
-----------
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
Options and available models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* -
- ``HF_finetune_lora``
- Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
* - ``$datatype``
- ``FP8`` or ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
* - ``$model_repo``
- ``Llama-3.1-8B``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
* -
- ``Llama-3.1-70B``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* -
- ``Llama-2-70B``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
* -
- ``Flux``
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
* - ``$sequence_length``
- Between 2048 and 8192 (8192 by default)
- Sequence length for the language model.
.. note::
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
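One hedged way to perform the manual download, assuming the ``huggingface_hub`` CLI is available in the container and ``HF_TOKEN`` is set for this gated repository:
.. code-block:: shell
huggingface-cli download black-forest-labs/FLUX.1-dev --local-dir /workspace/FluxBenchmark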
Fine-tuning
-----------
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
Benchmarking examples
---------------------
Here are some examples of how to use the command.
* Example 1: Llama 3.1 70B with BF16 precision using `torchtitan <https://github.com/ROCm/torchtitan>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* Example 3: FLUX.1-dev with BF16 precision using FluxBenchmark.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
* Example 6: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
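To sweep several of these configurations in one session, the report script can be invoked
in a simple loop (an illustrative sketch, not part of the benchmark scripts themselves):
.. code-block:: shell
# Run full weight and LoRA fine-tuning back to back on the same model
for mode in finetune_fw finetune_lora; do
./pytorch_benchmark_report.sh -t "$mode" -p BF16 -m Llama-3.1-70B
done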
Previous versions
=================
See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.


@@ -0,0 +1,439 @@
:orphan:
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
Training a model with PyTorch for ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of the ROCm PyTorch
training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
(``rocm/pytorch-training:v25.5``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+25a33da |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git53b53bf |
+--------------------------+--------------------------------+
| Triton | 3.2.0 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
* Llama 3.3 70B
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 2 70B
* FLUX.1-dev
.. note::
Only these models are supported in the following steps.
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
Benchmarking
============
Once the setup is complete, choose between two options to start benchmarking:
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
using one GPU with the float16 data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
The available models for MAD-integrated benchmarking are:
* ``pyt_train_llama-3.3-70b``
* ``pyt_train_llama-3.1-8b``
* ``pyt_train_llama-3.1-70b``
* ``pyt_train_flux``
MAD launches a Docker container with the name
``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
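As a quick sanity check, you can inspect the collected results directly on the host
(a minimal sketch; any CSV viewer works):
.. code-block:: shell
# Pretty-print the MAD report as aligned columns
column -s, -t < ~/MAD/perf.csv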
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required packages
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/pytorch-training:v25.5
Run the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
In the Docker container, clone the `MAD <https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
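To confirm the token is valid before starting large downloads, you can optionally query
Hugging Face (assuming the ``huggingface_hub`` CLI is available in the container):
.. code-block:: shell
# Prints the account associated with HF_TOKEN
huggingface-cli whoami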
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. list-table::
:header-rows: 1
* - Library
- Benchmark model
- Reference
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
It also downloads the following datasets:
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
.. rubric:: Pretraining
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* -
- ``HF_finetune_lora``
- Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
* - ``$datatype``
- ``FP8`` or ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
* - ``$model_repo``
- ``Llama-3.3-70B``
- `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_
* -
- ``Llama-3.1-8B``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
* -
- ``Llama-3.1-70B``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* -
- ``Llama-2-70B``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
* -
- ``Flux``
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
.. note::
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
.. rubric:: Fine-tuning
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
.. rubric:: Benchmarking examples
Here are some example commands to get started pretraining and fine-tuning with various model configurations.
* Example 1: Llama 3.1 70B with BF16 precision using `torchtitan <https://github.com/ROCm/torchtitan>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* Example 3: FLUX.1-dev with BF16 precision using FluxBenchmark.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
* Example 6: Torchtune full weight fine-tuning with Llama 3.3 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B
* Example 7: Torchtune LoRA fine-tuning with Llama 3.3 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B
* Example 8: Torchtune QLoRA fine-tuning with Llama 3.3 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B
* Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B


@@ -9,28 +9,27 @@ Training a model with PyTorch for ROCm
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
(``rocm/pytorch-training:v25.5``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:
The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
| PyTorch | 2.8.0a0+git7d205b2 |
+--------------------------+--------------------------------+
| Python | 3.10 |
| Python | 3.10.17 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+25a33da |
| Transformer Engine | 1.14.0+2f85f5f2 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
| Flash Attention | 3.0.0.post1 |
+--------------------------+--------------------------------+
| hipBLASLt | git53b53bf |
| hipBLASLt | 0.15.0-8c6919d |
+--------------------------+--------------------------------+
| Triton | 3.2.0 |
| Triton | 3.3.0 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
@@ -40,422 +39,396 @@ Supported models
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
* Llama 3.3 70B
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
* Llama 3.1 8B
{% set unified_docker = data.unified_docker.latest %}
{% set model_groups = data.model_groups %}
* Llama 3.1 70B
.. raw:: html
* Llama 2 70B
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Workload</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
* FLUX.1-dev
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. note::
.. note::
Only these models are supported in the following steps.
Some models require an external license agreement through a third party (for example, Meta).
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements:
.. _amd-pytorch-training-performance-measurements:
Performance measurements
========================
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
page provides reference throughput and latency measurements for training
popular AI models.
System validation
=================
.. note::
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
System validation
=================
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
Benchmarking
============
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Once the setup is complete, choose between two options to start benchmarking:
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
.. tab-set::
Benchmarking
============
.. tab-item:: MAD-integrated benchmarking
Once the setup is complete, choose between two options to start benchmarking:
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. tab-set::
.. code-block:: shell
.. tab-item:: MAD-integrated benchmarking
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
using one GPU with the float16 data type on the host machine.
.. code-block:: shell
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
{% for model_group in model_groups %}
{% for model in model_group.models %}
The available models for MAD-integrated benchmarking are:
.. container:: model-doc {{ model.mad_tag }}
* ``pyt_train_llama-3.3-70b``
For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one GPU with the {{ model.precision }} data type on the host machine.
* ``pyt_train_llama-3.1-8b``
.. code-block:: shell
* ``pyt_train_llama-3.1-70b``
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{ model.mad_tag }} --keep-model-dir --live-output --timeout 28800
* ``pyt_train_flux``
MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
MAD launches a Docker container with the name
``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
{% endfor %}
{% endfor %}
.. tab-item:: Standalone benchmarking
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required packages
.. rubric:: Download the Docker image and required packages
Use the following command to pull the Docker image from Docker Hub.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
.. code-block:: shell
docker pull rocm/pytorch-training:v25.5
docker pull {{ unified_docker.pull_tag }}
Run the Docker container.
Run the Docker container.
.. code-block:: shell
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
Use these commands if you exit the ``training_env`` container and need to return to it.
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
docker start training_env
docker exec -it training_env bash
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
.. rubric:: Prepare training datasets and dependencies
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
export HF_TOKEN=$your_personal_hugging_face_access_token
Run the setup script to install libraries and datasets needed for benchmarking.
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
.. code-block:: shell
./pytorch_benchmark_setup.sh
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. container:: model-doc pyt_train_llama-3.1-8b
.. list-table::
:header-rows: 1
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
* - Library
- Benchmark model
- Reference
.. list-table::
:header-rows: 1
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - Library
- Reference
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
.. container:: model-doc pyt_train_llama-3.1-70b
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
.. list-table::
:header-rows: 1
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - Library
- Reference
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``torchdata``
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
.. container:: model-doc pyt_train_flux
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
.. list-table::
:header-rows: 1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - Library
- Reference
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
It also downloads the following datasets:
* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
.. rubric:: Pretraining
* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
.. code-block:: shell
* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
.. list-table::
:header-rows: 1
* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - Name
- Options
- Description
* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
* -
- ``HF_finetune_lora``
- Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
{% for model_group in model_groups %}
{% for model in model_group.models %}
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
* - ``$datatype``
- ``FP8`` or ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
.. container:: model-doc {{ model.mad_tag }}
* - ``$model_repo``
- ``Llama-3.3-70B``
- `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_
.. rubric:: Pretraining
* -
- ``Llama-3.1-8B``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
* -
- ``Llama-3.1-70B``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
.. code-block:: shell
* -
- ``Llama-2-70B``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
* -
- ``Flux``
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
.. list-table::
:header-rows: 1
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
* - Name
- Options
- Description
.. note::
{% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
* - ``$datatype``
- ``BF16`` or ``FP8``
- Only Llama 3.1 8B supports FP8 precision.
{% else %}
* - ``$datatype``
- ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
{% endif %}
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
.. rubric:: Fine-tuning
{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. note::
.. code-block:: shell
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
{% endif %}
{% endif %}
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
{% if model_group.tag == "fine-tuning" %}
.. container:: model-doc {{ model.mad_tag }}
Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
.. rubric:: Fine-tuning
.. code-block:: shell
To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
.. code-block:: shell
.. rubric:: Benchmarking examples
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
Here are some example commands to get started pretraining and fine-tuning with various model configurations.
.. list-table::
:header-rows: 1
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
* - Name
- Options
- Description
.. code-block:: shell
* - ``$training_mode``
- ``finetune_fw``
- Full weight fine-tuning (BF16 supported)
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* -
- ``finetune_lora``
- LoRA fine-tuning (BF16 supported)
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
* -
- ``finetune_qlora``
- QLoRA fine-tuning (BF16 supported)
.. code-block:: shell
* -
- ``HF_finetune_lora``
- LoRA fine-tuning with Hugging Face PEFT
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* - ``$datatype``
- ``BF16``
- All models support BF16.
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
.. code-block:: shell
.. note::
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
{{ model.model }} currently supports the following fine-tuning methods:
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
{% for method in model.training_modes %}
* ``{{ method }}``
{% endfor %}
{% if model.training_modes|length < 4 %}
.. code-block:: shell
The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
does not currently provide YAML configuration files for other combinations of
model and fine-tuning method.
However, you can still configure your own YAML files to enable support for
fine-tuning methods not listed here by following existing patterns in the
``/workspace/torchtune/recipes/configs`` directory.
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
.. rubric:: Benchmarking examples
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
* Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B
* Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B
* Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B
* Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
Previous versions
=================
This table lists previous versions of the ROCm PyTorch training Docker image for training
performance validation. For detailed information about available models for
benchmarking, see the version-specific documentation.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Image version
- ROCm version
- PyTorch version
- Resources
* - v25.4
- 6.3.0
- 2.7.0a0+git637433
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`_
* - v25.3
- 6.3.0
- 2.7.0a0+git637433
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`_
See :doc:`previous-versions/pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.


@@ -76,14 +76,6 @@ Ubuntu versions.
single node workstations, multi and many-core nodes, clusters of nodes via
QMP, and classic vector computers.
* -
- `Grid <https://github.com/amd/InfinityHub-CI/tree/main/grid/>`_
- Grid is a library for lattice QCD calculations that employs a high-level data parallel
approach while using a number of techniques to target multiple types of parallelism.
The library currently supports MPI, OpenMP and short vector parallelism. The SIMD
instructions sets covered include SSE, AVX, AVX2, FMA4, IMCI and AVX512. Recent
releases expanded this support to include GPU offloading.
* -
- `MILC <https://github.com/amd/InfinityHub-CI/tree/main/milc/>`_
- The MILC Code is a set of research codes developed by MIMD Lattice Computation
@@ -237,12 +229,18 @@ Ubuntu versions.
of these applications.
* - Tools and libraries
- `ROCm with GPU-aware MPI container <https://github.com/amd/InfinityHub-CI/tree/main/base-gpu-mpi-rocm-docker>`_
- `AMD ROCm with OpenMPI container <https://github.com/amd/InfinityHub-CI/tree/main/base-gpu-mpi-rocm-docker>`_
- Base container for GPU-aware MPI with ROCm for HPC applications. This
project provides a boilerplate for building and running a Docker
container with ROCm supporting GPU-aware MPI implementations using
OpenMPI or UCX.
* -
- `AMD ROCm with MPICH container <https://github.com/amd/InfinityHub-CI/tree/main/base-mpich-rocm-docker>`_
- Base container for GPU-aware MPI with ROCm for HPC applications. This
project provides a boilerplate for building and running a Docker
container with ROCm supporting GPU-aware MPI implementations using MPICH.
* -
- `Kokkos <https://github.com/amd/InfinityHub-CI/tree/main/kokkos>`_
- Kokkos is a programming model in C++ for writing performance portable


@@ -38,5 +38,5 @@ The variable parsing stops when a syntax error occurs. The erroneous set and the
These environment variables only affect ROCm software, not graphics applications.
Not all CU configurations are valid on all devices. For example, on devices where two CUs can be combined into a WGP (for kernels running in WGP mode), it's not valid to disable only a single CU in a WGP. For more information about what to expect when disabling CUs, see the `Exploring AMD GPU Scheduling Details by Experimenting With “Worst Practices” <https://www.cs.unc.edu/~otternes/papers/rtsj2022.pdf>`_ paper.
Not all CU configurations are valid on all devices. For example, on devices where two CUs can be combined into a WGP (for kernels running in WGP mode), it's not valid to disable only a single CU in a WGP.
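As an illustration, assuming the mask variable described on this page is ``HSA_CU_MASK``
(a hedged sketch; the device index and CU count vary by system), a valid mask on such a
device disables CUs in whole-WGP pairs:
.. code-block:: shell
# Restrict GPU 0 to CUs 0-31 -- complete WGPs, so the mask remains valid in WGP mode
export HSA_CU_MASK=0:0-31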


@@ -12,8 +12,7 @@ accelerators. They include detailed instructions on system settings and
application tuning suggestions to help you fully leverage the capabilities of
these accelerators, thereby achieving optimal performance.
* :doc:`../../rocm-for-ai/inference/vllm-benchmark`
* :doc:`../../rocm-for-ai/inference-optimization/workload`
* :doc:`/how-to/rocm-for-ai/inference-optimization/workload`
* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_


@@ -215,9 +215,9 @@ sphinx==8.1.3
# sphinx-copybutton
# sphinx-design
# sphinx-external-toc
# sphinx-last-updated-by-git
# sphinx-notfound-page
# sphinx-reredirects
# sphinx-sitemap
# sphinxcontrib-datatemplates
# sphinxcontrib-runcmd
sphinx-book-theme==1.1.4
@@ -228,11 +228,13 @@ sphinx-design==0.6.1
# via rocm-docs-core
sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-last-updated-by-git==0.3.8
# via sphinx-sitemap
sphinx-notfound-page==1.1.0
# via rocm-docs-core
sphinx-reredirects==0.1.6
# via -r requirements.in
sphinx-sitemap==2.6.0
sphinx-sitemap==2.7.2
# via -r requirements.in
sphinxcontrib-applehelp==2.0.0
# via sphinx
@@ -282,7 +284,7 @@ typing-extensions==4.14.0
# pygithub
# referencing
# sqlalchemy
urllib3==2.4.0
urllib3==2.5.0
# via
# pygithub
# requests


@@ -98,7 +98,7 @@ System Management
.. csv-table::
:header: "Component", "Description"
":doc:`AMD SMI <amdsmi:index>`", "C library for Linux that provides a user space interface for applications to monitor and control AMD devices"
":doc:`AMD SMI <amdsmi:index>`", "System management interface to control AMD GPU settings, monitor performance, and retrieve device and process information"
":doc:`ROCm Data Center Tool <rdc:index>`", "Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments"
":doc:`rocminfo <rocminfo:index>`", "Reports system information"
":doc:`ROCm SMI <rocm_smi_lib:index>`", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"