Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-10 07:08:08 -05:00)

Compare commits: amd/daniel ... bdevorem/a (14 commits)

54ceaff4d4
89b8578ece
00b0d9430e
14acec6000
c154b7e0a3
9f5cd4500c
51e7d9550f
55d0a88ec5
7ee22790ce
ec05312de7
39e7ccd3c5
c4135ab541
dd56fd4d3a
3c71bb25e8
@@ -210,6 +210,21 @@ jobs:
          gpuTarget: ${{ job.target }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: 'git clone protobuf'
      inputs:
        targetType: inline
        script: git clone -b "v30.0" https://github.com/protocolbuffers/protobuf --recurse-submodules
        workingDirectory: $(Agent.BuildDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        cmakeBuildDir: $(Agent.BuildDirectory)/protobuf/build
        cmakeSourceDir: $(Agent.BuildDirectory)/protobuf
        extraBuildFlags: >-
          -DCMAKE_POSITION_INDEPENDENT_CODE=On
          -Dprotobuf_BUILD_TESTS=Off
          -DCMAKE_POLICY_VERSION_MINIMUM=3.5
          -DCMAKE_BUILD_TYPE=Release
    - task: CMake@1
      displayName: MIGraphXTest CMake Flags
      inputs:
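The steps above run inside Azure Pipelines; as a rough local sketch of the same protobuf build (assuming git and cmake on PATH — the paths here are illustrative, not the pipeline's):

    git clone -b "v30.0" https://github.com/protocolbuffers/protobuf --recurse-submodules
    # Same flags the pipeline passes to build-cmake.yml
    cmake -S protobuf -B protobuf/build \
      -DCMAKE_POSITION_INDEPENDENT_CODE=On \
      -Dprotobuf_BUILD_TESTS=Off \
      -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
      -DCMAKE_BUILD_TYPE=Release
    cmake --build protobuf/build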
@@ -158,6 +158,7 @@ jobs:
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_CLIENTS_TESTS=ON
          -DBUILD_USE_LOCAL_TENSILE=OFF
          -GNinja
        ${{ if ne(parameters.sparseCheckoutDir, '') }}:
          cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
@@ -8,6 +8,22 @@ parameters:
- name: checkoutRef
  type: string
  default: ''
# monorepo related parameters
- name: sparseCheckoutDir
  type: string
  default: ''
- name: triggerDownstreamJobs
  type: boolean
  default: false
- name: downstreamAggregateNames
  type: string
  default: ''
- name: buildDependsOn
  type: object
  default: null
- name: unifiedBuild
  type: boolean
  default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -70,6 +86,10 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
      - ${{ each build in parameters.buildDependsOn }}:
        - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -94,6 +114,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
@@ -108,6 +129,8 @@ jobs:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -115,6 +138,7 @@ jobs:
        extraBuildFlags: >-
          -DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
          -DENABLE_LDCONFIG=OFF
          -DUSE_PROF_API=1
@@ -122,10 +146,13 @@ jobs:
        multithreadFlag: -- -j32
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -139,63 +166,68 @@ jobs:
      - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
      - ROCM_PATH:::/home/user/workspace/rocm

- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
        eq(${{ parameters.aggregatePipeline }}, False)
      )
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
    - name: LD_LIBRARY_PATH
      value: $(Agent.BuildDirectory)/rocm/lib/rocprofiler:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1/test:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests
    pool: ${{ job.target }}_test_pool
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: rocprofilerV1
        testDir: $(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1
        testExecutable: ./run.sh
        testParameters: ''
        testPublishResults: false
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: rocprofilerV2
        testDir: $(Agent.BuildDirectory)/rocm
        testExecutable: share/rocprofiler/tests/runUnitTests
        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        environment: test
        gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
      - group: common
      - template: /.azuredevops/variables-global.yml
      - name: ROCM_PATH
        value: $(Agent.BuildDirectory)/rocm
      - name: LD_LIBRARY_PATH
        value: $(Agent.BuildDirectory)/rocm/lib/rocprofiler:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1/test:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests
      pool: ${{ job.target }}_test_pool
      workspace:
        clean: all
      steps:
      - checkout: none
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          packageManager: ${{ job.packageManager }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
          preTargetFilter: ${{ parameters.componentName }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
        parameters:
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
        parameters:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmDependencies }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: rocprofilerV1
          testDir: $(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1
          testExecutable: ./run.sh
          testParameters: ''
          testPublishResults: false
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: rocprofilerV2
          testDir: $(Agent.BuildDirectory)/rocm
          testExecutable: share/rocprofiler/tests/runUnitTests
          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
          environment: test
          gpuTarget: ${{ job.target }}
@@ -203,8 +203,8 @@ parameters:
    developBranch: develop
    hasGpuTarget: true
  rocprofiler:
    pipelineId: 143
    developBranch: amd-staging
    pipelineId: 329
    developBranch: develop
    hasGpuTarget: true
  rocprofiler-compute:
    pipelineId: 257
@@ -28,13 +28,13 @@ variables:
- name: GFX90A_TEST_POOL
  value: gfx90a_test_pool
- name: LATEST_RELEASE_VERSION
  value: 6.4.2
  value: 6.4.3
- name: REPO_RADEON_VERSION
  value: 6.4.2
  value: 6.4.3
- name: NEXT_RELEASE_VERSION
  value: 7.0.0
- name: LATEST_RELEASE_TAG
  value: rocm-6.4.2
  value: rocm-6.4.3
- name: DOCKER_SKIP_GFX
  value: gfx90a
- name: COMPOSABLE_KERNEL_PIPELINE_ID
@@ -5,6 +5,7 @@ ACEs
ACS
AccVGPR
AccVGPRs
AITER
ALU
AllReduce
AMD
@@ -31,9 +31,9 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,,,
@@ -242,7 +242,9 @@ Expand for full historical view of:
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#verl_compat] verl is only supported on ROCm 6.2.0.
.. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
.. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
.. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
.. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
docs/conf.py
@@ -9,17 +9,21 @@ import shutil
import sys
from pathlib import Path

shutil.copy2("../RELEASE.md", "./about/release-notes.md")
shutil.copy2("../CHANGELOG.md", "./release/changelog.md")
gh_release_path = os.path.join("..", "RELEASE.md")
gh_changelog_path = os.path.join("..", "CHANGELOG.md")
sphinx_release_path = os.path.join("about", "release-notes.md")
sphinx_changelog_path = os.path.join("release", "changelog.md")
shutil.copy2(gh_release_path, sphinx_release_path)
shutil.copy2(gh_changelog_path, sphinx_changelog_path)

# Mark the consolidated changelog as orphan to prevent Sphinx from warning about missing toctree entries
with open("./release/changelog.md", "r+") as file:
with open(sphinx_changelog_path, "r+", encoding="utf-8") as file:
    content = file.read()
    file.seek(0)
    file.write(":orphan:\n" + content)

# Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
with open("./release/changelog.md", "r") as file:
with open(sphinx_changelog_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

modified_lines = []
@@ -57,11 +61,14 @@ with open("./release/changelog.md", "r") as file:

file.close()

with open("./release/changelog.md", 'w') as file:
with open(sphinx_changelog_path, "w", encoding="utf-8") as file:
    file.writelines(modified_lines)

os.system("mkdir -p ../_readthedocs/html/downloads")
os.system("cp compatibility/compatibility-matrix-historical-6.0.csv ../_readthedocs/html/downloads/compatibility-matrix-historical-6.0.csv")
matrix_path = os.path.join("compatibility", "compatibility-matrix-historical-6.0.csv")
rtd_path = os.path.join("..", "_readthedocs", "html", "downloads")
if not os.path.exists(rtd_path):
    os.makedirs(rtd_path)
shutil.copy2(matrix_path, rtd_path)

latex_engine = "xelatex"
latex_elements = {
@@ -147,6 +154,8 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
@@ -28,13 +28,31 @@ See the [Python requirements file](https://github.com/ROCm/ROCm/blob/develop/doc

Use the Python Virtual Environment (`venv`) and run the following commands from the project root:

::::{tab-set}
:::{tab-item} Linux and WSL
:sync: linux

```sh
python3 -mvenv .venv

.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
```

:::
:::{tab-item} Windows
:sync: windows

```powershell
python -mvenv .venv

.venv\Scripts\python.exe -m pip install -r docs/sphinx/requirements.txt
.venv\Scripts\python.exe -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
```

:::
::::

Navigate to `_build/html/index.html` and open this file in a web browser.

## Visual Studio Code
@@ -0,0 +1,163 @@
vllm_benchmark:
  unified_docker:
    latest:
      # TODO: update me
      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
      rocm_version: 6.4.1
      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
      pytorch_version: 2.7.0+gitf717b2a
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
        - model: Llama 3.1 8B
          mad_tag: pyt_vllm_llama-3.1-8b
          model_repo: meta-llama/Llama-3.1-8B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-8B
          precision: float16
        - model: Llama 3.1 70B
          mad_tag: pyt_vllm_llama-3.1-70b
          model_repo: meta-llama/Llama-3.1-70B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
          precision: float16
        - model: Llama 3.1 405B
          mad_tag: pyt_vllm_llama-3.1-405b
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 2 7B
          mad_tag: pyt_vllm_llama-2-7b
          model_repo: meta-llama/Llama-2-7b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
          precision: float16
        - model: Llama 3.1 8B FP8
          mad_tag: pyt_vllm_llama-3.1-8b_fp8
          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 70B FP8
          mad_tag: pyt_vllm_llama-3.1-70b_fp8
          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 405B FP8
          mad_tag: pyt_vllm_llama-3.1-405b_fp8
          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
          precision: float8
    - group: Mistral AI
      tag: mistral
      models:
        - model: Mixtral MoE 8x7B
          mad_tag: pyt_vllm_mixtral-8x7b
          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x22B
          mad_tag: pyt_vllm_mixtral-8x22b
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mistral 7B
          mad_tag: pyt_vllm_mistral-7b
          model_repo: mistralai/Mistral-7B-Instruct-v0.3
          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mixtral MoE 8x22B FP8
          mad_tag: pyt_vllm_mixtral-8x22b_fp8
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mistral 7B FP8
          mad_tag: pyt_vllm_mistral-7b_fp8
          model_repo: amd/Mistral-7B-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: Qwen2 7B
          mad_tag: pyt_vllm_qwen2-7b
          model_repo: Qwen/Qwen2-7B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
          precision: float16
        - model: Qwen2 72B
          mad_tag: pyt_vllm_qwen2-72b
          model_repo: Qwen/Qwen2-72B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
          precision: float16
        - model: QwQ-32B
          mad_tag: pyt_vllm_qwq-32b
          model_repo: Qwen/QwQ-32B
          url: https://huggingface.co/Qwen/QwQ-32B
          precision: float16
          tunableop: true
    - group: Databricks DBRX
      tag: dbrx
      models:
        - model: DBRX Instruct
          mad_tag: pyt_vllm_dbrx-instruct
          model_repo: databricks/dbrx-instruct
          url: https://huggingface.co/databricks/dbrx-instruct
          precision: float16
        - model: DBRX Instruct FP8
          mad_tag: pyt_vllm_dbrx_fp8
          model_repo: amd/dbrx-instruct-FP8-KV
          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
          precision: float8
    - group: Google Gemma
      tag: gemma
      models:
        - model: Gemma 2 27B
          mad_tag: pyt_vllm_gemma-2-27b
          model_repo: google/gemma-2-27b
          url: https://huggingface.co/google/gemma-2-27b
          precision: float16
    - group: Cohere
      tag: cohere
      models:
        - model: C4AI Command R+ 08-2024
          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
          precision: float16
        - model: C4AI Command R+ 08-2024 FP8
          mad_tag: pyt_vllm_command-r-plus_fp8
          model_repo: amd/c4ai-command-r-plus-FP8-KV
          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
          precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
        - model: DeepSeek MoE 16B
          mad_tag: pyt_vllm_deepseek-moe-16b-chat
          model_repo: deepseek-ai/deepseek-moe-16b-chat
          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
          precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
        - model: Phi-4
          mad_tag: pyt_vllm_phi-4
          model_repo: microsoft/phi-4
          url: https://huggingface.co/microsoft/phi-4
    - group: TII Falcon
      tag: falcon
      models:
        - model: Falcon 180B
          mad_tag: pyt_vllm_falcon-180b
          model_repo: tiiuae/falcon-180B
          url: https://huggingface.co/tiiuae/falcon-180B
          precision: float16
@@ -2,11 +2,11 @@ vllm_benchmark:
  unified_docker:
    latest:
      # TODO: update me
      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
      rocm_version: 6.4.1
      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
      pytorch_version: 2.7.0+gitf717b2a
      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
@@ -27,11 +27,6 @@ vllm_benchmark:
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 2 7B
          mad_tag: pyt_vllm_llama-2-7b
          model_repo: meta-llama/Llama-2-7b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
@@ -65,11 +60,6 @@ vllm_benchmark:
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mistral 7B
          mad_tag: pyt_vllm_mistral-7b
          model_repo: mistralai/Mistral-7B-Instruct-v0.3
          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
@@ -80,72 +70,15 @@ vllm_benchmark:
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mistral 7B FP8
          mad_tag: pyt_vllm_mistral-7b_fp8
          model_repo: amd/Mistral-7B-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: Qwen2 7B
          mad_tag: pyt_vllm_qwen2-7b
          model_repo: Qwen/Qwen2-7B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
          precision: float16
        - model: Qwen2 72B
          mad_tag: pyt_vllm_qwen2-72b
          model_repo: Qwen/Qwen2-72B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
          precision: float16
        - model: QwQ-32B
          mad_tag: pyt_vllm_qwq-32b
          model_repo: Qwen/QwQ-32B
          url: https://huggingface.co/Qwen/QwQ-32B
          precision: float16
          tunableop: true
    - group: Databricks DBRX
      tag: dbrx
      models:
        - model: DBRX Instruct
          mad_tag: pyt_vllm_dbrx-instruct
          model_repo: databricks/dbrx-instruct
          url: https://huggingface.co/databricks/dbrx-instruct
          precision: float16
        - model: DBRX Instruct FP8
          mad_tag: pyt_vllm_dbrx_fp8
          model_repo: amd/dbrx-instruct-FP8-KV
          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
          precision: float8
    - group: Google Gemma
      tag: gemma
      models:
        - model: Gemma 2 27B
          mad_tag: pyt_vllm_gemma-2-27b
          model_repo: google/gemma-2-27b
          url: https://huggingface.co/google/gemma-2-27b
          precision: float16
    - group: Cohere
      tag: cohere
      models:
        - model: C4AI Command R+ 08-2024
          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
          precision: float16
        - model: C4AI Command R+ 08-2024 FP8
          mad_tag: pyt_vllm_command-r-plus_fp8
          model_repo: amd/c4ai-command-r-plus-FP8-KV
          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
          precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
        - model: DeepSeek MoE 16B
          mad_tag: pyt_vllm_deepseek-moe-16b-chat
          model_repo: deepseek-ai/deepseek-moe-16b-chat
          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
          precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
@@ -153,11 +86,3 @@ vllm_benchmark:
          mad_tag: pyt_vllm_phi-4
          model_repo: microsoft/phi-4
          url: https://huggingface.co/microsoft/phi-4
    - group: TII Falcon
      tag: falcon
      models:
        - model: Falcon 180B
          mad_tag: pyt_vllm_falcon-180b
          model_repo: tiiuae/falcon-180B
          url: https://huggingface.co/tiiuae/falcon-180B
          precision: float16
@@ -14,7 +14,7 @@ vLLM inference performance testing

   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker:
.. _vllm-benchmark-unified-docker-702:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml

@@ -77,7 +77,7 @@ vLLM inference performance testing
       </div>
     </div>

.. _vllm-benchmark-vllm:
.. _vllm-benchmark-vllm-702:

{% for model_group in model_groups %}
{% for model in model_group.models %}
@@ -159,7 +159,7 @@ vLLM inference performance testing
Once the setup is complete, choose between two options to reproduce the
benchmark results:

.. _vllm-benchmark-mad:
.. _vllm-benchmark-mad-702:

{% for model_group in model_groups %}
{% for model in model_group.models %}
@@ -0,0 +1,450 @@
:orphan:

.. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

**********************************
vLLM inference performance testing
**********************************

.. caution::

   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker-715:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
   accelerators and includes the following components:

   .. list-table::
      :header-rows: 1

      * - Software component
        - Version

      * - `ROCm <https://github.com/ROCm/ROCm>`__
        - {{ unified_docker.rocm_version }}

      * - `vLLM <https://docs.vllm.ai/en/latest>`__
        - {{ unified_docker.vllm_version }}

      * - `PyTorch <https://github.com/ROCm/pytorch>`__
        - {{ unified_docker.pytorch_version }}

      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
        - {{ unified_docker.hipblaslt_version }}

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements>` for
   MI300X series accelerators.

What's new
==========

The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.

* The ``--compilation-config-parameter`` is no longer required, as its options are now enabled by default.
  This parameter has been removed from the benchmarking script.

* Resolved the Llama 3.1 405B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
  This parameter has been removed from the benchmarking script.

* Fixed a ``+rms_norm`` custom kernel issue.

* Added quick reduce functionality; a sketch follows this list. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, and ``INT4``.

* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
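As a minimal sketch (assuming a bash shell inside the benchmarking container), the quick reduce mode named above is enabled before launching vLLM:

.. code-block:: shell

   # Modes per the release note above: FP, INT8, INT6, INT4
   export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP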
Supported models
================

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   .. _vllm-benchmark-available-models-715:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
   documentation might vary by model -- select one to get started.

   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            {% endfor %}
          </div>
        </div>

        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            {% set models = model_group.models %}
            {% for model in models %}
            {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
            {% endfor %}
            {% endfor %}
          </div>
        </div>
      </div>

   .. _vllm-benchmark-vllm-715:

   {% for model_group in model_groups %}
   {% for model in model_group.models %}

   .. container:: model-doc {{model.mad_tag}}

      .. note::

         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.

   {% endfor %}
   {% endfor %}

   .. note::

      vLLM is a toolkit and library for LLM inference and serving. AMD implements
      high-performance custom kernels and modules in vLLM to enhance performance.
      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
      more information.

.. _vllm-benchmark-performance-measurements-715:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.

.. important::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the latest version of this inference benchmarking environment.
   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   Pull the Docker image
   =====================

   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull {{ unified_docker.pull_tag }}

   Benchmarking
   ============

   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

   .. _vllm-benchmark-mad-715:

   {% for model_group in model_groups %}
   {% for model in model_group.models %}

   .. container:: model-doc {{model.mad_tag}}

      .. tab-set::

         .. tab-item:: MAD-integrated benchmarking

            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.

               .. code-block:: shell

                  git clone https://github.com/ROCm/MAD
                  cd MAD
                  pip install -r requirements.txt

            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

               .. code-block:: shell

                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
                  madengine run \
                    --tags {{model.mad_tag}} \
                    --keep-model-dir \
                    --live-output \
                    --timeout 28800

               MAD launches a Docker container with the name
               ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
               model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

               Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
               to collect latency and throughput performance data, you can also change the benchmarking
               parameters. See the standalone benchmarking tab for more information.

            {% if model.tunableop %}

            .. note::

               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
               TunableOp automatically explores different implementations and configurations of certain PyTorch
               operators to find the fastest one for your hardware.

               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
               (see
               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
               To enable it, include the ``--tunableop on`` argument in your
               run.

               Enabling TunableOp triggers a two-pass run -- a warm-up followed
               by the performance-collection run.
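               For example, a minimal sketch combining the ``madengine`` command from
               step 2 with the ``--tunableop on`` flag (QwQ-32B is the only model in
               this guide's model list with ``tunableop`` set):

               .. code-block:: shell

                  # Two-pass run: a TunableOp warm-up, then performance collection
                  madengine run \
                    --tags pyt_vllm_qwq-32b \
                    --tunableop on \
                    --keep-model-dir \
                    --live-output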
            {% endif %}

         .. tab-item:: Standalone benchmarking

            .. rubric:: Download the Docker image and required scripts

            1. Run the vLLM benchmark tool independently by starting the
               `Docker container <{{ unified_docker.docker_hub_url }}>`_
               as shown in the following snippet.

               .. code-block:: shell

                  docker pull {{ unified_docker.pull_tag }}
                  docker run -it \
                    --device=/dev/kfd \
                    --device=/dev/dri \
                    --group-add video \
                    --shm-size 16G \
                    --security-opt seccomp=unconfined \
                    --security-opt apparmor=unconfined \
                    --cap-add=SYS_PTRACE \
                    -v $(pwd):/workspace \
                    --env HUGGINGFACE_HUB_CACHE=/workspace \
                    --name test \
                    {{ unified_docker.pull_tag }}

            2. In the Docker container, clone the ROCm MAD repository and navigate to the
               benchmark scripts directory at ``~/MAD/scripts/vllm``.

               .. code-block:: shell

                  git clone https://github.com/ROCm/MAD
                  cd MAD/scripts/vllm

            3. To start the benchmark, use the following command with the appropriate options.

               .. dropdown:: Benchmark options
                  :open:

                  .. list-table::
                     :header-rows: 1
                     :align: center

                     * - Name
                       - Options
                       - Description

                     * - ``$test_option``
                       - latency
                       - Measure decoding token latency

                     * -
                       - throughput
                       - Measure token generation throughput

                     * -
                       - all
                       - Measure both throughput and latency

                     * - ``$num_gpu``
                       - 1 or 8
                       - Number of GPUs

                     * - ``$datatype``
                       - ``float16`` or ``float8``
                       - Data type

                  The input sequence length, output sequence length, and tensor parallel (TP) are
                  already configured. You don't need to specify them with this script.

               Command:

               .. code-block::

                  ./vllm_benchmark_report.sh \
                    -s $test_option \
                    -m {{model.model_repo}} \
                    -g $num_gpu \
                    -d {{model.precision}}

               .. note::

                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.

                  If you encounter the following error, pass your access-authorized Hugging
                  Face token to the gated models.

                  .. code-block::

                     OSError: You are trying to access a gated repo.

                     # pass your HF_TOKEN
                     export HF_TOKEN=$your_personal_hf_token

            .. rubric:: Benchmarking examples

            Here are some examples of running the benchmark with various options:

            * Latency benchmark

              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

              .. code-block::

                 ./vllm_benchmark_report.sh \
                   -s latency \
                   -m {{model.model_repo}} \
                   -g 8 \
                   -d {{model.precision}}

              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.

            * Throughput benchmark

              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

              .. code-block:: shell

                 ./vllm_benchmark_report.sh \
                   -s throughput \
                   -m {{model.model_repo}} \
                   -g 8 \
                   -d {{model.precision}}

              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.

            .. raw:: html

               <style>
                 mjx-container[jax="CHTML"][display="true"] {
                   text-align: left;
                   margin: 0;
                 }
               </style>

            .. note::

               Throughput is calculated as:

               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time

               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
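               As an illustrative worked example (made-up numbers, not measured
               results): 32 requests with input length 128 and output length 128
               that complete in 16 seconds give

               - .. math:: throughput\_tot = 32 \times (128 + 128) / 16 = 512 \text{ tokens/s}

               - .. math:: throughput\_gen = 32 \times 128 / 16 = 256 \text{ tokens/s}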
   {% endfor %}
   {% endfor %}

Advanced usage
==============

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.

Reproducing the Docker image
----------------------------

To reproduce this ROCm/vLLM Docker image release, follow these steps:

1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.

   .. code-block:: shell

      git clone https://github.com/ROCm/vllm.git

2. Check out the specific release commit.

   .. code-block:: shell

      cd vllm
      git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f

3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

   .. code-block:: shell

      docker build -f docker/Dockerfile.rocm -t vllm-rocm .

Known issues and workarounds
============================

AITER does not support FP8 KV cache yet.

Further reading
===============

- To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

- To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

- To learn how to fine-tune LLMs and optimize inference, see
  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

- For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
     - Components
     - Resources

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
       (latest)
     -
       * ROCm 6.4.1
       * vLLM 0.10.0
       * PyTorch 2.7.0
     -
       * :doc:`Documentation <../vllm>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
     -
       * ROCm 6.4.1
       * vLLM 0.9.1
       * PyTorch 2.7.0
     -
       * :doc:`Documentation <../vllm>`
       * :doc:`Documentation <vllm-0.9.1-20250715>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
@@ -7,7 +7,7 @@
vLLM inference performance testing
**********************************

.. _vllm-benchmark-unified-docker:
.. _vllm-benchmark-unified-docker-812:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -47,17 +47,11 @@ What's new

The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.

* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
  This parameter has been removed from the benchmarking script.
* Upgraded to vLLM v0.10.

* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
  This parameter has been removed from the benchmarking script.
* FP8 KV cache support via AITER; a sketch of enabling the AITER path follows this list.

* Fixed a ``+rms_norm`` custom kernel issue.

* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.

* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
* Full graph capture support via AITER.
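As a sketch only -- the variable name below is an assumption taken from upstream vLLM's ROCm support, not from this changelog -- the AITER path is typically toggled with an environment variable before starting vLLM:

.. code-block:: shell

   # Assumed upstream vLLM env var; verify against the image's vllm/envs.py
   # before relying on it.
   export VLLM_ROCM_USE_AITER=1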
Supported models
================
@@ -67,7 +61,7 @@ Supported models
   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   .. _vllm-benchmark-available-models:
   .. _vllm-benchmark-available-models-812:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
@@ -102,7 +96,7 @@ Supported models
       </div>
     </div>

   .. _vllm-benchmark-vllm:
   .. _vllm-benchmark-vllm-812:

   {% for model_group in model_groups %}
   {% for model in model_group.models %}
@@ -124,14 +118,14 @@ Supported models
      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
      more information.

.. _vllm-benchmark-performance-measurements:
.. _vllm-benchmark-performance-measurements-812:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
page provides reference throughput and serving measurements for inferencing popular AI models.

.. important::

@@ -176,7 +170,7 @@ system's configuration.
Once the setup is complete, choose between two options to reproduce the
benchmark results:

.. _vllm-benchmark-mad:
.. _vllm-benchmark-mad-812:

{% for model_group in model_groups %}
{% for model in model_group.models %}
@@ -209,12 +203,15 @@ system's configuration.
        --timeout 28800

     MAD launches a Docker container with the name
     ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
     model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
     ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
     model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
     and ``{{ model.mad_tag }}_serving.csv``.

     Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
     to collect latency and throughput performance data, you can also change the benchmarking
     parameters. See the standalone benchmarking tab for more information.
     Although the :ref:`available models
     <vllm-benchmark-available-models>` are preconfigured to collect
     offline throughput and online serving performance data, you can
     also change the benchmarking parameters. See the standalone
     benchmarking tab for more information.

  {% if model.tunableop %}

@@ -224,14 +221,12 @@ system's configuration.
     TunableOp automatically explores different implementations and configurations of certain PyTorch
     operators to find the fastest one for your hardware.

     By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
     (see
     `<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
     To enable it, include the ``--tunableop on`` argument in your
     run.
     By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
     `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
     the ``--tunableop on`` argument in your run.

     Enabling TunableOp triggers a two-pass run -- a warm-up followed
     by the performance-collection run.
     Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
     performance-collection run.

  {% endif %}
@@ -269,6 +264,13 @@ system's configuration.

3. To start the benchmark, use the following command with the appropriate options.

   .. code-block::

      ./run.sh \
        --config $CONFIG_CSV \
        --model_repo {{ model.model_repo }} \
        <overrides>

   .. dropdown:: Benchmark options
      :open:
@@ -280,42 +282,40 @@ system's configuration.
         - Options
         - Description

       * - ``$test_option``
         - latency
         - Measure decoding token latency
       * - ``--config``
         - ``configs/default.csv``
         - Run configs from the CSV for the chosen model repo and benchmark.

       * -
         - throughput
         - Measure token generation throughput
         - ``configs/extended.csv``
         -

       * -
         - all
         - Measure both throughput and latency
         - ``configs/performance.csv``
         -

       * - ``$num_gpu``
         - 1 or 8
         - Number of GPUs
       * - ``--benchmark``
         - ``throughput``
         - Measure offline end-to-end throughput.

       * - ``$datatype``
         - ``float16`` or ``float8``
         - Data type
       * -
         - ``serving``
         - Measure online serving performance.

       * -
         - ``all``
         - Measure both throughput and serving.

       * - `<overrides>`
         - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
         - Additional overrides to the config CSV.

    The input sequence length, output sequence length, and tensor parallel (TP) are
    already configured. You don't need to specify them with this script.

    Command:

    .. code-block::

       ./vllm_benchmark_report.sh \
         -s $test_option \
         -m {{model.model_repo}} \
         -g $num_gpu \
         -d {{model.precision}}

    .. note::

       For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
       For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.

       If you encounter the following error, pass your access-authorized Hugging
       Face token to the gated models.
@@ -331,33 +331,33 @@ system's configuration.

Here are some examples of running the benchmark with various options:

* Latency benchmark

  Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

  .. code-block::

     ./vllm_benchmark_report.sh \
       -s latency \
       -m {{model.model_repo}} \
       -g 8 \
       -d {{model.precision}}

  Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.

* Throughput benchmark

  Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

  .. code-block:: shell

     ./vllm_benchmark_report.sh \
       -s throughput \
       -m {{model.model_repo}} \
       -g 8 \
       -d {{model.precision}}
  .. code-block::

     export MAD_MODEL_NAME={{ model.mad_tag }}
     ./run.sh \
       --config configs/default.csv \
       --model_repo {{model.model_repo}} \
       --benchmark throughput

  Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
  Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.

* Serving benchmark

  Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

  .. code-block::

     export MAD_MODEL_NAME={{ model.mad_tag }}
     ./run.sh \
       --config configs/default.csv \
       --model_repo {{model.model_repo}} \
       --benchmark serving

  Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.

.. raw:: html
@@ -400,7 +400,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
   .. code-block:: shell

      cd vllm
      git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978

3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

@@ -408,11 +408,6 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:

      docker build -f docker/Dockerfile.rocm -t vllm-rocm .

Known issues and workarounds
============================

AITER does not support FP8 KV cache yet.

Further reading
===============
@@ -285,7 +285,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
   - Radeon AI PRO R9700
   - RDNA4
   - gfx1201
   - 16
   - 32
   - 64
   - 32 or 64
   - 128
@@ -234,7 +234,7 @@ sphinx-notfound-page==1.1.0
    # via rocm-docs-core
sphinx-reredirects==0.1.6
    # via -r requirements.in
sphinx-sitemap==2.7.2
sphinx-sitemap==2.8.0
    # via -r requirements.in
sphinxcontrib-applehelp==2.0.0
    # via sphinx