mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-20 20:18:15 -05:00
Compare commits
8 Commits
deep-710
...
amd/jayhaw
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7ec34dee52 | ||
|
|
674dc355e4 | ||
|
|
c7f3a56811 | ||
|
|
0107fa731e | ||
|
|
a87ec360e1 | ||
|
|
7215e1e8c7 | ||
|
|
e4a59d8c66 | ||
|
|
8108fe7275 |
@@ -1,10 +1,29 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: rocWMMA
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -66,7 +85,11 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: rocWMMA_build_${{ job.target }}
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -81,6 +104,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
@@ -102,9 +126,12 @@ jobs:
|
||||
# gfx1030 not supported in documentation
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
@@ -112,43 +139,45 @@ jobs:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rocWMMA_test_${{ job.target }}
|
||||
timeoutInMinutes: 270
|
||||
dependsOn: rocWMMA_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rocWMMA
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.target }}
|
||||
timeoutInMinutes: 270
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -21,11 +21,25 @@ parameters:
|
||||
- libtbb-dev
|
||||
- libtiff-dev
|
||||
- libva-amdgpu-dev
|
||||
- libva2-amdgpu
|
||||
- mesa-amdgpu-va-drivers
|
||||
- libavcodec-dev
|
||||
- libavformat-dev
|
||||
- libavutil-dev
|
||||
- ninja-build
|
||||
- python3-pip
|
||||
- protobuf-compiler
|
||||
- libprotoc-dev
|
||||
- name: pipModules
|
||||
type: object
|
||||
default:
|
||||
- future==1.0.0
|
||||
- pytz==2022.1
|
||||
- numpy==1.23
|
||||
- google==3.0.0
|
||||
- protobuf==3.12.4
|
||||
- onnx==1.12.0
|
||||
- nnef==1.0.7
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
@@ -33,6 +47,7 @@ parameters:
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- half
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
@@ -47,6 +62,7 @@ parameters:
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rccl
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
@@ -69,6 +85,7 @@ parameters:
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- half
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
@@ -83,6 +100,7 @@ parameters:
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rccl
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
|
||||
@@ -131,6 +131,11 @@ jobs:
|
||||
script: |
|
||||
USER_BASE=$(python3 -m site --user-base)
|
||||
echo "##vso[task.prependpath]$USER_BASE/bin"
|
||||
- task: Bash@3
|
||||
displayName: libelf hack
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: cp $(Agent.BuildDirectory)/s/cmake/Modules/Findlibelf.cmake $(Agent.BuildDirectory)/s/cmake/Modules/FindLibElf.cmake
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
|
||||
@@ -263,7 +263,7 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
rocWMMA:
|
||||
pipelineId: 109
|
||||
pipelineId: 370
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
rpp:
|
||||
|
||||
11
CHANGELOG.md
11
CHANGELOG.md
@@ -141,7 +141,6 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
|
||||
#### Optimized
|
||||
|
||||
* `TF32` kernel optimization for the AMD Instinct MI355X GPU to enhance training and inference efficiency.
|
||||
* Meta Model optimization for the AMD Instinct MI350X GPU to enable better performance across transformer-based models.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
@@ -579,6 +578,8 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
|
||||
|
||||
* MI300A/X L2-Fabric 64B read counter may display negative values - The rocprof-compute metric 17.6.1 (Read 64B) can report negative values due to incorrect calculation when TCC_BUBBLE_sum + TCC_EA0_RDREQ_32B_sum exceeds TCC_EA0_RDREQ_sum.
|
||||
* A workaround has been implemented using max(0, calculated_value) to prevent negative display values while the root cause is under investigation.
|
||||
* The profile mode crashes when `--format-rocprof-output json` is selected.
|
||||
* As a workaround, this option should either not be provided or should be set to `csv` instead of `json`. This issue does not affect the profiling results since both `csv` and `json` output formats lead to the same profiling data.
|
||||
|
||||
### **ROCm Data Center Tool** (1.2.0)
|
||||
|
||||
@@ -694,7 +695,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
|
||||
* Updated error handling for several rocRAND unit tests to accommodate the new `hipGetLastError` behavior that was introduced in ROCm 7.0.
|
||||
As of ROCm 7.0, the internal error state is cleared on each call to `hipGetLastError` rather than on every HIP API call.
|
||||
|
||||
### **rocSOLVER** (3.30.0)
|
||||
### **rocSOLVER** (3.31.0)
|
||||
|
||||
#### Added
|
||||
|
||||
@@ -1044,10 +1045,6 @@ for a complete overview of this release.
|
||||
|
||||
- `amd-smi monitor` on Linux Guest systems triggers an attribute error.
|
||||
|
||||
```{note}
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.0/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
||||
```
|
||||
|
||||
### **Composable Kernel** (1.1.0)
|
||||
|
||||
#### Added
|
||||
@@ -2364,7 +2361,7 @@ The previous default accumulator types could lead to situations in which unexpec
|
||||
|
||||
#### Added
|
||||
|
||||
* Hybrid computation support for existing routines: STEQR
|
||||
* Hybrid computation support for existing STEQR routines.
|
||||
|
||||
#### Optimized
|
||||
|
||||
|
||||
31
RELEASE.md
31
RELEASE.md
@@ -44,7 +44,6 @@ The following are notable new features and improvements in ROCm 7.1.0. For chang
|
||||
|
||||
ROCm 7.1.0 extends the operating system support for the following AMD hardware:
|
||||
|
||||
* AMD Instinct MI355X and MI350X GPUs add support for Debian 13.
|
||||
* AMD Instinct MI325X adds support for RHEL 10.0, SLES15 SP7, Debian 13, Debian 12, Oracle Linux 10, and Oracle Linux 9.
|
||||
* AMD Instinct MI100 adds support for SLES 15 SP7.
|
||||
|
||||
@@ -212,7 +211,6 @@ hipBLASLt introduces several performance and model compatibility improvements fo
|
||||
|
||||
* TF32 kernel optimization for AMD Instinct MI355X GPUs to enhance training and inference efficiency.
|
||||
* FP32 kernel optimization for AMD Instinct MI350X GPUs, improving precision-based workloads.
|
||||
* Meta model optimization for AMD Instinct MI350X GPUs, enabling better performance across transformer-based models.
|
||||
* Llama 2 70B model support fix for AMD Instinct MI350X GPUs: Removed incorrect kernel to ensure accurate and stable execution.
|
||||
* For AMD Instinct MI350X GPUs, added multiple high-performance kernels optimized for `FP16` and `BF16` data types, enhancing heuristic-based execution.
|
||||
* FP8 low-precision data type operations on AMD Instinct MI350X GPUs. This update adds FP8 support for the Instinct MI350X using the hipBLASLt low-precision data type functionality.
|
||||
@@ -465,7 +463,7 @@ Click {fab}`github` to go to the component's source code on GitHub.
|
||||
</tr>
|
||||
<tr>
|
||||
<td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-7.1.0/index.html">hipFFT</a></td>
|
||||
<td>1.0.20 ⇒ <a href="#hipfft-1-0-20">1.0.21</a></td>
|
||||
<td>1.0.20 ⇒ <a href="#hipfft-1-0-21">1.0.21</a></td>
|
||||
<td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft"><i class="fab fa-github fa-lg"></i></a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
@@ -481,7 +479,7 @@ Click {fab}`github` to go to the component's source code on GitHub.
|
||||
<tr>
|
||||
<td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-7.1.0/index.html">hipSOLVER</a></td>
|
||||
<td>3.0.0 ⇒ <a href="#hipsolver-3-1-0">3.1.0</a></td>
|
||||
<td><a href="https://github.com/ROCm/hipSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
|
||||
<td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsolver"><i class="fab fa-github fa-lg"></i></a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-7.1.0/index.html">hipSPARSE</a></td>
|
||||
@@ -516,7 +514,7 @@ Click {fab}`github` to go to the component's source code on GitHub.
|
||||
<tr>
|
||||
<td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-7.1.0/index.html">rocSOLVER</a></td>
|
||||
<td>3.30.1 ⇒ <a href="#rocsolver-3-31-0">3.31.0</a></td>
|
||||
<td><a href="https://github.com/ROCm/rocSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
|
||||
<td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsolver"><i class="fab fa-github fa-lg"></i></a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-7.1.0/index.html">rocSPARSE</a></td>
|
||||
@@ -742,10 +740,6 @@ For a historical overview of ROCm component updates, see the {doc}`ROCm consolid
|
||||
|
||||
* Fixed certain output in `amd-smi monitor` when GPUs are partitioned. This resolves issues with `amd-smi monitor` commands such as `amd-smi monitor -Vqt`, `amd-smi monitor -g 0 -Vqt -w 1`, and `amd-smi monitor -Vqt --file /tmp/test1`. These commands now display output as expected in partitioned GPU scenarios.
|
||||
|
||||
```{note}
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
||||
```
|
||||
|
||||
### **Composable Kernel** (1.1.0)
|
||||
|
||||
#### Added
|
||||
@@ -835,7 +829,6 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
|
||||
#### Optimized
|
||||
|
||||
* `TF32` kernel optimization for the AMD Instinct MI355X GPU to enhance training and inference efficiency.
|
||||
* Meta Model optimization for the AMD Instinct MI350X GPU to enable better performance across transformer-based models.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
@@ -1273,6 +1266,8 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
|
||||
|
||||
* MI300A/X L2-Fabric 64B read counter may display negative values - The rocprof-compute metric 17.6.1 (Read 64B) can report negative values due to incorrect calculation when TCC_BUBBLE_sum + TCC_EA0_RDREQ_32B_sum exceeds TCC_EA0_RDREQ_sum.
|
||||
* A workaround has been implemented using max(0, calculated_value) to prevent negative display values while the root cause is under investigation.
|
||||
* The profile mode crashes when `--format-rocprof-output json` is selected.
|
||||
* As a workaround, this option should either not be provided or should be set to `csv` instead of `json`. This issue does not affect the profiling results since both `csv` and `json` output formats lead to the same profiling data.
|
||||
|
||||
### **ROCm Data Center Tool** (1.2.0)
|
||||
|
||||
@@ -1388,11 +1383,11 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
|
||||
* Updated error handling for several rocRAND unit tests to accommodate the new `hipGetLastError` behavior that was introduced in ROCm 7.0.
|
||||
As of ROCm 7.0, the internal error state is cleared on each call to `hipGetLastError` rather than on every HIP API call.
|
||||
|
||||
### **rocSOLVER** (3.30.0)
|
||||
### **rocSOLVER** (3.31.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Hybrid computation support for existing routines: STEQR
|
||||
* Hybrid computation support for existing STEQR routines.
|
||||
|
||||
#### Optimized
|
||||
|
||||
@@ -1491,6 +1486,18 @@ ls -l /opt/rocm-7.0.0/lib/libmigraphx_py_*.so
|
||||
```
|
||||
The issue will be resolved in a future ROCm release. See [GitHub issue #5500](https://github.com/ROCm/ROCm/issues/5500).
|
||||
|
||||
### rocprofv3 fails on RPM-based OS with Python 3.10 (and later)
|
||||
|
||||
On RPM-based operating systems (such as RHEL 8), the `rocprofv3` tool fails with Python 3.10 and later due to missing ROCPD bindings. As a workaround, use Python 3.6 if you need to use the `rocprofv3` tool with ROCm 7.1.0. This issue will be fixed in a future ROCm release. See [GitHub issue #5606](https://github.com/ROCm/ROCm/issues/5606).
|
||||
|
||||
### ROCgdb might fail on SR-IOV guest VMs
|
||||
|
||||
ROCgdb might fail when running the `step-schedlock-spurious-waves.exp` test case on SR-IOV guest virtual machines (VMs). As a workaround, avoid running an inferior in ROCgdb if a background process is already heavily utilizing the GPU. The issue is currently under investigation and will be fixed in a future ROCm release. See [GitHub issue #5607](https://github.com/ROCm/ROCm/issues/5607).
|
||||
|
||||
### Issue uninstalling ROCm Bandwidth Test using amdgpu-install script
|
||||
|
||||
Due to a missing `rocm-core` dependency from the ROCm Bandwidth Test, you can't cleanly uninstall ROCm Bandwidth Test using the `amdgpu-install` script. As a workaround, uninstall ROCm Bandwidth Test manually, using the native package managers. For more information, see [Installation via native package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/install-methods/package-manager-index.html). The issue will be fixed in a future ROCm release. See [GitHub issue #5611](https://github.com/ROCm/ROCm/issues/5611).
|
||||
|
||||
## ROCm resolved issues
|
||||
|
||||
The following are previously known issues resolved in this release. For resolved issues related to
|
||||
|
||||
@@ -49,8 +49,8 @@ ROCm Version,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
Thrust,2.6.0,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.6.0,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
Thrust,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.0, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
@@ -96,7 +96,7 @@ ROCm Version,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.25414,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
@@ -126,12 +126,12 @@ ROCm Version,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.025413,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.025413,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025413,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.25414,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.1.25414,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
|
||||
|
@@ -66,8 +66,8 @@ compatibility and system requirements.
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0
|
||||
,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
|
||||
Thrust,2.6.0,2.6.0,2.5.0
|
||||
CUB,2.6.0,2.6.0,2.5.0
|
||||
Thrust,2.8.5,2.6.0,2.5.0
|
||||
CUB,2.8.5,2.6.0,2.5.0
|
||||
,,,
|
||||
DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.0, 30.10.2, |br| 30.10.1 [#driver_patch]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
@@ -113,7 +113,7 @@ compatibility and system requirements.
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.0.0,3.3.0
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.25414,7.0.51831,6.4.43482
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.25424,7.0.51831,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.0.2,6.4.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
|
||||
,,,
|
||||
@@ -143,13 +143,13 @@ compatibility and system requirements.
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix:,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.025413,20.0.0.25385,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.025413,20.0.0.25385,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025413,20.0.0.25385,19.0.0.25133
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25425,20.0.0.25385,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25425,20.0.0.25385,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25425,20.0.0.25385,19.0.0.25133
|
||||
,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.25414,7.0.51831,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.1.25414,7.0.51831,6.4.43482
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.25424,7.0.51831,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.1.25424,7.0.51831,6.4.43482
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
|
||||
|
||||
@@ -164,7 +164,7 @@ compatibility and system requirements.
|
||||
.. [#ol-710-mi300x] **For ROCm 7.1.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#ol-700-mi300x] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#ol-mi300x] **Prior to ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#db-710-mi300x] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs.
|
||||
.. [#db-710-mi300x] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI325X and MI300X GPUs.
|
||||
.. [#db12-710] **For ROCm 7.1.x** - Debian 12 is supported only on AMD Instinct MI325X, MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#db-mi300x] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#az-mi300x] Starting with ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
|
||||
@@ -270,7 +270,7 @@ Expand for full historical view of:
|
||||
.. [#ol-710-mi300x-past-60] **For ROCm 7.1.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#ol-700-mi300x-past-60] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#mi300x-past-60] **Prior to ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#db-710-mi300x-past-60] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs.
|
||||
.. [#db-710-mi300x-past-60] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI325X and MI300X GPUs.
|
||||
.. [#db12-710-past-60] **For ROCm 7.1.x** - Debian 12 is supported only on AMD Instinct MI325X, MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#db-mi300x-past-60] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
|
||||
|
||||
@@ -0,0 +1,316 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 790d22168820507f3105fef29596549378cfe399
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_seqs: 1024
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
@@ -1,13 +1,13 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
|
||||
vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 790d22168820507f3105fef29596549378cfe399
|
||||
commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -46,6 +46,8 @@ The following variables are generally useful for Instinct MI300X/MI355X GPUs and
|
||||
multi-GPU distributed workloads** (tensor parallelism, pipeline
|
||||
parallelism). Single-GPU inference does not need this.
|
||||
|
||||
.. _vllm-optimization-aiter-switches:
|
||||
|
||||
AITER (AI Tensor Engine for ROCm) switches
|
||||
==========================================
|
||||
|
||||
|
||||
@@ -0,0 +1,482 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker-930:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
|
||||
prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X
|
||||
GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
|
||||
specifically for AMD data center GPUs and includes the following components:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-930>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||
|
||||
* Added support for AMD Instinct MI355X and MI350X GPUs.
|
||||
|
||||
* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
|
||||
|
||||
* Llama 4 Scout and Maverick
|
||||
|
||||
* DeepSeek R1 0528 FP8
|
||||
|
||||
* MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
|
||||
|
||||
* GPT OSS 20B and 120B
|
||||
|
||||
* Qwen 3 32B, 30B-A3B, and 235B-A22B
|
||||
|
||||
* Removed the deprecated ``--max-seq-len-to-capture`` flag.
|
||||
|
||||
* ``--gpu-memory-utilization`` is now configurable via the `configuration files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
|
||||
repository.
|
||||
|
||||
.. _vllm-benchmark-supported-models-930:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-930:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
documentation might vary by model -- select one to get started. MXFP4 models
|
||||
are only supported on MI355X and MI350X GPUs.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-930:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
|
||||
{% if model.precision == "float4" %}
|
||||
.. important::
|
||||
|
||||
MXFP4 is supported only on MI355X and MI350X GPUs.
|
||||
{% endif %}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
{% if model.precision == "float4" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-930:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before running inference benchmarks.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-930:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node with the
|
||||
:literal:`{{model.precision}}` data type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-930>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
|
||||
{% if model.tunableop %}
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||
operators to find the fastest one for your hardware.
|
||||
|
||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||
the ``--tunableop on`` argument in your run.
|
||||
|
||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||
performance-collection run.
|
||||
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
For more information on configuration, see the `config files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
|
||||
in the MAD repository. Refer to the `vLLM engine arguments <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
|
||||
for descriptions of available configuration options
|
||||
and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
|
||||
additional benchmarking information.
|
||||
|
||||
.. rubric:: Launch the container
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ docker.docker_hub_url }}>`_ as shown
|
||||
in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. rubric:: Throughput command
|
||||
|
||||
Use the following command to start the throughput benchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
num_prompts={{ model.config.num_prompts | default(1024) }}
|
||||
in={{ model.config.in | default(128) }}
|
||||
out={{ model.config.out | default(128) }}
|
||||
dtype={{ model.config.dtype | default("auto") }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm bench throughput --model $model \
|
||||
-tp $tp \
|
||||
--num-prompts $num_prompts \
|
||||
--input-len $in \
|
||||
--output-len $out \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--trust-remote-code \
|
||||
--output-json ${model}_throughput.json \
|
||||
--gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
|
||||
|
||||
.. rubric:: Serving command
|
||||
|
||||
1. Start the server using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
dtype={{ model.config.dtype }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs=256
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm serve $model \
|
||||
-tp $tp \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--no-enable-prefix-caching \
|
||||
--swap-space 16 \
|
||||
--disable-log-requests \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9
|
||||
|
||||
Wait until the model has loaded and the server is ready to accept requests.
|
||||
|
||||
2. In another terminal on the same machine, run the benchmark:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Connect to the container
|
||||
docker exec -it test bash
|
||||
|
||||
# Wait for the server to start
|
||||
until curl -s http://localhost:8000/v1/models; do sleep 30; done
|
||||
|
||||
# Run the benchmark
|
||||
model={{ model.model_repo }}
|
||||
max_concurrency=1
|
||||
num_prompts=10
|
||||
in=128
|
||||
out=128
|
||||
vllm bench serve --model $model \
|
||||
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||
--dataset-name random \
|
||||
--ignore-eos \
|
||||
--max-concurrency $max_concurrency \
|
||||
--num-prompts $num_prompts \
|
||||
--random-input-len $in \
|
||||
--random-output-len $out \
|
||||
--trust-remote-code \
|
||||
--save-result \
|
||||
--result-filename ${model}_serving.json
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
|
||||
try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block::
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Advanced usage
|
||||
==============
|
||||
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
|
||||
To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
|
||||
|
||||
1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
|
||||
2. Use the following command to build the image directly from the specified commit.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
.. code-block:: shell
|
||||
|
||||
docker build -f docker/Dockerfile.rocm \
|
||||
--build-arg REMOTE_VLLM=1 \
|
||||
--build-arg VLLM_REPO=https://github.com/ROCm/vllm \
|
||||
--build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
|
||||
-t vllm-rocm .
|
||||
|
||||
.. tip::
|
||||
|
||||
Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
a brief introduction to vLLM and optimization strategies.
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``
|
||||
(latest)
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.11.1
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <../vllm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506>`__
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.10.2
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <../vllm>`
|
||||
* :doc:`Documentation <vllm-0.10.2-20251006>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5>`__
|
||||
|
||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909``
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. _vllm-benchmark-unified-docker-930:
|
||||
.. _vllm-benchmark-unified-docker-1024:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
@@ -34,7 +34,7 @@ vLLM inference performance testing
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-930>` for
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-1024>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
|
||||
@@ -42,27 +42,13 @@ What's new
|
||||
|
||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||
|
||||
* Added support for AMD Instinct MI355X and MI350X GPUs.
|
||||
* Enabled :ref:`AITER <vllm-optimization-aiter-switches>` by default.
|
||||
|
||||
* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
|
||||
* Fixed ``rms_norm`` segfault issue with Qwen 3 235B.
|
||||
|
||||
* Llama 4 Scout and Maverick
|
||||
* Known performance degradation on Llama 4 models due to `an upstream vLLM issue <https://github.com/vllm-project/vllm/issues/26320>`_.
|
||||
|
||||
* DeepSeek R1 0528 FP8
|
||||
|
||||
* MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
|
||||
|
||||
* GPT OSS 20B and 120B
|
||||
|
||||
* Qwen 3 32B, 30B-A3B, and 235B-A22B
|
||||
|
||||
* Removed the deprecated ``--max-seq-len-to-capture`` flag.
|
||||
|
||||
* ``--gpu-memory-utilization`` is now configurable via the `configuration files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
|
||||
repository.
|
||||
|
||||
.. _vllm-benchmark-supported-models-930:
|
||||
.. _vllm-benchmark-supported-models-1024:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -72,7 +58,7 @@ Supported models
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-930:
|
||||
.. _vllm-benchmark-available-models-1024:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
@@ -108,7 +94,7 @@ Supported models
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-930:
|
||||
.. _vllm-benchmark-vllm-1024:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -136,7 +122,7 @@ Supported models
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-930:
|
||||
.. _vllm-benchmark-performance-measurements-1024:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -192,7 +178,7 @@ Benchmarking
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-930:
|
||||
.. _vllm-benchmark-mad-1024:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -204,7 +190,7 @@ Benchmarking
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -233,7 +219,7 @@ Benchmarking
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-930>` are preconfigured to collect
|
||||
<vllm-benchmark-available-models-1024>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
@@ -258,7 +244,7 @@ Benchmarking
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
@@ -419,6 +405,10 @@ Advanced usage
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
If you’re using this Docker image on other AMD GPUs such as the AMD Instinct MI200 Series or Radeon, add ``export VLLM_ROCM_USE_AITER=0`` to your command, since AITER is only supported on gfx942 and gfx950 architectures.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
|
||||
|
||||
Reference in New Issue
Block a user