Compare commits

...

3 Commits

Author SHA1 Message Date
Hongxia Yang
6e7bd88d12 Fixed wording related to VLLM_V1_USE_PREFILL_DECODE_ATTENTION 2025-10-31 13:33:53 -04:00
amd-hsivasun
e4a59d8c66 [Ex CI] Enable rocWMMA Monorepo (#5597)
* [Ex CI] Enable rocWMMA Monorepo

* Updated to use component name parameter
2025-10-30 13:43:05 -04:00
Pratik Basyal
8108fe7275 7.1.0 Post GA updates (#5600)
* Post GA updates

* Mono repo link added

* AMD SMI changelog link removed
2025-10-30 13:27:25 -04:00
5 changed files with 89 additions and 69 deletions

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocWMMA
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -66,7 +85,11 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocWMMA_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -81,6 +104,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -102,9 +126,12 @@ jobs:
# gfx1030 not supported in documentation
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -112,43 +139,45 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocWMMA_test_${{ job.target }}
timeoutInMinutes: 270
dependsOn: rocWMMA_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocWMMA
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
timeoutInMinutes: 270
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

View File

@@ -141,7 +141,6 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
#### Optimized
* `TF32` kernel optimization for the AMD Instinct MI355X GPU to enhance training and inference efficiency.
* Meta Model optimization for the AMD Instinct MI350X GPU to enable better performance across transformer-based models.
#### Resolved issues
@@ -694,7 +693,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
* Updated error handling for several rocRAND unit tests to accommodate the new `hipGetLastError` behavior that was introduced in ROCm 7.0.
As of ROCm 7.0, the internal error state is cleared on each call to `hipGetLastError` rather than on every HIP API call.
### **rocSOLVER** (3.30.0)
### **rocSOLVER** (3.31.0)
#### Added
@@ -1044,10 +1043,6 @@ for a complete overview of this release.
- `amd-smi monitor` on Linux Guest systems triggers an attribute error.
```{note}
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.0/CHANGELOG.md) for details, examples, and in-depth descriptions.
```
### **Composable Kernel** (1.1.0)
#### Added

View File

@@ -44,7 +44,6 @@ The following are notable new features and improvements in ROCm 7.1.0. For chang
ROCm 7.1.0 extends the operating system support for the following AMD hardware:
* AMD Instinct MI355X and MI350X GPUs add support for Debian 13.
* AMD Instinct MI325X adds support for RHEL 10.0, SLES15 SP7, Debian 13, Debian 12, Oracle Linux 10, and Oracle Linux 9.
* AMD Instinct MI100 adds support for SLES 15 SP7.
@@ -212,7 +211,6 @@ hipBLASLt introduces several performance and model compatibility improvements fo
* TF32 kernel optimization for AMD Instinct MI355X GPUs to enhance training and inference efficiency.
* FP32 kernel optimization for AMD Instinct MI350X GPUs, improving precision-based workloads.
* Meta model optimization for AMD Instinct MI350X GPUs, enabling better performance across transformer-based models.
* Llama 2 70B model support fix for AMD Instinct MI350X GPUs: Removed incorrect kernel to ensure accurate and stable execution.
* For AMD Instinct MI350X GPUs, added multiple high-performance kernels optimized for `FP16` and `BF16` data types, enhancing heuristic-based execution.
* FP8 low-precision data type operations on AMD Instinct MI350X GPUs. This update adds FP8 support for the Instinct MI350X using the hipBLASLt low-precision data type functionality.
@@ -465,7 +463,7 @@ Click {fab}`github` to go to the component's source code on GitHub.
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-7.1.0/index.html">hipFFT</a></td>
<td>1.0.20&nbsp;&Rightarrow;&nbsp;<a href="#hipfft-1-0-20">1.0.21</a></td>
<td>1.0.20&nbsp;&Rightarrow;&nbsp;<a href="#hipfft-1-0-21">1.0.21</a></td>
<td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
@@ -481,7 +479,7 @@ Click {fab}`github` to go to the component's source code on GitHub.
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-7.1.0/index.html">hipSOLVER</a></td>
<td>3.0.0&nbsp;&Rightarrow;&nbsp;<a href="#hipsolver-3-1-0">3.1.0</a></td>
<td><a href="https://github.com/ROCm/hipSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
<td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsolver"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-7.1.0/index.html">hipSPARSE</a></td>
@@ -516,7 +514,7 @@ Click {fab}`github` to go to the component's source code on GitHub.
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-7.1.0/index.html">rocSOLVER</a></td>
<td>3.30.1&nbsp;&Rightarrow;&nbsp;<a href="#rocsolver-3-31-0">3.31.0</a></td>
<td><a href="https://github.com/ROCm/rocSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
<td><a href="https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsolver"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-7.1.0/index.html">rocSPARSE</a></td>
@@ -742,10 +740,6 @@ For a historical overview of ROCm component updates, see the {doc}`ROCm consolid
* Fixed certain output in `amd-smi monitor` when GPUs are partitioned. It fixes the issue with amd-smi monitor such as: `amd-smi monitor -Vqt`, `amd-smi monitor -g 0 -Vqt -w 1`, and `amd-smi monitor -Vqt --file /tmp/test1`. These commands will now be able to display as normal in partitioned GPU scenarios.
```{note}
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md) for details, examples, and in-depth descriptions.
```
### **Composable Kernel** (1.1.0)
#### Added
@@ -835,7 +829,6 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
#### Optimized
* `TF32` kernel optimization for the AMD Instinct MI355X GPU to enhance training and inference efficiency.
* Meta Model optimization for the AMD Instinct MI350X GPU to enable better performance across transformer-based models.
#### Resolved issues
@@ -1388,7 +1381,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
* Updated error handling for several rocRAND unit tests to accommodate the new `hipGetLastError` behavior that was introduced in ROCm 7.0.
As of ROCm 7.0, the internal error state is cleared on each call to `hipGetLastError` rather than on every HIP API call.
### **rocSOLVER** (3.30.0)
### **rocSOLVER** (3.31.0)
#### Added

View File

@@ -164,7 +164,7 @@ compatibility and system requirements.
.. [#ol-710-mi300x] **For ROCm 7.1.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
.. [#ol-700-mi300x] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
.. [#ol-mi300x] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
.. [#db-710-mi300x] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs.
.. [#db-710-mi300x] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI325X and MI300X GPUs.
.. [#db12-710] **For ROCm 7.1.x** - Debian 12 is supported only on AMD Instinct MI325X, MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
.. [#db-mi300x] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
.. [#az-mi300x] Starting ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
@@ -270,7 +270,7 @@ Expand for full historical view of:
.. [#ol-710-mi300x-past-60] **For ROCm 7.1.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
.. [#ol-700-mi300x-past-60] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
.. [#mi300x-past-60] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
.. [#db-710-mi300x-past-60] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs.
.. [#db-710-mi300x-past-60] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI325X and MI300X GPUs.
.. [#db12-710-past-60] **For ROCm 7.1.x** - Debian 12 is supported only on AMD Instinct MI325X, MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
.. [#db-mi300x-past-60] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
.. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.

View File

@@ -65,7 +65,7 @@ Quick start examples:
export VLLM_ROCM_USE_AITER=1
vllm serve MODEL_NAME
# Enable only AITER Triton Prefill-Decode (split) attention
# Enable AITER Fused MoE and enable Triton Prefill-Decode (split) attention
export VLLM_ROCM_USE_AITER=1
export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
@@ -242,14 +242,17 @@ Most users won't need this, but you can override the defaults:
* - AITER MHA (standard models)
- ``VLLM_ROCM_USE_AITER=1`` (auto-selects for non-MLA models)
* - AITER Triton Prefill-Decode (split)
* - vLLM Triton Unified (default)
- ``VLLM_ROCM_USE_AITER=0`` (or unset)
* - Triton Prefill-Decode (split) without AITER
- | ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``
* - Triton Prefill-Decode (split) along with AITER Fused-MoE
- | ``VLLM_ROCM_USE_AITER=1``
| ``VLLM_ROCM_USE_AITER_MHA=0``
| ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``
* - vLLM Triton Unified (default)
- ``VLLM_ROCM_USE_AITER=0`` (or unset)
* - AITER Unified Attention
- | ``VLLM_ROCM_USE_AITER=1``
| ``VLLM_ROCM_USE_AITER_MHA=0``
@@ -267,11 +270,11 @@ Most users won't need this, but you can override the defaults:
--block-size 1 \
--tensor-parallel-size 8
# Advanced: Use Prefill-Decode split (for short input cases)
# Advanced: Use Prefill-Decode split (for short input cases) with AITER Fused-MoE
VLLM_ROCM_USE_AITER=1 \
VLLM_ROCM_USE_AITER_MHA=0 \
VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 \
vllm serve meta-llama/Llama-3.3-70B-Instruct
vllm serve meta-llama/Llama-4-Scout-17B-16E
**Which backend should I choose?**
@@ -350,14 +353,14 @@ vLLM V1 on ROCm provides these attention implementations:
3. **AITER Triton PrefillDecode Attention** (hybrid, Instinct MI300X-optimized)
* Enable with ``VLLM_ROCM_USE_AITER=1``, ``VLLM_ROCM_USE_AITER_MHA=0``, and ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``
* Enable with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``
* Uses separate kernels for prefill and decode phases:
* **Prefill**: ``context_attention_fwd`` Triton kernel
* **Primary decode**: ``torch.ops._rocm_C.paged_attention`` (custom ROCm kernel optimized for head sizes 64/128, block sizes 16/32, GQA 116, context ≤131k; sliding window not supported)
* **Fallback decode**: ``kernel_paged_attention_2d`` Triton kernel when shapes don't meet primary decode requirements
* Usually better compared to unified Triton kernels (both vLLM and AITER variants)
* Usually better compared to unified Triton kernels
* Performance vs AITER MHA varies: AITER MHA is typically faster overall, but Prefill-Decode split may win in short input scenarios
* The custom paged attention decode kernel is controlled by ``VLLM_ROCM_CUSTOM_PAGED_ATTN`` (default **True**)