From eba211d7f156098d6379ae2472ab7c45a7f4b94b Mon Sep 17 00:00:00 2001 From: kiritigowda Date: Thu, 16 Oct 2025 15:22:27 -0700 Subject: [PATCH 01/15] CTest - Output verbose --- .azuredevops/components/hipTensor.yml | 2 +- .azuredevops/components/rocm-cmake.yml | 2 +- .azuredevops/templates/steps/test.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.azuredevops/components/hipTensor.yml b/.azuredevops/components/hipTensor.yml index 30db323a2..dddddad4e 100644 --- a/.azuredevops/components/hipTensor.yml +++ b/.azuredevops/components/hipTensor.yml @@ -130,7 +130,7 @@ jobs: parameters: componentName: hipTensor testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor' - testParameters: '-E ".*-extended" --output-on-failure --force-new-ctest-process --output-junit test_output.xml' + testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml' - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml parameters: aptPackages: ${{ parameters.aptPackages }} diff --git a/.azuredevops/components/rocm-cmake.yml b/.azuredevops/components/rocm-cmake.yml index 81fb4ab6b..b54d3c1cb 100644 --- a/.azuredevops/components/rocm-cmake.yml +++ b/.azuredevops/components/rocm-cmake.yml @@ -81,7 +81,7 @@ jobs: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml parameters: componentName: rocm-cmake - testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml' + testParameters: '-E "pass-version-parent" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml' os: ${{ job.os }} - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml parameters: diff --git a/.azuredevops/templates/steps/test.yml b/.azuredevops/templates/steps/test.yml index 5b582e8ae..c03850116 100644 --- a/.azuredevops/templates/steps/test.yml +++ b/.azuredevops/templates/steps/test.yml @@ -13,7 +13,7 @@ parameters: default: ctest - name: testParameters type: string - default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml + default: --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml - name: extraTestParameters type: string default: '' From b3459da524f479f0c13285a3f98f2cd6c63f4f7f Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Fri, 17 Oct 2025 14:02:54 +0200 Subject: [PATCH 02/15] [Ex CI] Add libomp-dev, MIVisionX, rocDecode --- .azuredevops/components/rocm-examples.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.azuredevops/components/rocm-examples.yml b/.azuredevops/components/rocm-examples.yml index bba5f7a20..cd9343bb3 100644 --- a/.azuredevops/components/rocm-examples.yml +++ b/.azuredevops/components/rocm-examples.yml @@ -17,10 +17,14 @@ parameters: - libdw-dev - libglfw3-dev - libmsgpack-dev + - libomp-dev - libopencv-dev - libtbb-dev - libtiff-dev - libva-amdgpu-dev + - libavcodec-dev + - libavformat-dev + - libavutil-dev - ninja-build - python3-pip - name: rocmDependencies @@ -40,7 +44,9 @@ parameters: - hipTensor - llvm-project - MIOpen + - MIVisionX - rocBLAS + - rocDecode - rocFFT - rocJPEG - rocPRIM @@ -70,7 +76,9 @@ parameters: - hipTensor - llvm-project - MIOpen + - MIVisionX - rocBLAS + - rocDecode - rocFFT - rocminfo - rocPRIM From fd6bbe18a7115527f3ec722f90711ee40e1fd896 Mon Sep 17 00:00:00 2001 From: Pratik Basyal Date: Fri, 17 Oct 2025 17:13:42 -0400 Subject: [PATCH 03/15] PLDM update for MI250 and MI210 [Develop] (#5537) * PLDM 
update for MI250 and MI210 * PLDM update --- RELEASE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 06d22a836..6efa0b566 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -149,11 +149,11 @@ firmware, AMD GPU drivers, and the ROCm user space software. MI250 - MU5 w/ IFWI 75 (or later) + MU3 w/ IFWI 73 MI210 - MU5 w/ IFWI 75 (or later) + MU3 w/ IFWI 73 8.4.0.K From a5f0b30a4769348318a9083678c6811f09eb3bb6 Mon Sep 17 00:00:00 2001 From: Pratik Basyal Date: Mon, 20 Oct 2025 14:39:17 -0400 Subject: [PATCH 04/15] PLDM version update for MI350 series [Develop] (#5547) * PLDM version update for MI350 series * Minor update --- RELEASE.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 6efa0b566..08ea37b53 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -91,7 +91,7 @@ firmware, AMD GPU drivers, and the ROCm user space software. ROCm 7.0.2 MI355X - 01.25.15.02 (or later)
+ 01.25.15.04
01.25.13.09 30.10.2
@@ -102,7 +102,7 @@ firmware, AMD GPU drivers, and the ROCm user space software. MI350X - 01.25.15.02 (or later)
+ 01.25.15.04
01.25.13.09 30.10.2
@@ -112,7 +112,7 @@ firmware, AMD GPU drivers, and the ROCm user space software. MI325X - 01.25.04.02 (or later)
+ 01.25.04.02
01.25.03.03 @@ -139,13 +139,13 @@ firmware, AMD GPU drivers, and the ROCm user space software. MI300A - BKC 26 (or later)
+ BKC 26
BKC 25 Not Applicable MI250X - IFWI 47 (or later) + IFWI 47 MI250 From 8eb5fef37c7db3c29dc90d85832468c142e24982 Mon Sep 17 00:00:00 2001 From: anisha-amd Date: Tue, 21 Oct 2025 16:12:18 -0400 Subject: [PATCH 05/15] Docs: frameworks compatibility standardization (#5488) --- .wordlist.txt | 6 +- .../ml-compatibility/dgl-compatibility.rst | 133 ++++++++++-------- .../flashinfer-compatibility.rst | 39 ++--- .../ml-compatibility/jax-compatibility.rst | 64 ++++----- .../llama-cpp-compatibility.rst | 55 ++++---- .../megablocks-compatibility.rst | 63 ++++++--- .../pytorch-compatibility.rst | 53 +++---- .../ml-compatibility/ray-compatibility.rst | 64 ++++----- .../stanford-megatron-lm-compatibility.rst | 78 ++++++---- .../ml-compatibility/taichi-compatibility.rst | 57 +++++--- .../tensorflow-compatibility.rst | 52 ++++--- .../ml-compatibility/verl-compatibility.rst | 60 ++++++-- 12 files changed, 426 insertions(+), 298 deletions(-) diff --git a/.wordlist.txt b/.wordlist.txt index 8e7c9ba62..294016553 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -34,6 +34,7 @@ AlexNet Andrej Arb Autocast +autograd BARs BatchNorm BLAS @@ -86,9 +87,11 @@ Conda ConnectX CountOnes CuPy +customizable da Dashboarding Dataloading +dataflows DBRX DDR DF @@ -182,7 +185,7 @@ GPT GPU GPU's GPUs -Graphbolt +GraphBolt GraphSage GRBM GRE @@ -212,6 +215,7 @@ Haswell Higgs href Hyperparameters +HybridEngine Huggingface IB ICD diff --git a/docs/compatibility/ml-compatibility/dgl-compatibility.rst b/docs/compatibility/ml-compatibility/dgl-compatibility.rst index 7c61515ec..3c18ce100 100644 --- a/docs/compatibility/ml-compatibility/dgl-compatibility.rst +++ b/docs/compatibility/ml-compatibility/dgl-compatibility.rst @@ -2,7 +2,7 @@ .. meta:: :description: Deep Graph Library (DGL) compatibility - :keywords: GPU, DGL compatibility + :keywords: GPU, CPU, deep graph library, DGL, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -10,24 +10,42 @@ DGL compatibility ******************************************************************************** -Deep Graph Library `(DGL) `_ is an easy-to-use, high-performance and scalable +Deep Graph Library (`DGL `__) is an easy-to-use, high-performance, and scalable Python package for deep learning on graphs. DGL is framework agnostic, meaning -if a deep graph model is a component in an end-to-end application, the rest of +that if a deep graph model is a component in an end-to-end application, the rest of the logic is implemented using PyTorch. -* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl `_ repository. -* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl `_ upstream repository. -* Use the prebuilt :ref:`Docker images ` with DGL, PyTorch, and ROCm preinstalled. -* See the :doc:`ROCm DGL installation guide ` - to install and get started. +DGL provides a high-performance graph object that can reside on either CPUs or GPUs. +It bundles structural data features for better control and provides a variety of functions +for computing with graph objects, including efficient and customizable message passing +primitives for Graph Neural Networks. 
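+
+As a quick illustration of these primitives, the following minimal sketch builds a
+small toy graph and runs one round of message passing with DGL's built-in functions
+(the graph structure and feature sizes are arbitrary example values):
+
+.. code-block:: python
+
+   import torch
+   import dgl
+   import dgl.function as fn
+
+   # Toy graph with 4 nodes and edges 0->1, 1->2, 2->3 (example values only).
+   g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 3])))
+   g.ndata["h"] = torch.randn(4, 8)  # random per-node features
+
+   # One message-passing step: copy source features and sum them at each destination.
+   g.update_all(fn.copy_u("h", "m"), fn.sum("m", "h_sum"))
+   print(g.ndata["h_sum"].shape)  # torch.Size([4, 8])
+
+   # On a ROCm build of PyTorch, g.to("cuda") moves the graph and its features
+   # to an AMD GPU.
+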
- -Supported devices +Support overview ================================================================================ -- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipblaslt) -- **Partially Supported**: TF32 with AMD Instinct MI250X +- The ROCm-supported version of DGL is maintained in the official `https://github.com/ROCm/dgl + `__ repository, which differs from the + `https://github.com/dmlc/dgl `__ upstream repository. +- To get started and install DGL on ROCm, use the prebuilt :ref:`Docker images `, + which include ROCm, DGL, and all required dependencies. + + - See the :doc:`ROCm DGL installation guide ` + for installation and setup instructions. + + - You can also consult the upstream `Installation guide `__ + for additional context. + +Version support +-------------------------------------------------------------------------------- + +DGL is supported on `ROCm 6.4.0 `__. + +Supported devices +-------------------------------------------------------------------------------- + +- **Officially Supported**: AMD Instinct™ MI300X (through `hipBLASlt `__) +- **Partially Supported**: AMD Instinct™ MI250X .. _dgl-recommendations: @@ -35,7 +53,7 @@ Use cases and recommendations ================================================================================ DGL can be used for Graph Learning, and building popular graph models like -GAT, GCN and GraphSage. Using these we can support a variety of use-cases such as: +GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported: - Recommender systems - Network Optimization and Analysis @@ -62,16 +80,17 @@ Docker image compatibility -AMD validates and publishes `DGL images `_ -with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated -inventories were tested on `ROCm 6.4.0 `_. +AMD validates and publishes `DGL images `__ +with ROCm backends on Docker Hub. The following Docker image tags and associated +inventories represent the latest available DGL version from the official Docker Hub. Click the |docker-icon| to view the image on Docker Hub. .. list-table:: DGL Docker image components :header-rows: 1 :class: docker-image-compatibility - * - Docker + * - Docker image + - ROCm - DGL - PyTorch - Ubuntu @@ -81,102 +100,106 @@ Click the |docker-icon| to view the image on Docker Hub. - - `2.4.0 `_ - - `2.6.0 `_ + - `6.4.0 `__. + - `2.4.0 `__ + - `2.6.0 `__ - 24.04 - - `3.12.9 `_ + - `3.12.9 `__ * - .. raw:: html - - `2.4.0 `_ - - `2.4.1 `_ + - `6.4.0 `__. + - `2.4.0 `__ + - `2.4.1 `__ - 24.04 - - `3.12.9 `_ + - `3.12.9 `__ * - .. raw:: html - - `2.4.0 `_ - - `2.4.1 `_ + - `6.4.0 `__. + - `2.4.0 `__ + - `2.4.1 `__ - 22.04 - - `3.10.16 `_ + - `3.10.16 `__ * - .. raw:: html - - `2.4.0 `_ - - `2.3.0 `_ + - `6.4.0 `__. + - `2.4.0 `__ + - `2.3.0 `__ - 22.04 - - `3.10.16 `_ + - `3.10.16 `__ Key ROCm libraries for DGL ================================================================================ DGL on ROCm depends on specific libraries that affect its features and performance. -Using the DGL Docker container or building it with the provided docker file or a ROCm base image is recommended. +Using the DGL Docker container or building it with the provided Docker file or a ROCm base image is recommended. If you prefer to build it yourself, ensure the following dependencies are installed: .. 
list-table:: :header-rows: 1 * - ROCm library - - Version + - ROCm 6.4.0 Version - Purpose * - `Composable Kernel `_ - - :version-ref:`"Composable Kernel" rocm_version` + - 1.1.0 - Enables faster execution of core operations like matrix multiplication (GEMM), convolutions and transformations. * - `hipBLAS `_ - - :version-ref:`hipBLAS rocm_version` + - 2.4.0 - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for matrix and vector operations. * - `hipBLASLt `_ - - :version-ref:`hipBLASLt rocm_version` + - 0.12.0 - hipBLASLt is an extension of the hipBLAS library, providing additional features like epilogues fused into the matrix multiplication kernel or use of integer tensor cores. * - `hipCUB `_ - - :version-ref:`hipCUB rocm_version` + - 3.4.0 - Provides a C++ template library for parallel algorithms for reduction, scan, sort and select. * - `hipFFT `_ - - :version-ref:`hipFFT rocm_version` + - 1.0.18 - Provides GPU-accelerated Fast Fourier Transform (FFT) operations. * - `hipRAND `_ - - :version-ref:`hipRAND rocm_version` + - 2.12.0 - Provides fast random number generation for GPUs. * - `hipSOLVER `_ - - :version-ref:`hipSOLVER rocm_version` + - 2.4.0 - Provides GPU-accelerated solvers for linear systems, eigenvalues, and singular value decompositions (SVD). * - `hipSPARSE `_ - - :version-ref:`hipSPARSE rocm_version` + - 3.2.0 - Accelerates operations on sparse matrices, such as sparse matrix-vector or matrix-matrix products. * - `hipSPARSELt `_ - - :version-ref:`hipSPARSELt rocm_version` + - 0.2.3 - Accelerates operations on sparse matrices, such as sparse matrix-vector or matrix-matrix products. * - `hipTensor `_ - - :version-ref:`hipTensor rocm_version` + - 1.5.0 - Optimizes for high-performance tensor operations, such as contractions. * - `MIOpen `_ - - :version-ref:`MIOpen rocm_version` + - 3.4.0 - Optimizes deep learning primitives such as convolutions, pooling, normalization, and activation functions. * - `MIGraphX `_ - - :version-ref:`MIGraphX rocm_version` + - 2.12.0 - Adds graph-level optimizations, ONNX models and mixed precision support and enable Ahead-of-Time (AOT) Compilation. * - `MIVisionX `_ - - :version-ref:`MIVisionX rocm_version` + - 3.2.0 - Optimizes acceleration for computer vision and AI workloads like preprocessing, augmentation, and inferencing. * - `rocAL `_ @@ -184,25 +207,25 @@ If you prefer to build it yourself, ensure the following dependencies are instal - Accelerates the data pipeline by offloading intensive preprocessing and augmentation tasks. rocAL is part of MIVisionX. * - `RCCL `_ - - :version-ref:`RCCL rocm_version` + - 2.2.0 - Optimizes for multi-GPU communication for operations like AllReduce and Broadcast. * - `rocDecode `_ - - :version-ref:`rocDecode rocm_version` + - 0.10.0 - Provides hardware-accelerated data decoding capabilities, particularly for image, video, and other dataset formats. * - `rocJPEG `_ - - :version-ref:`rocJPEG rocm_version` + - 0.8.0 - Provides hardware-accelerated JPEG image decoding and encoding. * - `RPP `_ - - :version-ref:`RPP rocm_version` + - 1.9.10 - Speeds up data augmentation, transformation, and other preprocessing steps. * - `rocThrust `_ - - :version-ref:`rocThrust rocm_version` + - 3.3.0 - Provides a C++ template library for parallel algorithms like sorting, reduction, and scanning. 
* - `rocWMMA `_ - - :version-ref:`rocWMMA rocm_version` + - 1.7.0 - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix multiplication (GEMM) and accumulation operations with mixed precision support. @@ -211,14 +234,14 @@ If you prefer to build it yourself, ensure the following dependencies are instal Supported features ================================================================================ -Many functions and methods available in DGL Upstream are also supported in DGL ROCm. +Many functions and methods available upstream are also supported in DGL on ROCm. Instead of listing them all, support is grouped into the following categories to provide a general overview. * DGL Base * DGL Backend * DGL Data * DGL Dataloading -* DGL DGLGraph +* DGL Graph * DGL Function * DGL Ops * DGL Sampling @@ -235,9 +258,9 @@ Instead of listing them all, support is grouped into the following categories to Unsupported features ================================================================================ -* Graphbolt -* Partial TF32 Support (MI250x only) -* Kineto/ ROCTracer integration +* GraphBolt +* Partial TF32 Support (MI250X only) +* Kineto/ROCTracer integration Unsupported functions diff --git a/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst index 45ecc6a75..700186c73 100644 --- a/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst +++ b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst @@ -1,8 +1,8 @@ :orphan: .. meta:: - :description: FlashInfer deep learning framework compatibility - :keywords: GPU, LLM, FlashInfer, compatibility + :description: FlashInfer compatibility + :keywords: GPU, LLM, FlashInfer, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -11,7 +11,7 @@ FlashInfer compatibility ******************************************************************************** `FlashInfer `__ is a library and kernel generator -for Large Language Models (LLMs) that provides high-performance implementation of graphics +for Large Language Models (LLMs) that provides a high-performance implementation of graphics processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well as advanced performance across diverse scenarios. @@ -25,28 +25,30 @@ offers high-performance LLM-specific operators, with easy integration through Py For the latest feature compatibility matrix, refer to the ``README`` of the `https://github.com/ROCm/flashinfer `__ repository. -Support for the ROCm port of FlashInfer is available as follows: +Support overview +================================================================================ -- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer - `__ repository. This location differs from the - `https://github.com/flashinfer-ai/flashinfer `_ +- The ROCm-supported version of FlashInfer is maintained in the official `https://github.com/ROCm/flashinfer + `__ repository, which differs from the + `https://github.com/flashinfer-ai/flashinfer `__ upstream repository. -- To install FlashInfer, use the prebuilt :ref:`Docker image `, - which includes ROCm, FlashInfer, and all required dependencies. +- To get started and install FlashInfer on ROCm, use the prebuilt :ref:`Docker images `, + which include ROCm, FlashInfer, and all required dependencies. - See the :doc:`ROCm FlashInfer installation guide ` - to install and get started. 
+ for installation and setup instructions. - - See the `Installation guide `__ - in the upstream FlashInfer documentation. + - You can also consult the upstream `Installation guide `__ + for additional context. -.. note:: +Version support +-------------------------------------------------------------------------------- - Flashinfer is supported on ROCm 6.4.1. +FlashInfer is supported on `ROCm 6.4.1 `__. Supported devices -================================================================================ +-------------------------------------------------------------------------------- **Officially Supported**: AMD Instinct™ MI300X @@ -78,10 +80,9 @@ Docker image compatibility -AMD validates and publishes `ROCm FlashInfer images `__ -with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated -inventories represent the FlashInfer version from the official Docker Hub. -The Docker images have been validated for `ROCm 6.4.1 `__. +AMD validates and publishes `FlashInfer images `__ +with ROCm backends on Docker Hub. The following Docker image tag and associated +inventories represent the latest available FlashInfer version from the official Docker Hub. Click |docker-icon| to view the image on Docker Hub. .. list-table:: diff --git a/docs/compatibility/ml-compatibility/jax-compatibility.rst b/docs/compatibility/ml-compatibility/jax-compatibility.rst index 121e4d126..8308a8efc 100644 --- a/docs/compatibility/ml-compatibility/jax-compatibility.rst +++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst @@ -2,7 +2,7 @@ .. meta:: :description: JAX compatibility - :keywords: GPU, JAX compatibility + :keywords: GPU, JAX, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -10,42 +10,38 @@ JAX compatibility ******************************************************************************* -JAX provides a NumPy-like API, which combines automatic differentiation and the -Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine -learning at scale. +`JAX `__ is a library +for array-oriented numerical computation (similar to NumPy), with automatic differentiation +and just-in-time (JIT) compilation to enable high-performance machine learning research. -JAX uses composable transformations of Python and NumPy through just-in-time -(JIT) compilation, automatic vectorization, and parallelization. To learn about -JAX, including profiling and optimizations, see the official `JAX documentation -`_. +JAX provides an API that combines automatic differentiation and the +Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine +learning at scale. JAX uses composable transformations of Python and NumPy through +JIT compilation, automatic vectorization, and parallelization. -ROCm support for JAX is upstreamed, and users can build the official source code -with ROCm support: +Support overview +================================================================================ -- ROCm JAX release: +- The ROCm-supported version of JAX is maintained in the official `https://github.com/ROCm/rocm-jax + `__ repository, which differs from the + `https://github.com/jax-ml/jax `__ upstream repository. - - Offers AMD-validated and community :ref:`Docker images ` - with ROCm and JAX preinstalled. +- To get started and install JAX on ROCm, use the prebuilt :ref:`Docker images `, + which include ROCm, JAX, and all required dependencies. 
- - ROCm JAX repository: `ROCm/rocm-jax `_ + - See the :doc:`ROCm JAX installation guide ` + for installation and setup instructions. - - See the :doc:`ROCm JAX installation guide ` - to get started. + - You can also consult the upstream `Installation guide `__ + for additional context. -- Official JAX release: +Version support +-------------------------------------------------------------------------------- - - Official JAX repository: `jax-ml/jax `_ - - - See the `AMD GPU (Linux) installation section - `_ in - the JAX documentation. - -.. note:: - - AMD releases official `ROCm JAX Docker images `_ - quarterly alongside new ROCm releases. These images undergo full AMD testing. - `Community ROCm JAX Docker images `_ - follow upstream JAX releases and use the latest available ROCm version. +AMD releases official `ROCm JAX Docker images `_ +quarterly alongside new ROCm releases. These images undergo full AMD testing. +`Community ROCm JAX Docker images `_ +follow upstream JAX releases and use the latest available ROCm version. Use cases and recommendations ================================================================================ @@ -71,7 +67,7 @@ Use cases and recommendations * The `Distributed fine-tuning with JAX on AMD GPUs `_ outlines the process of fine-tuning a Bidirectional Encoder Representations from Transformers (BERT)-based large language model (LLM) using JAX for a text - classification task. The blog post discuss techniques for parallelizing the + classification task. The blog post discusses techniques for parallelizing the fine-tuning across multiple AMD GPUs and assess the model's performance on a holdout dataset. During the fine-tuning, a BERT-base-cased transformer model and the General Language Understanding Evaluation (GLUE) benchmark dataset was @@ -90,9 +86,9 @@ For more use cases and recommendations, see `ROCm JAX blog posts `__ and are the -recommended way to get started with deep learning with JAX on ROCm. +AMD validates and publishes `JAX images `__ +with ROCm backends on Docker Hub. + For ``jax-community`` images, see `rocm/jax-community `__ on Docker Hub. @@ -234,7 +230,7 @@ The ROCm supported data types in JAX are collected in the following table. .. note:: - JAX data type support is effected by the :ref:`key_rocm_libraries` and it's + JAX data type support is affected by the :ref:`key_rocm_libraries` and it's collected on :doc:`ROCm data types and precision support ` page. diff --git a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst index 902c61a2a..b79baf253 100644 --- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst +++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst @@ -1,8 +1,8 @@ :orphan: .. meta:: - :description: llama.cpp deep learning framework compatibility - :keywords: GPU, GGML, llama.cpp compatibility + :description: llama.cpp compatibility + :keywords: GPU, GGML, llama.cpp, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -20,33 +20,32 @@ to accelerate inference and reduce memory usage. Originally built as a CPU-first llama.cpp is easy to integrate with other programming environments and is widely adopted across diverse platforms, including consumer devices. -ROCm support for llama.cpp is upstreamed, and you can build the official source code -with ROCm support: - -- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp - `_ repository. 
- -- Due to independent compatibility considerations, this location differs from the - `https://github.com/ggml-org/llama.cpp `_ upstream repository. - -- To install llama.cpp, use the prebuilt :ref:`Docker image `, - which includes ROCm, llama.cpp, and all required dependencies. - - - See the :doc:`ROCm llama.cpp installation guide ` - to install and get started. - - - See the `Installation guide `__ - in the upstream llama.cpp documentation. - -.. note:: - - llama.cpp is supported on ROCm 7.0.0 and ROCm 6.4.x. - -Supported devices +Support overview ================================================================================ -**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210 +- The ROCm-supported version of llama.cpp is maintained in the official `https://github.com/ROCm/llama.cpp + `__ repository, which differs from the + `https://github.com/ggml-org/llama.cpp `__ upstream repository. +- To get started and install llama.cpp on ROCm, use the prebuilt :ref:`Docker images `, + which include ROCm, llama.cpp, and all required dependencies. + + - See the :doc:`ROCm llama.cpp installation guide ` + for installation and setup instructions. + + - You can also consult the upstream `Installation guide `__ + for additional context. + +Version support +-------------------------------------------------------------------------------- + +llama.cpp is supported on `ROCm 7.0.0 `__ and +`ROCm 6.4.x `__. + +Supported devices +-------------------------------------------------------------------------------- + +**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210 Use cases and recommendations ================================================================================ @@ -84,9 +83,9 @@ Docker image compatibility -AMD validates and publishes `ROCm llama.cpp Docker images `__ +AMD validates and publishes `llama.cpp images `__ with ROCm backends on Docker Hub. The following Docker image tags and associated -inventories represent the available llama.cpp versions from the official Docker Hub. +inventories represent the latest available llama.cpp versions from the official Docker Hub. Click |docker-icon| to view the image on Docker Hub. .. important:: diff --git a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst index 50c2c3821..5716ececb 100644 --- a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst +++ b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst @@ -2,7 +2,7 @@ .. meta:: :description: Megablocks compatibility - :keywords: GPU, megablocks, compatibility + :keywords: GPU, megablocks, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -10,28 +10,42 @@ Megablocks compatibility ******************************************************************************** -Megablocks is a light-weight library for mixture-of-experts (MoE) training. +`Megablocks `__ is a lightweight library +for mixture-of-experts `(MoE) `__ training. The core of the system is efficient "dropless-MoE" and standard MoE layers. -Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM `_, +Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM +`__, where data and pipeline parallel training of MoEs is supported. -* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks `_ repository. 
-* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM `_ upstream repository. -* Use the prebuilt :ref:`Docker image ` with ROCm, PyTorch, and Megablocks preinstalled. -* See the :doc:`ROCm Megablocks installation guide ` to install and get started. +Support overview +================================================================================ -.. note:: +- The ROCm-supported version of Megablocks is maintained in the official `https://github.com/ROCm/megablocks + `__ repository, which differs from the + `https://github.com/stanford-futuredata/Megatron-LM `__ upstream repository. - Megablocks is supported on ROCm 6.3.0. +- To get started and install Megablocks on ROCm, use the prebuilt :ref:`Docker image `, + which includes ROCm, Megablocks, and all required dependencies. + + - See the :doc:`ROCm Megablocks installation guide ` + for installation and setup instructions. + + - You can also consult the upstream `Installation guide `__ + for additional context. + +Version support +-------------------------------------------------------------------------------- + +Megablocks is supported on `ROCm 6.3.0 `__. Supported devices -================================================================================ +-------------------------------------------------------------------------------- -- **Officially Supported**: AMD Instinct MI300X -- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210 +- **Officially Supported**: AMD Instinct™ MI300X +- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210 Supported models and features -================================================================================ +-------------------------------------------------------------------------------- This section summarizes the Megablocks features supported by ROCm. @@ -41,20 +55,28 @@ This section summarizes the Megablocks features supported by ROCm. * Mixture-of-Experts * dropless-Mixture-of-Experts - .. _megablocks-recommendations: Use cases and recommendations ================================================================================ -The `ROCm Megablocks blog posts `_ -guide how to leverage the ROCm platform for pre-training using the Megablocks framework. +* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs + `__ + blog post guides how to leverage the ROCm platform for pre-training using the + Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts + (MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it + demonstrates how block-sparse computations can enhance scalability and efficiency in MoE + training. The guide provides step-by-step instructions for setting up the environment, + including cloning the repository, building the Docker image, and running the training container. + Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training + language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE + training workflows for large-scale transformer models. + It features how to pre-process datasets and how to begin pre-training on AMD GPUs through: * Single-GPU pre-training * Multi-GPU pre-training - .. 
_megablocks-docker-compat: Docker image compatibility @@ -64,10 +86,9 @@ Docker image compatibility -AMD validates and publishes `ROCm Megablocks images `_ -with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated -inventories represent the latest Megatron-LM version from the official Docker Hub. -The Docker images have been validated for `ROCm 6.3.0 `_. +AMD validates and publishes `Megablocks images `__ +with ROCm backends on Docker Hub. The following Docker image tag and associated +inventories represent the latest available Megablocks version from the official Docker Hub. Click |docker-icon| to view the image on Docker Hub. .. list-table:: diff --git a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst index 19901b7cd..54365a72f 100644 --- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst +++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst @@ -2,7 +2,7 @@ .. meta:: :description: PyTorch compatibility - :keywords: GPU, PyTorch compatibility + :keywords: GPU, PyTorch, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -15,40 +15,42 @@ deep learning. PyTorch on ROCm provides mixed-precision and large-scale training using `MIOpen `__ and `RCCL `__ libraries. -ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due -to independent compatibility considerations, this results in two distinct -release cycles for PyTorch on ROCm: +PyTorch provides two high-level features: -- ROCm PyTorch release: +- Tensor computation (like NumPy) with strong GPU acceleration - - Provides the latest version of ROCm but might not necessarily support the - latest stable PyTorch version. +- Deep neural networks built on a tape-based autograd system (rapid computation + of multiple partial derivatives or gradients) - - Offers :ref:`Docker images ` with ROCm and PyTorch - preinstalled. +Support overview +================================================================================ - - ROCm PyTorch repository: ``__ +ROCm support for PyTorch is upstreamed into the official PyTorch repository. +ROCm development is aligned with the stable release of PyTorch, while upstream +PyTorch testing uses the stable release of ROCm to maintain consistency: - - See the :doc:`ROCm PyTorch installation guide ` - to get started. +- The ROCm-supported version of PyTorch is maintained in the official `https://github.com/ROCm/pytorch + `__ repository, which differs from the + `https://github.com/pytorch/pytorch `__ upstream repository. -- Official PyTorch release: +- To get started and install PyTorch on ROCm, use the prebuilt :ref:`Docker images `, + which include ROCm, PyTorch, and all required dependencies. - - Provides the latest stable version of PyTorch but might not necessarily - support the latest ROCm version. + - See the :doc:`ROCm PyTorch installation guide ` + for installation and setup instructions. - - Official PyTorch repository: ``__ - - - See the `Nightly and latest stable version installation guide `__ - or `Previous versions `__ - to get started. + - You can also consult the upstream `Installation guide `__ or + `Previous versions `__ for additional context. PyTorch includes tooling that generates HIP source code from the CUDA backend. This approach allows PyTorch to support ROCm without requiring manual code modifications. For more information, see :doc:`HIPIFY `. 
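+
+Because the ROCm backend reuses PyTorch's existing CUDA-style device interface, typical
+scripts need no source changes. As a small sanity check (not a complete verification
+procedure), a ROCm build can be identified at runtime like this:
+
+.. code-block:: python
+
+   import torch
+
+   # On a ROCm build, torch.version.hip is a version string; on CUDA builds it is None.
+   print(torch.version.hip)
+   print(torch.cuda.is_available())           # True when an AMD GPU is visible
+   if torch.cuda.is_available():
+       print(torch.cuda.get_device_name(0))   # for example, an AMD Instinct accelerator
+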
-ROCm development is aligned with the stable release of PyTorch, while upstream -PyTorch testing uses the stable release of ROCm to maintain consistency. +Version support +-------------------------------------------------------------------------------- + +AMD releases official `ROCm PyTorch Docker images `_ +quarterly alongside new ROCm releases. These images undergo full AMD testing. .. _pytorch-recommendations: @@ -78,7 +80,7 @@ Use cases and recommendations GPU. * The :doc:`Inception with PyTorch documentation ` - describes how PyTorch integrates with ROCm for AI workloads It outlines the + describes how PyTorch integrates with ROCm for AI workloads. It outlines the use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD GPU hardware for training and inference tasks in AI applications. @@ -89,9 +91,8 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts `__ and are the -recommended way to get started with deep learning with PyTorch on ROCm. +AMD validates and publishes `PyTorch images `__ +with ROCm backends on Docker Hub. To find the right image tag, see the :ref:`PyTorch on ROCm installation documentation ` for a list of diff --git a/docs/compatibility/ml-compatibility/ray-compatibility.rst b/docs/compatibility/ml-compatibility/ray-compatibility.rst index 2f5c83589..428d750b3 100644 --- a/docs/compatibility/ml-compatibility/ray-compatibility.rst +++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst @@ -1,8 +1,8 @@ :orphan: .. meta:: - :description: Ray deep learning framework compatibility - :keywords: GPU, Ray compatibility + :description: Ray compatibility + :keywords: GPU, Ray, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -19,36 +19,35 @@ simplifying machine learning computations. Ray is a general-purpose framework that runs many types of workloads efficiently. Any Python application can be scaled with Ray, without extra infrastructure. -ROCm support for Ray is upstreamed, and you can build the official source code -with ROCm support: - -- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray - `_ repository. - -- Due to independent compatibility considerations, this location differs from the - `https://github.com/ray-project/ray `_ upstream repository. - -- To install Ray, use the prebuilt :ref:`Docker image ` - which includes ROCm, Ray, and all required dependencies. - - - See the :doc:`ROCm Ray installation guide ` - for instructions to get started. - - - See the `Installation section `_ - in the upstream Ray documentation. - - - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels `__ - corresponding to commit `005c372 `__. - -.. note:: - - Ray is supported on ROCm 6.4.1. - -Supported devices +Support overview ================================================================================ -**Officially Supported**: AMD Instinct™ MI300X, MI210 +- The ROCm-supported version of Ray is maintained in the official `https://github.com/ROCm/ray + `__ repository, which differs from the + `https://github.com/ray-project/ray `__ upstream repository. +- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image `, + which includes ROCm, Ray, and all required dependencies. + + - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels + `__ + corresponding to commit `005c372 `__. + + - See the :doc:`ROCm Ray installation guide ` + for installation and setup instructions. 
+ + - You can also consult the upstream `Installation guide `__ + for additional context. + +Version support +-------------------------------------------------------------------------------- + +Ray is supported on `ROCm 6.4.1 `__. + +Supported devices +-------------------------------------------------------------------------------- + +**Officially Supported**: AMD Instinct™ MI300X, MI210 Use cases and recommendations ================================================================================ @@ -88,15 +87,15 @@ Docker image compatibility AMD validates and publishes ready-made `ROCm Ray Docker images `__ with ROCm backends on Docker Hub. The following Docker image tags and -associated inventories represent the latest Ray version from the official Docker Hub and are validated for -`ROCm 6.4.1 `_. Click the |docker-icon| -icon to view the image on Docker Hub. +associated inventories represent the latest Ray version from the official Docker Hub. +Click the |docker-icon| icon to view the image on Docker Hub. .. list-table:: :header-rows: 1 :class: docker-image-compatibility * - Docker image + - ROCm - Ray - Pytorch - Ubuntu @@ -105,6 +104,7 @@ icon to view the image on Docker Hub. * - .. raw:: html rocm/ray + - `6.4.1 `__. - `2.48.0.post0 `_ - 2.6.0+git684f6f2 - 24.04 diff --git a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst index 1550a82d1..f3e2badb7 100644 --- a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst +++ b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst @@ -2,7 +2,7 @@ .. meta:: :description: Stanford Megatron-LM compatibility - :keywords: Stanford, Megatron-LM, compatibility + :keywords: Stanford, Megatron-LM, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -10,34 +10,50 @@ Stanford Megatron-LM compatibility ******************************************************************************** -Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA `https://github.com/NVIDIA/Megatron-LM `_. It is -designed to train massive transformer-based language models efficiently by model and data parallelism. +Stanford Megatron-LM is a large-scale language model training framework developed +by NVIDIA at `https://github.com/NVIDIA/Megatron-LM `_. +It is designed to train massive transformer-based language models efficiently by model +and data parallelism. -* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM `_ repository. -* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM `_ upstream repository. -* Use the prebuilt :ref:`Docker image ` with ROCm, PyTorch, and Megatron-LM preinstalled. -* See the :doc:`ROCm Stanford Megatron-LM installation guide ` to install and get started. +It provides efficient tensor, pipeline, and sequence-based model parallelism for +pre-training transformer-based language models such as GPT (Decoder Only), BERT +(Encoder Only), and T5 (Encoder-Decoder). -.. note:: - - Stanford Megatron-LM is supported on ROCm 6.3.0. 
- - -Supported Devices +Support overview ================================================================================ -- **Officially Supported**: AMD Instinct MI300X -- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210 +- The ROCm-supported version of Stanford Megatron-LM is maintained in the official `https://github.com/ROCm/Stanford-Megatron-LM + `__ repository, which differs from the + `https://github.com/stanford-futuredata/Megatron-LM `__ upstream repository. +- To get started and install Stanford Megatron-LM on ROCm, use the prebuilt :ref:`Docker image `, + which includes ROCm, Stanford Megatron-LM, and all required dependencies. + + - See the :doc:`ROCm Stanford Megatron-LM installation guide ` + for installation and setup instructions. + + - You can also consult the upstream `Installation guide `__ + for additional context. + +Version support +-------------------------------------------------------------------------------- + +Stanford Megatron-LM is supported on `ROCm 6.3.0 `__. + +Supported devices +-------------------------------------------------------------------------------- + +- **Officially Supported**: AMD Instinct™ MI300X +- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210 Supported models and features -================================================================================ +-------------------------------------------------------------------------------- This section details models & features that are supported by the ROCm version on Stanford Megatron-LM. Models: -* Bert +* BERT * GPT * T5 * ICT @@ -54,13 +70,24 @@ Features: Use cases and recommendations ================================================================================ -See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog `_ post -to leverage the ROCm platform for pre-training by using the Stanford Megatron-LM framework of pre-processing datasets on AMD GPUs. -Coverage includes: +The following blog post mentions Megablocks, but you can run Stanford Megatron-LM with the same steps to pre-process datasets on AMD GPUs: - * Single-GPU pre-training - * Multi-GPU pre-training +* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs + `__ + blog post guides how to leverage the ROCm platform for pre-training using the + Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts + (MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it + demonstrates how block-sparse computations can enhance scalability and efficiency in MoE + training. The guide provides step-by-step instructions for setting up the environment, + including cloning the repository, building the Docker image, and running the training container. + Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training + language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE + training workflows for large-scale transformer models. +It features how to pre-process datasets and how to begin pre-training on AMD GPUs through: + +* Single-GPU pre-training +* Multi-GPU pre-training .. _megatron-lm-docker-compat: @@ -71,10 +98,9 @@ Docker image compatibility -AMD validates and publishes `Stanford Megatron-LM images `_ +AMD validates and publishes `Stanford Megatron-LM images `_ with ROCm and Pytorch backends on Docker Hub. 
The following Docker image tags and associated -inventories represent the latest Megatron-LM version from the official Docker Hub. -The Docker images have been validated for `ROCm 6.3.0 `_. +inventories represent the latest Stanford Megatron-LM version from the official Docker Hub. Click |docker-icon| to view the image on Docker Hub. .. list-table:: @@ -82,6 +108,7 @@ Click |docker-icon| to view the image on Docker Hub. :class: docker-image-compatibility * - Docker image + - ROCm - Stanford Megatron-LM - PyTorch - Ubuntu @@ -91,6 +118,7 @@ Click |docker-icon| to view the image on Docker Hub. + - `6.3.0 `_ - `85f95ae `_ - `2.4.0 `_ - 24.04 diff --git a/docs/compatibility/ml-compatibility/taichi-compatibility.rst b/docs/compatibility/ml-compatibility/taichi-compatibility.rst index 5fb2b9708..3da4a3776 100644 --- a/docs/compatibility/ml-compatibility/taichi-compatibility.rst +++ b/docs/compatibility/ml-compatibility/taichi-compatibility.rst @@ -2,7 +2,7 @@ .. meta:: :description: Taichi compatibility - :keywords: GPU, Taichi compatibility + :keywords: GPU, Taichi, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -19,28 +19,52 @@ Taichi is widely used across various domains, including real-time physical simul numerical computing, augmented reality, artificial intelligence, computer vision, robotics, visual effects in film and gaming, and general-purpose computing. -* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi `_ repository. -* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev `_ upstream repository. -* Use the prebuilt :ref:`Docker image ` with ROCm, PyTorch, and Taichi preinstalled. -* See the :doc:`ROCm Taichi installation guide ` to install and get started. +Support overview +================================================================================ -.. note:: +- The ROCm-supported version of Taichi is maintained in the official `https://github.com/ROCm/taichi + `__ repository, which differs from the + `https://github.com/taichi-dev/taichi `__ upstream repository. - Taichi is supported on ROCm 6.3.2. +- To get started and install Taichi on ROCm, use the prebuilt :ref:`Docker image `, + which includes ROCm, Taichi, and all required dependencies. -Supported devices and features -=============================================================================== -There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X Series GPUs with the exception of Taichi’s GPU rendering system, CGUI. -AMD Instinct MI300X Series GPUs will be supported by November. + - See the :doc:`ROCm Taichi installation guide ` + for installation and setup instructions. + + - You can also consult the upstream `Installation guide `__ + for additional context. + +Version support +-------------------------------------------------------------------------------- + +Taichi is supported on `ROCm 6.3.2 `__. + +Supported devices +-------------------------------------------------------------------------------- + +- **Officially Supported**: AMD Instinct™ MI250X, MI210X (with the exception of Taichi’s GPU rendering system, CGUI) +- **Upcoming Support**: AMD Instinct™ MI300X .. 
_taichi-recommendations: Use cases and recommendations ================================================================================ -To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators. -A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples `_ repository, -providing practical insights and foundational knowledge for working with the Taichi programming language. -You can also refer to the `AMD ROCm blog `_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs. + +* The `Accelerating Parallel Programming in Python with Taichi Lang on AMD GPUs + `__ + blog highlights Taichi as an open-source programming language designed for high-performance + numerical computation, particularly in domains like real-time physical simulation, + artificial intelligence, computer vision, robotics, and visual effects. Taichi + is embedded in Python and uses just-in-time (JIT) compilation frameworks like + LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility + of Taichi in enabling complex simulations and numerical algorithms, making + it ideal for developers working on compute-intensive tasks. Developers are + encouraged to follow recommended coding patterns and utilize Taichi decorators + for performance optimization, with examples available in the `https://github.com/ROCm/taichi_examples + `_ repository. Prebuilt Docker images + integrating ROCm, PyTorch, and Taichi are provided for simplified installation + and deployment, making it easier to leverage Taichi for advanced computational workloads. .. _taichi-docker-compat: @@ -52,9 +76,8 @@ Docker image compatibility AMD validates and publishes ready-made `ROCm Taichi Docker images `_ -with ROCm backends on Docker Hub. The following Docker image tags and associated inventories +with ROCm backends on Docker Hub. The following Docker image tag and associated inventories represent the latest Taichi version from the official Docker Hub. -The Docker images have been validated for `ROCm 6.3.2 `_. Click |docker-icon| to view the image on Docker Hub. .. list-table:: diff --git a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst index 4f48d9f4c..485980d13 100644 --- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst +++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst @@ -2,7 +2,7 @@ .. meta:: :description: TensorFlow compatibility - :keywords: GPU, TensorFlow compatibility + :keywords: GPU, TensorFlow, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -12,37 +12,33 @@ TensorFlow compatibility `TensorFlow `__ is an open-source library for solving machine learning, deep learning, and AI problems. It can solve many -problems across different sectors and industries but primarily focuses on -neural network training and inference. It is one of the most popular and -in-demand frameworks and is very active in open-source contribution and -development. +problems across different sectors and industries, but primarily focuses on +neural network training and inference. It is one of the most popular deep +learning frameworks and is very active in open-source development. 
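+
+On a ROCm build of TensorFlow, AMD GPUs are exposed through the standard TensorFlow
+device APIs, so existing scripts usually need no changes. A minimal check (illustrative
+only) looks like this:
+
+.. code-block:: python
+
+   import tensorflow as tf
+
+   # Lists the AMD GPUs visible to the ROCm TensorFlow build.
+   print(tf.config.list_physical_devices("GPU"))
+
+   # Place a small computation explicitly on the first GPU, if one is present.
+   if tf.config.list_physical_devices("GPU"):
+       with tf.device("/GPU:0"):
+           x = tf.random.uniform((2, 2))
+           print(tf.matmul(x, x))
+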
+ +Support overview +================================================================================ + +- The ROCm-supported version of TensorFlow is maintained in the official `https://github.com/ROCm/tensorflow-upstream + `__ repository, which differs from the + `https://github.com/tensorflow/tensorflow `__ upstream repository. + +- To get started and install TensorFlow on ROCm, use the prebuilt :ref:`Docker images `, + which include ROCm, TensorFlow, and all required dependencies. + + - See the :doc:`ROCm TensorFlow installation guide ` + for installation and setup instructions. + + - You can also consult the `TensorFlow API versions `__ list + for additional context. + +Version support +-------------------------------------------------------------------------------- The `official TensorFlow repository `__ includes full ROCm support. AMD maintains a TensorFlow `ROCm repository `__ in order to quickly add bug -fixes, updates, and support for the latest ROCM versions. - -- ROCm TensorFlow release: - - - Offers :ref:`Docker images ` with - ROCm and TensorFlow pre-installed. - - - ROCm TensorFlow repository: ``__ - - - See the :doc:`ROCm TensorFlow installation guide ` - to get started. - -- Official TensorFlow release: - - - Official TensorFlow repository: ``__ - - - See the `TensorFlow API versions `__ list. - - .. note:: - - The official TensorFlow documentation does not cover ROCm support. Use the - ROCm documentation for installation instructions for Tensorflow on ROCm. - See :doc:`rocm-install-on-linux:install/3rd-party/tensorflow-install`. +fixes, updates, and support for the latest ROCm versions. .. _tensorflow-docker-compat: diff --git a/docs/compatibility/ml-compatibility/verl-compatibility.rst b/docs/compatibility/ml-compatibility/verl-compatibility.rst index 0351384e5..d4936a0ec 100644 --- a/docs/compatibility/ml-compatibility/verl-compatibility.rst +++ b/docs/compatibility/ml-compatibility/verl-compatibility.rst @@ -2,7 +2,7 @@ .. meta:: :description: verl compatibility - :keywords: GPU, verl compatibility + :keywords: GPU, verl, deep learning, framework compatibility .. version-set:: rocm_version latest @@ -10,24 +10,58 @@ verl compatibility ******************************************************************************* -Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs). -verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support. +Volcano Engine Reinforcement Learning for LLMs (`verl `__) +is a reinforcement learning framework designed for large language models (LLMs). +verl offers a scalable, open-source fine-tuning solution by using a hybrid programming model +that makes it easy to define and run complex post-training dataflows efficiently. -* See the `verl documentation `_ for more information about verl. -* The official verl GitHub repository is `https://github.com/volcengine/verl `_. -* Use the AMD-validated :ref:`Docker images ` with ROCm and verl preinstalled. -* See the :doc:`ROCm verl installation guide ` to install and get started. +Its modular APIs separate computation from data, allowing smooth integration with other frameworks. +It also supports flexible model placement across GPUs for efficient scaling on different cluster sizes. +verl achieves high training and generation throughput by building on existing LLM frameworks. 
+Its 3D-HybridEngine reduces memory use and communication overhead when switching between training +and inference, improving overall performance. -.. note:: +Support overview +================================================================================ - verl is supported on ROCm 6.2.0. +- The ROCm-supported version of verl is maintained in the official `https://github.com/ROCm/verl + `__ repository, which differs from the + `https://github.com/volcengine/verl `__ upstream repository. + +- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image `, + which includes ROCm, verl, and all required dependencies. + + - See the :doc:`ROCm verl installation guide ` + for installation and setup instructions. + + - You can also consult the upstream `verl documentation `__ + for additional context. + +Version support +-------------------------------------------------------------------------------- + +verl is supported on `ROCm 6.2.0 `__. + +Supported devices +-------------------------------------------------------------------------------- + +**Officially Supported**: AMD Instinct™ MI300X .. _verl-recommendations: Use cases and recommendations ================================================================================ -The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration `_ blog. +* The benefits of verl in large-scale reinforcement learning from human feedback + (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD + GPUs with verl and ROCm Integration `__ + blog. The blog post outlines how the Volcano Engine Reinforcement Learning + (verl) framework integrates with the AMD ROCm platform to optimize training on + Instinct™ MI300X GPUs. The guide details the process of building a Docker image, + setting up single-node and multi-node training environments, and highlights + performance benchmarks demonstrating improved throughput and convergence accuracy. + This resource serves as a comprehensive starting point for deploying verl on AMD GPUs, + facilitating efficient RLHF training workflows. .. _verl-supported_features: @@ -61,8 +95,10 @@ Docker image compatibility -AMD validates and publishes ready-made `ROCm verl Docker images `_ -with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the available verl versions from the official Docker Hub. +AMD validates and publishes ready-made `verl Docker images `_ +with ROCm backends on Docker Hub. The following Docker image tag and associated inventories +represent the latest verl version from the official Docker Hub. +Click |docker-icon| to view the image on Docker Hub. .. list-table:: :header-rows: 1 From cb8d21a0df6069a7b665319f824b0e95703b2c40 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Wed, 22 Oct 2025 12:54:25 -0400 Subject: [PATCH 06/15] Updates to the vLLM optimization guide for MI300X/MI355X (#5554) * Expand vLLM optimization guide for MI300X/MI355X with comprehensive AITER coverage. attention backend selection, environment variables (HIP/RCCL/Quick Reduce), parallelism strategies, quantization (FP8/FP4), engine tuning, CUDA graph modes, and multi-node scaling. 
Co-authored-by: PinSiang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: pinsiangamd Co-authored-by: Jeffrey Novotny --- .wordlist.txt | 12 + .../vllm-optimization.rst | 1139 +++++++++++++++++ .../inference-optimization/workload.rst | 483 +------ docs/sphinx/_toc.yml.in | 2 + 4 files changed, 1208 insertions(+), 428 deletions(-) create mode 100644 docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst diff --git a/.wordlist.txt b/.wordlist.txt index 294016553..aed9dc1cc 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -27,6 +27,7 @@ ASICs ASan ASAN ASm +Async ATI atomicRMW AddressSanitizer @@ -133,6 +134,7 @@ ELMo ENDPGM EPYC ESXi +EP EoS etcd fas @@ -184,6 +186,7 @@ GPR GPT GPU GPU's +GPUDirect GPUs GraphBolt GraphSage @@ -302,6 +305,7 @@ Makefiles Matplotlib Matrox MaxText +MBT Megablocks Megatrends Megatron @@ -311,6 +315,7 @@ Meta's Miniconda MirroredStrategy Mixtral +MLA MosaicML MoEs Mooncake @@ -353,6 +358,7 @@ OFED OMM OMP OMPI +OOM OMPT OMPX ONNX @@ -398,6 +404,7 @@ Profiler's PyPi Pytest PyTorch +QPS Qcycles Qwen RAII @@ -673,6 +680,7 @@ denoised denoises denormalize dequantization +dequantized dequantizes deserializers detections @@ -788,6 +796,7 @@ linalg linearized linter linux +llm llvm lm localscratch @@ -838,6 +847,7 @@ passthrough pe perfcounter performant +piecewise perl pragma pre @@ -984,6 +994,7 @@ tokenizer tokenizes toolchain toolchains +topk toolset toolsets torchtitan @@ -1011,6 +1022,7 @@ USM UTCL UTIL utils +UX vL variational vdi diff --git a/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst b/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst new file mode 100644 index 000000000..d1e0f96da --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst @@ -0,0 +1,1139 @@ +.. meta:: + :description: Learn about vLLM V1 inference tuning on AMD Instinct GPUs for optimal performance. + :keywords: AMD, Instinct, MI300X, HPC, tuning, BIOS settings, NBIO, ROCm, + environment variable, performance, HIP, Triton, PyTorch TunableOp, vLLM, RCCL, + MIOpen, GPU, resource utilization + +.. _mi300x-vllm-optimization: +.. _vllm-optimization: + +******************************** +vLLM V1 performance optimization +******************************** + +This guide helps you maximize vLLM throughput and minimize latency on AMD +Instinct MI300X, MI325X, MI350X, and MI355X GPUs. Learn how to: + +* Enable AITER (AI Tensor Engine for ROCm) for speedups on LLM models. +* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance. +* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton). +* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments. +* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss. +* Tune engine arguments (batch size, memory utilization, graph modes) for your use case. +* Benchmark and scale across single-node and multi-node configurations. + +Performance environment variables +================================= + +The following variables are generally useful for Instinct MI300X/MI355X GPUs and vLLM: + +* **HIP and math libraries** + + * ``export HIP_FORCE_DEV_KERNARG=1`` — improves kernel launch performance by + forcing device kernel arguments. This is already set by default in + :doc:`vLLM ROCm Docker images + `. Bare-metal users + should set this manually. 
+ * ``export TORCH_BLAS_PREFER_HIPBLASLT=1`` — explicitly prefers hipBLASLt + over hipBLAS for GEMM operations. By default, PyTorch uses heuristics to + choose the best BLAS library. Setting this can improve linear layer + performance in some workloads. + +* **RCCL (collectives for multi-GPU)** + + * ``export NCCL_MIN_NCHANNELS=112`` — increases RCCL channels from default + (typically 32-64) to 112 on the Instinct MI300X. **Only beneficial for + multi-GPU distributed workloads** (tensor parallelism, pipeline + parallelism). Single-GPU inference does not need this. + +AITER (AI Tensor Engine for ROCm) switches +========================================== + +AITER (AI Tensor Engine for ROCm) provides ROCm-specific fused kernels optimized for Instinct MI350 Series and MI300X GPUs in vLLM V1. + +How AITER flags work: + +* ``VLLM_ROCM_USE_AITER`` is the master switch (defaults to ``False``/``0``). +* Individual feature flags (``VLLM_ROCM_USE_AITER_LINEAR``, ``VLLM_ROCM_USE_AITER_MOE``, and so on) default to ``True`` but only activate when the master switch is enabled. +* To enable a specific AITER feature, you must set both ``VLLM_ROCM_USE_AITER=1`` and the specific feature flag to ``1``. + +Quick start examples: + +.. code-block:: bash + + # Enable all AITER optimizations (recommended for most workloads) + export VLLM_ROCM_USE_AITER=1 + vllm serve MODEL_NAME + + # Enable only AITER Triton Prefill-Decode (split) attention + export VLLM_ROCM_USE_AITER=1 + export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 + export VLLM_ROCM_USE_AITER_MHA=0 + vllm serve MODEL_NAME + + # Disable AITER entirely (i.e, use vLLM Triton Unified Attention Kernel) + export VLLM_ROCM_USE_AITER=0 + vllm serve MODEL_NAME + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Environment variable + - Description (default behavior) + + * - ``VLLM_ROCM_USE_AITER`` + - Master switch to enable AITER kernels (``0``/``False`` by default). All other ``VLLM_ROCM_USE_AITER_*`` flags require this to be set to ``1``. + + * - ``VLLM_ROCM_USE_AITER_LINEAR`` + - Use AITER quantization operators + GEMM for linear layers (defaults to ``True`` when AITER is on). Accelerates matrix multiplications in all transformer layers. **Recommended to keep enabled**. + + * - ``VLLM_ROCM_USE_AITER_MOE`` + - Use AITER fused-MoE kernels (defaults to ``True`` when AITER is on). Accelerates Mixture-of-Experts routing and computation. See the note on :ref:`AITER MoE requirements `. + + * - ``VLLM_ROCM_USE_AITER_RMSNORM`` + - Use AITER RMSNorm kernels (defaults to ``True`` when AITER is on). Accelerates normalization layers. **Recommended: keep enabled.** + + * - ``VLLM_ROCM_USE_AITER_MLA`` + - Use AITER Multi-head Latent Attention for supported models, for example, DeepSeek-V3/R1 (defaults to ``True`` when AITER is on). See the section on :ref:`AITER MLA requirements `. + + * - ``VLLM_ROCM_USE_AITER_MHA`` + - Use AITER Multi-Head Attention kernels (defaults to ``True`` when AITER is on; set to ``0`` to use Triton attention backends and Prefill-Decode attention backend instead). See :ref:`attention backend selection `. + + * - ``VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION`` + - Enable AITER's optimized unified attention kernel (defaults to ``False``). Only takes effect when: AITER is enabled; unified attention mode is active (``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0``); and AITER MHA is disabled (``VLLM_ROCM_USE_AITER_MHA=0``). When disabled, falls back to vLLM's Triton unified attention. 
+ + * - ``VLLM_ROCM_USE_AITER_FP8BMM`` + - Use AITER ``FP8`` batched matmul (defaults to ``True`` when AITER is on). Fuses ``FP8`` per-token quantization with batched GEMM (used in MLA models like DeepSeek-V3). Requires an Instinct MI300X/MI355X GPU. + + * - ``VLLM_ROCM_USE_SKINNY_GEMM`` + - Prefer skinny-GEMM kernel variants for small batch sizes (defaults to ``True``). Improves performance when ``M`` dimension is small. **Recommended to keep enabled**. + + * - ``VLLM_ROCM_FP8_PADDING`` + - Pad ``FP8`` linear weight tensors to improve memory locality (defaults to ``True``). Minor memory overhead for better performance. + + * - ``VLLM_ROCM_MOE_PADDING`` + - Pad MoE weight tensors for better memory access patterns (defaults to ``True``). Same memory/performance tradeoff as ``FP8`` padding. + + * - ``VLLM_ROCM_CUSTOM_PAGED_ATTN`` + - Use custom paged-attention decode kernel when Prefill-Decode attention backend is selected (defaults to ``True``). See :ref:`Attention backend selection with AITER `. + +.. note:: + + When ``VLLM_ROCM_USE_AITER=1``, most AITER component flags (``LINEAR``, + ``MOE``, ``RMSNORM``, ``MLA``, ``MHA``, ``FP8BMM``) automatically default to + ``True``. You typically only need to set the master switch + ``VLLM_ROCM_USE_AITER=1`` to enable all optimizations. ROCm provides a + prebuilt optimized Docker image for validating the performance of LLM + inference with vLLM on MI300X Series GPUs. The Docker image includes ROCm, + vLLM, and PyTorch. For more information, see + :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`. + +.. _vllm-optimization-aiter-moe-requirements: + +AITER MoE requirements (Mixtral, DeepSeek-V2/V3, Qwen-MoE models) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``VLLM_ROCM_USE_AITER_MOE`` enables AITER's optimized Mixture-of-Experts kernels, such as expert routing (topk selection) and expert computation for better performance. + +Applicable models: + +* Mixtral series: for example, Mixtral-8x7B / Mixtral-8x22B +* Llama-4 family: for example, Llama-4-Scout-17B-16E / Llama-4-Maverick-17B-128E +* DeepSeek family: DeepSeek-V2 / DeepSeek-V3 / DeepSeek-R1 +* Qwen family: Qwen1.5-MoE / Qwen2-MoE / Qwen2.5-MoE series +* Other MoE architectures + +When to enable: + +* **Enable (default):** For all MoE models on the Instinct MI300X/MI355X for best throughput +* **Disable:** Only for debugging or if you encounter numerical issues + +Example usage: + +.. code-block:: bash + + # Standard MoE model (Mixtral) + VLLM_ROCM_USE_AITER=1 vllm serve mistralai/Mixtral-8x7B-Instruct-v0.1 + + # Hybrid MoE+MLA model (DeepSeek-V3) - requires both MOE and MLA flags + VLLM_ROCM_USE_AITER=1 vllm serve deepseek-ai/DeepSeek-V3 \ + --block-size 1 \ + --tensor-parallel-size 8 + +.. _vllm-optimization-aiter-mla-requirements: + +AITER MLA requirements (DeepSeek-V3/R1 models) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``VLLM_ROCM_USE_AITER_MLA`` enables AITER MLA (Multi-head Latent Attention) optimization for supported models. Defaults to **True** when AITER is on. + +Critical requirement: + +* **Must** explicitly set ``--block-size 1`` + +.. important:: + + If you omit ``--block-size 1``, vLLM will raise an error rather than defaulting to 1. + +Applicable models: + +* DeepSeek-V3 / DeepSeek-R1 +* DeepSeek-V2 +* Other models using multi-head latent attention (MLA) architecture + +Example usage: + +.. 
code-block:: bash + + # DeepSeek-R1 with AITER MLA (requires 8 GPUs) + VLLM_ROCM_USE_AITER=1 vllm serve deepseek-ai/DeepSeek-R1 \ + --block-size 1 \ + --tensor-parallel-size 8 + +.. _vllm-optimization-aiter-backend-selection: + +Attention backend selection with AITER +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Understanding which attention backend to use helps optimize your deployment. + +Quick reference: Which attention backend will I get? + +Default behavior (no configuration) + +Without setting any environment variables, vLLM uses: + +* **vLLM Triton Unified Attention** — A single Triton kernel handling both prefill and decode phases +* Works on all ROCm platforms +* Good baseline performance + +**Recommended**: Enable AITER (set ``VLLM_ROCM_USE_AITER=1``) + +When you enable AITER, the backend is automatically selected based on your model: + +.. code-block:: text + + Is your model using MLA architecture? (DeepSeek-V3/R1/V2) + ├─ YES → AITER MLA Backend + │ • Requires --block-size 1 + │ • Best performance for MLA models + │ • Automatically selected + │ + └─ NO → AITER MHA Backend + • For standard transformer models (Llama, Mistral, etc.) + • Optimized for Instinct MI300X/MI355X + • Automatically selected + +**Advanced**: Manual backend selection + +Most users won't need this, but you can override the defaults: + +.. list-table:: + :widths: 40 60 + :header-rows: 1 + + * - To use this backend + - Set these flags + + * - AITER MLA (MLA models only) + - ``VLLM_ROCM_USE_AITER=1`` (auto-selects for DeepSeek-V3/R1) + + * - AITER MHA (standard models) + - ``VLLM_ROCM_USE_AITER=1`` (auto-selects for non-MLA models) + + * - AITER Triton Prefill-Decode (split) + - | ``VLLM_ROCM_USE_AITER=1`` + | ``VLLM_ROCM_USE_AITER_MHA=0`` + | ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` + + * - vLLM Triton Unified (default) + - ``VLLM_ROCM_USE_AITER=0`` (or unset) + + * - AITER Unified Attention + - | ``VLLM_ROCM_USE_AITER=1`` + | ``VLLM_ROCM_USE_AITER_MHA=0`` + | ``VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1`` + +**Quick start examples**: + +.. code-block:: bash + + # Recommended: Standard model with AITER (Llama, Mistral, Qwen, etc.) + VLLM_ROCM_USE_AITER=1 vllm serve meta-llama/Llama-3.3-70B-Instruct + + # MLA model with AITER (DeepSeek-V3/R1) + VLLM_ROCM_USE_AITER=1 vllm serve deepseek-ai/DeepSeek-R1 \ + --block-size 1 \ + --tensor-parallel-size 8 + + # Advanced: Use Prefill-Decode split (for short input cases) + VLLM_ROCM_USE_AITER=1 \ + VLLM_ROCM_USE_AITER_MHA=0 \ + VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 \ + vllm serve meta-llama/Llama-3.3-70B-Instruct + +**Which backend should I choose?** + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Your use case + - Recommended backend + + * - **Standard transformer models** (Llama, Mistral, Qwen, Mixtral) + - **AITER MHA** (``VLLM_ROCM_USE_AITER=1``) — **Recommended for most workloads** on Instinct MI300X/MI355X. Provides optimized attention kernels for both prefill and decode phases. 
+ + * - **MLA models** (DeepSeek-V3/R1/V2) + - **AITER MLA** (auto-selected with ``VLLM_ROCM_USE_AITER=1``) — Required for optimal performance, must use ``--block-size 1`` + + * - **gpt-oss models** (gpt-oss-120b/20b) + - **AITER Unified Attention** (``VLLM_ROCM_USE_AITER=1``, ``VLLM_ROCM_USE_AITER_MHA=0``, ``VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1``) — Required for optimal performance + + * - **Debugging or compatibility** + - **vLLM Triton Unified** (default with ``VLLM_ROCM_USE_AITER=0``) — Generic fallback, works everywhere + +**Important notes:** + +* **AITER MHA and AITER MLA are mutually exclusive** — vLLM automatically detects MLA models and selects the appropriate backend +* **For 95% of users:** Simply set ``VLLM_ROCM_USE_AITER=1`` and let vLLM choose the right backend +* When in doubt, start with AITER enabled (the recommended configuration) and profile your specific workload + +Backend choice quick recipes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* **Standard transformers (any prompt length):** Start with ``VLLM_ROCM_USE_AITER=1`` → AITER MHA. For CUDA graph modes, see architecture-specific guidance below (Dense vs MoE models have different optimal modes). +* **Latency-sensitive chat (low TTFT):** keep ``--max-num-batched-tokens`` ≤ **8k–16k** with AITER. +* **Streaming decode (low ITL):** raise ``--max-num-batched-tokens`` to **32k–64k**. +* **Offline max throughput:** ``--max-num-batched-tokens`` ≥ **32k** with ``cudagraph_mode=FULL``. + +**How to verify which backend is active** + +Check vLLM's startup logs to confirm which attention backend is being used: + +.. code-block:: bash + + # Start vLLM and check logs + VLLM_ROCM_USE_AITER=1 vllm serve meta-llama/Llama-3.3-70B-Instruct 2>&1 | grep -i attention + +**Expected log messages:** + +* AITER MHA: ``Using Aiter Flash Attention backend on V1 engine.`` +* AITER MLA: ``Using AITER MLA backend on V1 engine.`` +* vLLM Triton MLA: ``Using Triton MLA backend on V1 engine.`` +* vLLM Triton Unified: ``Using Triton Attention backend on V1 engine.`` +* AITER Triton Unified: ``Using Aiter Unified Attention backend on V1 engine.`` +* AITER Triton Prefill-Decode: ``Using Rocm Attention backend on V1 engine.`` + +Attention backend technical details +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This section provides technical details about vLLM's attention backends on ROCm. + +vLLM V1 on ROCm provides these attention implementations: + +1. **vLLM Triton Unified Attention** (default when AITER is **off**) + + * Single unified Triton kernel handling both chunked prefill and decode phases + * Generic implementation that works across all ROCm platforms + * Good baseline performance + * Automatically selected when ``VLLM_ROCM_USE_AITER=0`` (or unset) + * Supports GPT-OSS + +2. **AITER Triton Unified Attention** (advanced, requires manual configuration) + + * The AMD optimized unified Triton kernel + * Enable with ``VLLM_ROCM_USE_AITER=1``, ``VLLM_ROCM_USE_AITER_MHA=0``, and ``VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1``. + * Only useful for specific workloads. Most users should use AITER MHA instead. + * Recommended this backend when running GPT-OSS. + +3. 
**AITER Triton Prefill–Decode Attention** (hybrid, Instinct MI300X-optimized) + + * Enable with ``VLLM_ROCM_USE_AITER=1``, ``VLLM_ROCM_USE_AITER_MHA=0``, and ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` + * Uses separate kernels for prefill and decode phases: + + * **Prefill**: ``context_attention_fwd`` Triton kernel + * **Primary decode**: ``torch.ops._rocm_C.paged_attention`` (custom ROCm kernel optimized for head sizes 64/128, block sizes 16/32, GQA 1–16, context ≤131k; sliding window not supported) + * **Fallback decode**: ``kernel_paged_attention_2d`` Triton kernel when shapes don't meet primary decode requirements + + * Usually better compared to unified Triton kernels (both vLLM and AITER variants) + * Performance vs AITER MHA varies: AITER MHA is typically faster overall, but Prefill-Decode split may win in short input scenarios + * The custom paged attention decode kernel is controlled by ``VLLM_ROCM_CUSTOM_PAGED_ATTN`` (default **True**) + +4. **AITER Multi-Head Attention (MHA)** (default when AITER is **on**) + + * Controlled by ``VLLM_ROCM_USE_AITER_MHA`` (**1** = enabled) + * Best all-around performance for standard transformer models + * Automatically selected when ``VLLM_ROCM_USE_AITER=1`` and model is not MLA + +5. **vLLM Triton Multi-head Latent Attention (MLA)** (for DeepSeek-V3/R1/V2) + + * Automatically selected when ``VLLM_ROCM_USE_AITER=0`` (or unset) + +6. **AITER Multi-head Latent Attention (MLA)** (for DeepSeek-V3/R1/V2) + + * Controlled by ``VLLM_ROCM_USE_AITER_MLA`` (``1`` = enabled) + * Required for optimal performance on MLA architecture models + * Automatically selected when ``VLLM_ROCM_USE_AITER=1`` and model uses MLA + * Requires ``--block-size 1`` + +Quick Reduce (large all-reduces on ROCm) +======================================== + +**Quick Reduce** is an alternative to RCCL/custom all-reduce for **large** inputs (MI300-class GPUs). +It supports FP16/BF16 as well as symmetric INT8/INT6/INT4 quantized all-reduce (group size 32). + +.. warning:: + + Quantization can affect accuracy. Validate quality before deploying. + +Control via: + +* ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION`` ∈ ``["NONE","FP","INT8","INT6","INT4"]`` (default ``NONE``). +* ``VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16``: cast BF16 input to FP16 (``1/True`` by default for performance). +* ``VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB``: cap the preset buffer (default ``NONE`` ≈ ``2048`` MB). + +Quick Reduce tends to help **throughput** at higher TP counts (for example, 4–8) with many concurrent requests. + +Parallelism strategies (run vLLM on multiple GPUs) +================================================== + +vLLM supports the following parallelism strategies: + +1. Tensor parallelism +2. Pipeline parallelism +3. Data parallelism +4. Expert parallelism + +For more details, see `Parallelism and scaling `_. + +**Choosing the right strategy:** + +* **Tensor Parallelism (TP)**: Use when model doesn't fit on one GPU. Prefer staying within a single XGMI island (≤8 GPUs on the Instinct MI300X). +* **Pipeline Parallelism (PP)**: Use for very large models across nodes. Set TP to GPUs per node, scale with PP across nodes. +* **Data Parallelism (DP)**: Use when model fits on single GPU or TP group, and you need higher throughput. Combine with TP/PP for large models. +* **Expert Parallelism (EP)**: Use for MoE models with ``--enable-expert-parallel``. More efficient than TP for MoE layers. 
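+
+As an illustration of how these strategies compose, the following minimal sketch
+(using a placeholder model path) serves two replicas on an 8-GPU node by combining
+tensor parallelism within each replica with data parallelism across replicas.
+The ROCm-specific data-parallel settings are the ones described in the data
+parallelism section below; adjust the parallel sizes to your model and hardware.
+
+.. code-block:: bash
+
+    # Illustrative composition: 2 DP replicas × TP=4 = 8 GPUs total
+    VLLM_ALL2ALL_BACKEND="allgather_reducescatter" vllm serve /path/to/model \
+        --data-parallel-size 2 \
+        --tensor-parallel-size 4 \
+        --disable-nccl-for-dp-synchronization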
+
+Tensor parallelism
+^^^^^^^^^^^^^^^^^^
+
+Tensor parallelism splits each layer of the model weights across multiple GPUs when the model doesn't fit on a single GPU. This is primarily a memory-capacity technique.
+
+**Use tensor parallelism when:**
+
+* Model does not fit on one GPU (OOM)
+* Need to enable larger batch sizes by distributing KV cache across GPUs
+
+**Examples:**
+
+.. code-block:: bash
+
+    # Tensor parallelism: Split model across 2 GPUs
+    vllm serve /path/to/model --dtype float16 --tensor-parallel-size 2
+
+    # Two vLLM instances combined with TP, each split across 2 GPUs (4 GPUs total)
+    CUDA_VISIBLE_DEVICES=0,1 vllm serve /path/to/model --dtype float16 --tensor-parallel-size 2 --port 8000
+    CUDA_VISIBLE_DEVICES=2,3 vllm serve /path/to/model --dtype float16 --tensor-parallel-size 2 --port 8001
+
+.. note::
+   **ROCm GPU visibility:** vLLM on ROCm reads ``CUDA_VISIBLE_DEVICES``. Keep ``HIP_VISIBLE_DEVICES`` unset to avoid conflicts.
+
+.. tip::
+   For structured data parallelism deployments with load balancing, see :ref:`data-parallelism-section`.
+
+Pipeline parallelism
+^^^^^^^^^^^^^^^^^^^^
+
+Pipeline parallelism splits the model's layers across multiple GPUs or nodes, with each GPU processing different layers sequentially. This is primarily used for multi-node deployments where the model is too large for a single node.
+
+**Use pipeline parallelism when:**
+
+* Model is too large for a single node (combine PP with TP)
+* GPUs on a node lack a high-speed interconnect (e.g., no NVLink/XGMI); PP may perform better than TP
+* GPU count doesn't evenly divide the model (PP supports uneven splits)
+
+**Common pattern for multi-node:**
+
+.. code-block:: bash
+
+    # 2 nodes × 8 GPUs = 16 GPUs total
+    # TP=8 per node, PP=2 across nodes
+    vllm serve meta-llama/Llama-3.1-405B-Instruct \
+        --tensor-parallel-size 8 \
+        --pipeline-parallel-size 2
+
+.. note::
+   **ROCm best practice**: On the Instinct MI300X, prefer staying within a single XGMI island (≤8 GPUs) using TP only. Use PP when scaling beyond eight GPUs or across nodes.
+
+.. _data-parallelism-section:
+
+Data parallelism
+^^^^^^^^^^^^^^^^
+
+Data parallelism replicates model weights across separate instances/GPUs to process independent batches of requests. This approach increases throughput by distributing the workload across multiple replicas.
+
+**Use data parallelism when:**
+
+* Model fits on one GPU, but you need higher request throughput
+* Scaling across multiple nodes horizontally
+* Combining with tensor parallelism (for example, DP=2 + TP=4 = 8 GPUs total)
+
+**Quick start - single-node:**
+
+.. code-block:: bash
+
+    # Model fits on 1 GPU. Creates 2 model replicas (requires 2 GPUs)
+    VLLM_ALL2ALL_BACKEND="allgather_reducescatter" vllm serve /path/to/model \
+        --data-parallel-size 2 \
+        --disable-nccl-for-dp-synchronization
+
+.. tip::
+   For ROCm, currently use ``VLLM_ALL2ALL_BACKEND="allgather_reducescatter"`` and ``--disable-nccl-for-dp-synchronization`` with data parallelism.
+
+Choosing a load balancing strategy
+"""""""""""""""""""""""""""""""""""
+
+vLLM supports two modes for routing requests to DP ranks:
+
+..
list-table:: + :header-rows: 1 + :widths: 30 35 35 + + * - + - **Internal LB** (recommended) + - **External LB** + * - **HTTP endpoints** + - 1 endpoint, vLLM routes internally + - N endpoints, you provide external router + * - **Single-node config** + - ``--data-parallel-size N`` + - ``--data-parallel-size N --data-parallel-rank 0..N-1`` + different ports + * - **Multi-node config** + - ``--data-parallel-size``, ``--data-parallel-size-local``, ``--data-parallel-address`` + - ``--data-parallel-size N --data-parallel-rank 0..N-1`` + ``--data-parallel-address`` + * - **Client view** + - Single URL/port + - Multiple URLs/ports + * - **Load balancer** + - Built-in (vLLM handles) + - External (Nginx, Kong, K8s Service) + * - **Coordination** + - DP ranks sync via RPC (for MoE/MLA) + - DP ranks sync via RPC (for MoE/MLA) + * - **Best for** + - Most deployments (simpler) + - K8s/cloud environments with existing LB + +.. tip:: + **Dense (non-MoE) models only:** You can run fully independent ``vllm serve`` instances without any DP flags, using your own load balancer. This avoids RPC coordination overhead entirely. + +For more technical details, see `vLLM Data Parallel Deployment `_ + +Data Parallel Attention (advanced) +"""""""""""""""""""""""""""""""""" + +For models with Multi-head Latent Attention (MLA) architecture like DeepSeek V2, V3, and R1, vLLM supports **Data Parallel Attention**, +which provides request-level parallelism instead of model replication. This avoids KV cache duplication across tensor parallel ranks, +significantly reducing memory usage and enabling larger batch sizes. + +**Key benefits for MLA models:** + +* Eliminates KV cache duplication when using tensor parallelism +* Enables higher throughput for high-QPS serving scenarios +* Better memory efficiency for large context windows + +**Usage with Expert Parallelism:** + +Data parallel attention works seamlessly with Expert Parallelism for MoE models: + +.. code-block:: bash + + # DeepSeek-R1 with DP attention and expert parallelism + VLLM_ALL2ALL_BACKEND="allgather_reducescatter" vllm serve deepseek-ai/DeepSeek-R1 \ + --data-parallel-size 8 \ + --enable-expert-parallel \ + --disable-nccl-for-dp-synchronization + +For more technical details, see `vLLM RFC #16037 `_. + +Expert parallelism +^^^^^^^^^^^^^^^^^^ + +Expert parallelism (EP) distributes expert layers of Mixture-of-Experts (MoE) models across multiple GPUs, +where tokens are routed to the GPUs holding the experts they need. + +**Performance considerations:** + +Expert parallelism is designed primarily for cross-node MoE deployments where high-bandwidth interconnects (like InfiniBand) between nodes make EP communication efficient. For single-node Instinct MI300X/MI355X deployments with XGMI connectivity, tensor parallelism typically provides better performance due to optimized all-to-all collectives on XGMI. + +**When to use EP:** + +* Multi-node MoE deployments with fast inter-node networking +* Models with very large numbers of experts that benefit from expert distribution +* Workloads where EP's reduced data movement outweighs communication overhead + +**Single-node recommendation:** For Instinct MI300X/MI355X within a single node (≤8 GPUs), prefer tensor parallelism over expert parallelism for MoE models to leverage XGMI's high bandwidth and low latency. + +**Basic usage:** + +.. 
code-block:: bash
+
+    # Enable expert parallelism for MoE models (DeepSeek example with 8 GPUs)
+    vllm serve deepseek-ai/DeepSeek-R1 \
+        --tensor-parallel-size 8 \
+        --enable-expert-parallel
+
+**Combining with Tensor Parallelism:**
+
+When EP is enabled alongside tensor parallelism:
+
+* Fused MoE layers use expert parallelism
+* Non-fused MoE layers use tensor parallelism
+
+**Combining with Data Parallelism:**
+
+EP works seamlessly with Data Parallel Attention for optimal memory efficiency in MLA+MoE models (for example, DeepSeek V3):
+
+.. code-block:: bash
+
+    # DP attention + EP for DeepSeek-R1
+    VLLM_ALL2ALL_BACKEND="allgather_reducescatter" vllm serve deepseek-ai/DeepSeek-R1 \
+        --data-parallel-size 8 \
+        --enable-expert-parallel \
+        --disable-nccl-for-dp-synchronization
+
+Throughput benchmarking
+=======================
+
+This guide evaluates LLM inference by tokens per second (TPS). vLLM provides a
+built-in benchmark:
+
+.. code-block:: bash
+
+    # Synthetic or dataset-driven benchmark
+
+    vllm bench throughput --model /path/to/model [other args]
+
+* **Real-world dataset** (ShareGPT) example:
+
+  .. code-block:: bash
+
+      wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+      vllm bench throughput --model /path/to/model --dataset /path/to/ShareGPT_V3_unfiltered_cleaned_split.json
+
+* **Synthetic**: set fixed ``--input-len`` and ``--output-len`` for reproducible runs.
+
+.. tip::
+
+   **Profiling checklist (ROCm)**
+
+   1. Fix your prompt distribution (ISL/OSL) and **vary one knob at a time** (graph mode, MBT).
+   2. Measure **TTFT**, **ITL**, and **TPS** together; don't optimize one in isolation.
+   3. Compare graph modes: **PIECEWISE** (balanced) vs **FULL**/``FULL_DECODE_ONLY`` (max throughput).
+   4. Sweep ``--max-num-batched-tokens`` around **8k–64k** to find your latency/throughput balance.
+
+Maximizing instances per node
+=============================
+
+To maximize **per-node throughput**, run as many vLLM instances as model memory allows
+while leaving enough headroom for the KV-cache.
+
+* **HBM capacities**: MI300X = 192 GB HBM3; MI355X = 288 GB HBM3E.
+
+* Up to **eight** single-GPU vLLM instances can run in parallel on an 8×GPU node (one per GPU):
+
+  .. code-block:: bash
+
+      for i in $(seq 0 7); do
+        CUDA_VISIBLE_DEVICES="$i" vllm bench throughput \
+          -tp 1 --model /path/to/model \
+          --dataset /path/to/ShareGPT_V3_unfiltered_cleaned_split.json &
+      done
+
+Total throughput from **N** single-GPU instances usually exceeds one instance stretched across **N** GPUs (``-tp N``).
+
+**Model coverage**: Llama 2 (7B/13B/70B), Llama 3 (8B/70B), Qwen2 (7B/72B), Mixtral-8x7B/8x22B, and others.
+Llama2-70B and Llama3-70B fit on a single MI300X/MI355X GPU; Llama3.1-405B fits on a single 8×MI300X/MI355X node.
+
+Configure the gpu-memory-utilization parameter
+==================================================
+
+The ``--gpu-memory-utilization`` parameter controls the fraction of GPU memory reserved for the KV-cache. The default is **0.9** (90%).
+
+There are two strategies:
+
+1. **Increase** ``--gpu-memory-utilization`` to maximize throughput for a single instance (up to **0.95**).
+   Example:
+
+   .. code-block:: bash
+
+      vllm serve meta-llama/Llama-3.3-70B-Instruct \
+        --gpu-memory-utilization 0.95 \
+        --max-model-len 8192 \
+        --port 8000
+
+2. **Decrease** to pack **multiple** instances on the same GPU (for small models like 7B/8B), keeping KV-cache viable:
+
+..
code-block:: bash + + # Instance 1 on GPU 0 + CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Llama-3.1-8B-Instruct \ + --gpu-memory-utilization 0.45 \ + --max-model-len 4096 \ + --port 8000 + + # Instance 2 on GPU 0 + CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Llama-Guard-3-8B \ + --gpu-memory-utilization 0.45 \ + --max-model-len 4096 \ + --port 8001 + +vLLM engine arguments +===================== + +Selected arguments that often help on ROCm. See `engine args docs `_ for the full list. + +Configure --max-num-seqs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default value is **1024** in vLLM V1 (increased from **256** in V0). This flag controls the maximum number of sequences processed per batch, directly affecting concurrency and memory usage. + +* **To increase throughput**: Raise to **2048** or **4096** if memory allows, enabling more sequences per iteration. +* **To reduce memory usage**: Lower to **256** or **128** for large models or long-context generation. For example, set ``--max-num-seqs 128`` to reduce concurrency and lower memory requirements. + +In vLLM V1, KV-cache token requirements are computed as ``max-num-seqs * max-model-len``. + +Example usage: + +.. code-block:: bash + + vllm serve --max-num-seqs 128 --max-model-len 8192 + +Configure --max-num-batched-tokens +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Chunked prefill is enabled by default** in vLLM V1. + +* Lower values improve **ITL** (less prefill interrupting decode). +* Higher values improve **TTFT** (more prefill per batch). + +Defaults: **8192** for online serving, **16384** for offline. However, optimal values vary significantly by model size. Smaller models can efficiently handle larger batch sizes. Setting it near ``--max-model-len`` mimics V0 behavior and often maximizes throughput. + +**Guidance:** + +* **Interactive (low TTFT)**: keep MBT ≤ **8k–16k**. +* **Streaming (low ITL)**: MBT **16k–32k**. +* **Offline max throughput**: MBT **≥32k** (diminishing TPS returns beyond ~32k). + +**Pattern:** Smaller/more efficient models benefit from larger batch sizes. MoE models with expert parallelism can handle very large batches efficiently. + +**Rule of thumb** + +* Push MBT **up** to trade TTFT↑ for ITL↓ and slightly higher TPS. +* Pull MBT **down** to trade ITL↑ for TTFT↓ (interactive UX). + +Async scheduling +^^^^^^^^^^^^^^^^ + +``--async-scheduling`` (replaces deprecated ``num_scheduler_steps``) can improve throughput/ITL by trading off TTFT. +Prefer **off** for latency-sensitive serving; **on** for offline batch throughput. + +CUDA graphs configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +CUDA graphs reduce kernel launch overhead by capturing and replaying GPU operations, improving inference throughput. Configure using ``--compilation-config '{"cudagraph_mode": "MODE"}'``. + +**Available modes:** + +* ``NONE`` — CUDA graphs disabled (debugging) +* ``PIECEWISE`` — Attention stays eager, other ops use CUDA graphs (most compatible) +* ``FULL`` — Full CUDA graphs for all batches (best for small models/prompts) +* ``FULL_DECODE_ONLY`` — Full CUDA graphs only for decode (saves memory in prefill/decode split setups) +* ``FULL_AND_PIECEWISE`` — **(default)** Full graphs for decode + piecewise for prefill (best performance, highest memory) + +**Default behavior:** V1 defaults to ``FULL_AND_PIECEWISE`` with piecewise compilation enabled; otherwise ``NONE``. + +**Backend compatibility:** Not all attention backends support all CUDA graph modes. Choose a mode your backend supports: + +.. 
list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Attention backend + - CUDA graph support + * - vLLM/AITER Triton Unified Attention, vLLM Prefill-Decode Attention + - Full support (prefill + decode) + * - AITER MHA, AITER MLA + - Uniform batches only + * - vLLM Triton MLA + - Must exclude attention from graph — ``PIECEWISE`` required + +**Usage examples:** + +.. code-block:: bash + + # Default (best performance, highest memory) + vllm serve meta-llama/Llama-3.1-8B-Instruct + + # Decode-only graphs (lower memory, good for P/D split) + vllm serve meta-llama/Llama-3.1-8B-Instruct \ + --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' + + # Full graphs for offline throughput (small models) + vllm serve meta-llama/Llama-3.1-8B-Instruct \ + --compilation-config '{"cudagraph_mode": "FULL"}' + +**Migration from legacy flags:** + +* ``use_cudagraph=False`` → ``NONE`` +* ``use_cudagraph=True, full_cuda_graph=False`` → ``PIECEWISE`` +* ``full_cuda_graph=True`` → ``FULL`` (with automatic fallback) + +Quantization support +==================== + +vLLM supports FP4/FP8 (4-bit/8-bit floating point) weight and activation quantization using hardware acceleration on the Instinct MI300X and MI355X. +Quantization of models with FP4/FP8 allows for a **2x-4x** reduction in model memory requirements and up to a **1.6x** +improvement in throughput with minimal impact on accuracy. + +vLLM ROCm supports a variety of quantization demands: + +* On-the-fly quantization + +* Pre-quantized model through Quark and llm-compressor + +Supported quantization methods +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +vLLM on ROCm supports the following quantization methods for the AMD Instinct MI300 series and Instinct MI355X GPUs: + +.. list-table:: + :header-rows: 1 + :widths: 20 15 15 20 30 + + * - Method + - Precision + - ROCm support + - Memory reduction + - Best use case + * - **FP8** (W8A8) + - 8-bit float + - Excellent + - 2× (50%) + - Production, balanced speed/accuracy + * - **PTPC-FP8** + - 8-bit float + - Excellent + - 2× (50%) + - High throughput, better than ``FP8`` + * - **AWQ** + - 4-bit int (W4A16) + - Good + - 4× (75%) + - Large models, memory-constrained + * - **GPTQ** + - 4-bit/8-bit int + - Good + - 2-4× (50-75%) + - Pre-quantized models available + * - **FP8 KV-cache** + - 8-bit float + - Excellent + - KV cache: 50% + - All inference workloads + * - **Quark (AMD)** + - ``FP8``/``MXFP4`` + - Optimized + - 2-4× (50-75%) + - AMD pre-quantized models + * - **compressed-tensors** + - W8A8 ``INT8``/``FP8`` + - Good + - 2× (50%) + - LLM Compressor models + +**ROCm support key:** + +- Excellent: Fully supported with optimized kernels +- Good: Supported, might not have AMD-optimized kernels +- Optimized: AMD-specific optimizations available + +Using Pre-quantized Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +AMD provides pre-quantized models optimized for ROCm. These models are ready to use with vLLM. + +**AMD Quark-quantized models**: + +Available on `Hugging Face `_: + +* `Llama‑3.1‑8B‑Instruct‑FP8‑KV `__ (FP8 W8A8) +* `Llama‑3.1‑70B‑Instruct‑FP8‑KV `__ (FP8 W8A8) +* `Llama‑3.1‑405B‑Instruct‑FP8‑KV `__ (FP8 W8A8) +* `Mixtral‑8x7B‑Instruct‑v0.1‑FP8‑KV `__ (FP8 W8A8) +* `Mixtral‑8x22B‑Instruct‑v0.1‑FP8‑KV `__ (FP8 W8A8) +* `Llama-3.3-70B-Instruct-MXFP4-Preview `__ (MXFP4 for MI350/MI355) +* `Llama-3.1-405B-Instruct-MXFP4-Preview `__ (MXFP4 for MI350/MI355) +* `DeepSeek-R1-0528-MXFP4-Preview `__ (MXFP4 for MI350/MI355) + +**Quick start**: + +.. 
code-block:: bash + + # FP8 W8A8 Quark model + vllm serve amd/Llama-3.1-8B-Instruct-FP8-KV \ + --dtype auto + + # MXFP4 Quark model for MI350/MI355 + vllm serve amd/Llama-3.3-70B-Instruct-MXFP4-Preview \ + --dtype auto \ + --tensor-parallel-size 1 + +**Other pre-quantized models**: + +- AWQ models: `Hugging Face awq flag `_ +- GPTQ models: `Hugging Face gptq flag `_ +- LLM Compressor models: `Hugging Face compressed-tensors flag `_ + +On-the-fly quantization +^^^^^^^^^^^^^^^^^^^^^^^^ + +For models without pre-quantization, vLLM can quantize ``FP16``/``BF16`` models at server startup. + +**Supported methods**: + +- ``fp8``: Per-tensor ``FP8`` weight and activation quantization +- ``ptpc_fp8``: Per-token-activation per-channel-weight ``FP8`` (better accuracy same ``FP8`` speed). See `PTPC-FP8 on ROCm blog post `_ for details + +**Usage:** + +.. code-block:: bash + + # On-the-fly FP8 quantization + vllm serve meta-llama/Llama-3.1-8B-Instruct \ + --quantization fp8 \ + --dtype auto + + # On-the-fly PTPC-FP8 (recommended as default) + vllm serve meta-llama/Llama-3.1-70B-Instruct \ + --quantization ptpc_fp8 \ + --dtype auto \ + --tensor-parallel-size 4 + +.. note:: + + On-the-fly quantization adds two to five minutes of startup time but eliminates pre-quantization. For production with frequent restarts, use pre-quantized models. + +GPTQ +^^^^ + +GPTQ is a 4-bit/8-bit weight quantization method that compresses models with minimal accuracy loss. GPTQ +is fully supported on ROCm via HIP-compiled kernels in vLLM. + +**ROCm support status**: + +- **Fully supported** - GPTQ kernels compile and run on ROCm via HIP +- **Pre-quantized models work** with standard GPTQ kernels + +**Recommendation**: For the AMD Instinct MI300X, **AWQ with Triton kernels** or **FP8 quantization** might provide better +performance due to ROCm-specific optimizations, but GPTQ is a viable alternative. + +**Using pre-quantized GPTQ models**: + +.. code-block:: bash + + # Using pre-quantized GPTQ model on ROCm + vllm serve RedHatAI/Meta-Llama-3.1-70B-Instruct-quantized.w4a16 \ + --quantization gptq \ + --dtype auto \ + --tensor-parallel-size 1 + +**Important notes**: + +- **Kernel support:** GPTQ uses standard HIP-compiled kernels on ROCm +- **Performance:** AWQ with Triton kernels might offer better throughput on AMD GPUs due to ROCm optimizations +- **Compatibility:** GPTQ models from Hugging Face work on ROCm with standard performance +- **Use case:** GPTQ is suitable when pre-quantized GPTQ models are readily available + +AWQ (Activation-aware Weight Quantization) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +AWQ (Activation-aware Weight Quantization) is a 4-bit weight quantization technique that provides excellent +model compression with minimal accuracy loss (<1%). ROCm supports AWQ quantization on the AMD Instinct MI300 series and +MI355X GPUs with vLLM. + +**Using pre-quantized AWQ models:** + +Many AWQ-quantized models are available on Hugging Face. Use them directly with vLLM: + +.. code-block:: bash + + # vLLM serve with AWQ model + VLLM_USE_TRITON_AWQ=1 \ + vllm serve hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 \ + --quantization awq \ + --tensor-parallel-size 1 \ + --dtype auto + +**Important Notes:** + +* **ROCm requirement:** Set ``VLLM_USE_TRITON_AWQ=1`` to enable Triton-based AWQ kernels on ROCm +* **dtype parameter:** AWQ requires ``--dtype auto`` or ``--dtype float16``. The ``--dtype`` flag controls + the **activation dtype** (``FP16``/``BF16`` for computations), not the weight dtype. 
AWQ weights remain as INT4 + (4-bit integers) as specified in the model's quantization config, but are dequantized to ``FP16``/``BF16`` during + matrix multiplication operations. +* **Group size:** 128 is recommended for optimal performance/accuracy balance +* **Model compatibility:** AWQ is primarily tested on Llama, Mistral, and Qwen model families + +Quark (AMD quantization toolkit) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +AMD Quark is the AMD quantization toolkit optimized for ROCm. It supports ``FP8 W8A8``, ``MXFP4``, ``W8A8 INT8``, and +other quantization formats with native vLLM integration. The quantization format will automatically be inferred +from the model config file, so you can omit ``--quantization quark``. + +**Running Quark Models:** + +.. code-block:: bash + + # FP8 W8A8: Single GPU + vllm serve amd/Llama-3.1-8B-Instruct-FP8-KV \ + --dtype auto \ + --max-model-len 8192 \ + --gpu-memory-utilization 0.90 + + # MXFP4: Extreme memory efficiency + vllm serve amd/Llama-3.3-70B-Instruct-MXFP4-Preview \ + --dtype auto \ + --tensor-parallel-size 1 \ + --max-model-len 8192 + +**Key features:** + +- **FP8 models**: ~50% memory reduction, 2× compression +- **MXFP4 models**: ~75% memory reduction, 4× compression +- **Embedded scales**: Quark FP8-KV models include pre-calibrated KV-cache scales +- **Hardware optimized**: Leverages the AMD Instinct MI300 series ``FP8`` acceleration + +For creating your own Quark-quantized models, see `Quark Documentation `_. + +FP8 kv-cache dtype +^^^^^^^^^^^^^^^^^^^^ + +FP8 KV-cache quantization reduces memory footprint by approximately 50%, enabling longer context lengths +or higher concurrency. ROCm supports FP8 KV-cache with both ``fp8_e4m3`` and ``fp8_e5m2`` formats on +AMD Instinct MI300 series and other CDNA™ GPUs. + +Use ``--kv-cache-dtype fp8`` to enable ``FP8`` KV-cache quantization. For best accuracy, use calibrated +scaling factors generated via `LLM Compressor `_. +Without calibration, scales are calculated dynamically (``--calculate-kv-scales``) with minimal +accuracy impact. + + +**Quick start (dynamic scaling)**: + +.. code-block:: bash + + # vLLM serve with dynamic FP8 KV-cache + vllm serve meta-llama/Llama-3.1-8B-Instruct \ + --kv-cache-dtype fp8 \ + --calculate-kv-scales \ + --gpu-memory-utilization 0.90 + +**Calibrated scaling (advanced)**: + +For optimal accuracy, pre-calibrate KV-cache scales using representative data. The calibration process: + +#. Runs the model on calibration data (512+ samples recommended) +#. Computes optimal ``FP8`` quantization scales for key/value cache tensors +#. Embeds these scales into the saved model as additional parameters +#. vLLM loads the model and uses the embedded scales automatically when ``--kv-cache-dtype fp8`` is specified + +The quantized model can be used like any other model. The embedded scales are stored as part of the model weights. + +**Using pre-calibrated models:** + +AMD provides ready-to-use models with pre-calibrated ``FP8`` KV cache scales: + +* `amd/Llama-3.1-8B-Instruct-FP8-KV `_ +* `amd/Llama-3.3-70B-Instruct-FP8-KV `_ + +To verify a model has pre-calibrated KV cache scales, check ``config.json`` for: + +.. code-block:: json + + "quantization_config": { + "kv_cache_scheme": "static" // Indicates pre-calibrated scales are embedded + } + +**Creating your own calibrated model:** + +.. code-block:: bash + + # 1. Install LLM Compressor + pip install llmcompressor + + # 2. Run calibration script (see llm-compressor repo for full example) + python llama3_fp8_kv_example.py + + # 3. 
Use calibrated model in vLLM + vllm serve ./Meta-Llama-3-8B-Instruct-FP8-KV \ + --kv-cache-dtype fp8 + +For detailed instructions and the complete calibration script, see the `FP8 KV Cache Quantization Guide `_. + +**Format options**: + +- ``fp8`` or ``fp8_e4m3``: Higher precision (default, recommended) +- ``fp8_e5m2``: Larger dynamic range, slightly lower precision + +Speculative decoding (experimental) +=================================== + +Recent vLLM versions add support for speculative decoding backends (for example, Eagle‑v3). Evaluate for your model and latency/throughput goals. +Speculative decoding is a technique to reduce latency when max number of concurrency is low. +Depending on the methods, the effective concurrency varies, for example, from 16 to 64. + +Example command: + +.. code-block:: bash + + vllm serve meta-llama/Llama-3.1-8B-Instruct \ + --trust-remote-code \ + --swap-space 16 \ + --disable-log-requests \ + --tensor-parallel-size 1 \ + --distributed-executor-backend mp \ + --dtype float16 \ + --quantization fp8 \ + --kv-cache-dtype fp8 \ + --no-enable-chunked-prefill \ + --max-num-seqs 300 \ + --max-num-batched-tokens 131072 \ + --gpu-memory-utilization 0.8 \ + --speculative_config '{"method": "eagle3", "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", "num_speculative_tokens": 2, "draft_tensor_parallel_size": 1, "dtype": "float16"}' \ + --port 8001 + + +.. important:: + + It has been observed that more ``num_speculative_tokens`` causes less + acceptance rate of draft model tokens and a decline in throughput. As a + workaround, set ``num_speculative_tokens`` to <= 2. + + +Multi-node checklist and troubleshooting +======================================== + +1. Use ``--distributed-executor-backend ray`` across nodes to manage HIP-visible ranks and RCCL communicators. (``ray`` is the default for multi-node. Explicitly setting this flag is optional.) +2. Ensure ``/dev/shm`` is shared across ranks (Docker ``--shm-size``, Kubernetes ``emptyDir``), as RCCL uses shared memory for rendezvous. +3. For GPUDirect RDMA, set ``RCCL_NET_GDR_LEVEL=2`` and verify links (``ibstat``). Requires supported NICs (for example, ConnectX‑6+). +4. Collect RCCL logs: ``RCCL_DEBUG=INFO`` and optionally ``RCCL_DEBUG_SUBSYS=INIT,GRAPH`` for init/graph stalls. + +Further reading +=============== + +* :doc:`workload` +* :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` diff --git a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst index 30e86e277..7cd2c7fc6 100644 --- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst +++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst @@ -15,10 +15,9 @@ using PyTorch. It delves into specific workloads such as :ref:`model inference `, offering strategies to enhance efficiency. -The following topics highlight :ref:`auto-tunable configurations ` -that streamline optimization as well as advanced techniques like -:ref:`Triton kernel optimization ` for -meticulous tuning. +The following topics highlight :ref:`auto-tunable configurations ` as +well as :ref:`Triton kernel optimization ` +for meticulous tuning. Workload tuning strategy ======================== @@ -86,23 +85,22 @@ Optimize model inference with vLLM ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ vLLM provides tools and techniques specifically designed for efficient model -inference on AMD Instinct MI300X GPUs. See :ref:`fine-tuning-llms-vllm` -for installation guidance. 
Optimizing performance with vLLM -involves configuring tensor parallelism, leveraging advanced features, and -ensuring efficient execution. Here’s how to optimize vLLM performance: +inference on AMD Instinct GPUs. See the official `vLLM installation docs +`__ for +installation guidance. Optimizing performance with vLLM involves configuring +tensor parallelism, leveraging advanced features, and ensuring efficient +execution. -* Tensor parallelism: Configure the - :ref:`tensor-parallel-size parameter ` to distribute - tensor computations across multiple GPUs. Adjust parameters such as - ``batch-size``, ``input-len``, and ``output-len`` based on your workload. - -* Configuration for vLLM: Set :ref:`parameters ` - according to workload requirements. Benchmark performance to understand - characteristics and identify bottlenecks. +* Configuration for vLLM: Set engine arguments according to workload + requirements. * Benchmarking and performance metrics: Measure latency and throughput to evaluate performance. +.. seealso:: + + See :doc:`vllm-optimization`. + .. _mi300x-auto-tune: Auto-tunable configurations @@ -120,8 +118,7 @@ characteristics. For example: your specific hardware. * Triton: Use :ref:`Triton’s auto-tuning features ` - to explore various kernel configurations and automatically select the - best-performing ones. + to explore various kernel configurations and select the best-performing ones. Manual tuning ^^^^^^^^^^^^^ @@ -328,381 +325,6 @@ hardware counters are also included. ROCm Systems Profiler timeline trace example. -.. _mi300x-vllm-optimization: - -vLLM performance optimization -============================= - -vLLM is a high-throughput and memory efficient inference and serving engine for large language models that has gained traction in the AI community for -its performance and ease of use. See :ref:`fine-tuning-llms-vllm` for a primer on vLLM with ROCm. - -Performance environment variables ---------------------------------- - -The following performance tips are not *specific* to vLLM -- they are general -but relevant in this context. You can tune the following vLLM parameters to -achieve optimal request latency and throughput performance. - -* As described in `Environment variables (MI300X) - `_, - the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM - performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``. - -* Set the :ref:`RCCL environment variable ` ``NCCL_MIN_NCHANNELS`` - to ``112`` to increase the number of channels on MI300X to potentially improve - performance. - -* Set the environment variable ``TORCH_BLAS_PREFER_HIPBLASLT=1`` to use hipBLASLt to improve performance. - -Auto-tuning using PyTorch TunableOp ------------------------------------- - -Since vLLM is based on the PyTorch framework, PyTorch TunableOp can be used for auto-tuning. -You can run auto-tuning with TunableOp in two simple steps without modifying your code: - -* Enable TunableOp and tuning. Optionally, enable verbose mode: - - .. code-block:: shell - - PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_VERBOSE=1 your_vllm_script.sh - -* Enable TunableOp and disable tuning and measure. - - .. code-block:: shell - - PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_vllm_script.sh - -Learn more about TunableOp in the :ref:`PyTorch TunableOp ` section. - -Performance tuning based on vLLM engine configurations -------------------------------------------------------- - -The following subsections describe vLLM-specific configurations for performance tuning. 
-You can tune the following vLLM parameters to achieve optimal performance. - -* ``tensor_parallel_size`` - -* ``gpu_memory_utilization`` - -* ``dtype`` - -* ``enforce_eager`` - -* ``kv_cache_dtype`` - -* ``input_len`` - -* ``output_len`` - -* ``max_num_seqs`` - -* ``num_scheduler_steps`` - -* ``max_model_len`` - -* ``enable_chunked_prefill`` - -* ``distributed_executor_backend`` - -* ``max_seq_len_to_capture`` - -Refer to `vLLM documentation `_ -for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM -usage with ROCm. - -ROCm provides a prebuilt optimized Docker image for validating the performance -of LLM inference with vLLM on MI300X Series GPUs. The Docker image includes -ROCm, vLLM, and PyTorch. For more information, see -:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`. - -.. _mi300x-vllm-throughput-measurement: - -Evaluating performance by throughput measurement -------------------------------------------------- - -This tuning guide evaluates the performance of LLM inference workloads by measuring throughput in tokens per second (TPS). Throughput can be assessed using both real-world and synthetic data, depending on your evaluation goals. - -Refer to the benchmarking script located at ``benchmarks/benchmark_throughput.py`` in the `vLLM repository `_. -Use this script to measure throughput effectively. You can assess throughput using real-world and synthetic data, depending on your evaluation goals. - -* For realistic performance evaluation, you can use datasets like Hugging Face's - ``ShareGPT_V3_unfiltered_cleaned_split.json``. This dataset includes real-world conversational - data, making it a good representation of typical use cases for language models. Download it using - the following command: - - .. code-block:: shell - - wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -* For standardized benchmarking, you can set fixed input and output token - lengths. Synthetic prompts provide consistent benchmarking runs, making it - easier to compare performance across different models or configurations. - Additionally, a controlled environment simplifies analysis. - -By balancing real-world data and synthetic data approaches, you can get a well-rounded understanding of model performance in varied scenarios. - -.. _mi300x-vllm-single-node: - -Maximizing vLLM instances on a single node ------------------------------------------- - -The general guideline is to maximize per-node throughput by running as many vLLM instances as possible. -However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance. - -The Instinct MI300X GPU is equipped with 192 GB of HBM3 memory capacity and bandwidth. -For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances -simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment -variable ``CUDA_VISIBLE_DEVICES``. - -For example, this script runs eight instances of vLLM for throughput benchmarking at the same time -with a model that can fit in one GPU: - -.. 
code-block:: shell - - for i in $(seq 0 7); - do - CUDA_VISIBLE_DEVICES="$i" python3 /app/vllm/benchmarks/benchmark_throughput.py -tp 1 --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model & - done - -The total throughput achieved by running ``N`` instances of vLLM is generally much higher than running a -single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or -using the ``-tp`` N option, where ``1 < N ≤ 8``). - -vLLM on MI300X GPUs can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on. -Notable configurations include Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model can fit on one single node with 8 MI300X GPUs. - -.. _mi300x-vllm-gpu-memory-utilization: - -Configure the gpu_memory_utilization parameter ----------------------------------------------- - -There are two ways to increase throughput by configuring ``gpu-memory-utilization`` parameter. - -1. Increase ``gpu-memory-utilization`` to improve the throughput for a single instance as long as - it does not incur HIP or CUDA Out Of Memory. The default ``gpu-memory-utilization`` is 0.9. - You can set it to ``>0.9`` and ``<1``. - - For example, below benchmarking command set the ``gpu-memory-utilization`` as 0.98, or 98%. - - .. code-block:: shell - - /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.98 --input-len 1024 --output-len 128 --model /path/to/model - -2. Decrease ``gpu-memory-utilization`` to maximize the number of vLLM instances on the same GPU. - - Specify GPU memory utilization to run as many instances of vLLM as possible on a single - GPU. However, too many instances can result in no memory for KV-cache. For small models, run - multiple instances of vLLM on the same GPU by specifying a smaller ``gpu-memory-utilization`` -- as - long as it would not cause HIP Out Of Memory. - - For example, run two instances of the Llama3-8b model at the same time on a single GPU by specifying - ``--gpu-memory-utilization`` to 0.4 (40%) as follows (on GPU ``0``): - - .. code-block:: shell - - CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4 - --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model & - - CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4 - --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model & - -See :ref:`vllm-engine-args` for other performance suggestions. - -.. _mi300x-vllm-multiple-gpus: - -Run vLLM on multiple GPUs -------------------------- - -The two main reasons to use multiple GPUs are: - -* The model size is too big to run vLLM using one GPU as it results HIP Out of Memory. - -* To achieve better latency when using a single GPU is not desirable. - -To run one vLLM instance on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` option to -specify multiple GPUs. Optionally, use the ``CUDA_VISIBLE_DEVICES`` environment variable to specify -the GPUs. - -For example, you can use two GPUs to start an API server on port 8000: - -.. 
-.. code-block:: shell
-
-   python -m vllm.entrypoints.api_server --model /path/to/model --dtype float16 -tp 2 --port 8000 &
-
-To achieve both latency and throughput performance for serving, you can run multiple API servers on
-different GPUs by specifying a different port for each server and using ``CUDA_VISIBLE_DEVICES`` to
-specify the GPUs for each server. For example:
-
-.. code-block:: shell
-
-   CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model /path/to/model --dtype float16 -tp 2 --port 8000 &
-
-   CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model /path/to/model --dtype float16 -tp 2 --port 8001 &
-
-Choose an attention backend
----------------------------
-
-vLLM on ROCm supports two attention backends, each suitable for different use cases and performance
-requirements:
-
-- **Triton Flash Attention** - This is the default backend. For benchmarking, run vLLM scripts at
-  least once as a warm-up step so Triton can perform auto-tuning before you collect benchmarking
-  numbers.
-
-- **Composable Kernel (CK) Flash Attention** - To use CK Flash Attention, set the environment
-  variable with ``export VLLM_USE_TRITON_FLASH_ATTN=0``.
-
-Refer to :ref:`Model acceleration libraries `
-to learn more about Flash Attention with Triton or CK backends.
-
-.. _vllm-engine-args:
-
-vLLM engine arguments
----------------------
-
-The following are configuration suggestions that can potentially improve performance with vLLM. See
-`vLLM's engine arguments documentation `_
-for a full list of configurable engine arguments.
-
-Configure the max-num-seqs parameter
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Increase the ``max-num-seqs`` parameter from the default ``256`` to ``512`` (``--max-num-seqs 512``).
-This increases the maximum number of sequences per iteration and can improve throughput.
-
-Use the float16 dtype
-^^^^^^^^^^^^^^^^^^^^^
-
-The default data type (``dtype``) is specified in the model’s configuration file. For instance, some
-models use ``torch.bfloat16`` as their default ``dtype``. Use float16 (``--dtype float16``) for
-better performance.
-
-Multi-step scheduling
-^^^^^^^^^^^^^^^^^^^^^
-
-Setting ``num-scheduler-steps`` for multi-step scheduling can increase performance. Set it between
-10 and 15 (``--num-scheduler-steps 10``).
-
-Distributed executor backend
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-vLLM supports two distributed executor backends: ``ray`` and ``mp``. When using the ROCm fork of
-vLLM, the ``mp`` backend (``--distributed_executor_backend mp``) is recommended.
-
-Graph mode max-seq-len-to-capture
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The ``max-seq-len-to-capture`` parameter sets the maximum sequence length covered by CUDA graphs.
-In the default mode (where ``enforce_eager`` is ``False``), when a sequence has a context length
-larger than this value, the vLLM engine falls back to eager mode. The default is 8192.
-
-When working with models that support long context lengths, set the parameter
-``--max-seq-len-to-capture`` to 16384. See this `vLLM blog `__ for details.
-Qwen2-7b is an example of a model that supports long context lengths.
-
-Whether to enable chunked prefill
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Another vLLM performance tip is to enable chunked prefill to improve throughput. Chunked prefill
-splits large prefills into smaller chunks and batches them together with decode requests.
-
-You can enable the feature by specifying ``--enable-chunked-prefill`` on the command line or by
-setting ``enable_chunked_prefill=True`` in the LLM constructor, as shown in the sketch below.
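-
-The following is a minimal offline-inference sketch that combines several of the engine arguments
-discussed above. The model path and the tuned values are illustrative placeholders, and some of
-these arguments might not be available in every vLLM version, so treat this as a starting point
-rather than a reference configuration.
-
-.. code-block:: python
-
-   # Minimal sketch combining the engine arguments discussed in this section.
-   # The model path and the tuned values are placeholders -- adjust them for
-   # your model, GPU count, and vLLM version.
-   from vllm import LLM, SamplingParams
-
-   llm = LLM(
-       model="/path/to/model",
-       dtype="float16",                    # prefer float16 over the model default
-       tensor_parallel_size=1,             # increase for models that do not fit on one GPU
-       gpu_memory_utilization=0.9,         # raise toward 0.98 if no out-of-memory errors occur
-       max_num_seqs=512,                   # more sequences per iteration
-       num_scheduler_steps=10,             # multi-step scheduling
-       max_seq_len_to_capture=16384,       # for long-context models
-       enable_chunked_prefill=True,        # batch prefill chunks with decode requests
-       distributed_executor_backend="mp",  # recommended with the ROCm fork
-   )
-
-   outputs = llm.generate(
-       ["Write a short note about GPU inference."],
-       SamplingParams(temperature=0.8, max_tokens=128),
-   )
-   print(outputs[0].outputs[0].text)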
-
-As stated in `vLLM's documentation `__, you can tune the performance by changing
-``max_num_batched_tokens``. By default, it is set to 512 and optimized for ITL (inter-token
-latency). A smaller ``max_num_batched_tokens`` achieves better ITL because fewer prefills
-interrupt decodes. A higher ``max_num_batched_tokens`` achieves better TTFT (time to first token)
-because more prefill tokens can be put into the batch.
-
-You might experience noticeable throughput improvements when benchmarking on a single GPU or
-eight GPUs using the vLLM throughput benchmarking script along with the ShareGPT dataset as input.
-
-In the case of fixed ``input-len``/``output-len``, enabling chunked prefill increases the
-throughput for some configurations. For other configurations, the throughput might be worse,
-which calls for tuning the ``max_num_batched_tokens`` parameter (for example, increasing its
-value to 4096 or larger).
-
-.. note::
-
-   Chunked prefill is no longer recommended. See the vLLM blog: `Serving LLMs on AMD MI300X: Best Practices `_ (October 2024).
-
-Quantization support
---------------------
-
-Quantization reduces the precision of the model’s weights and activations, which significantly
-decreases the memory footprint. ``fp8(w8a8)`` and ``AWQ`` quantization are supported on ROCm.
-
-FP8 quantization
-^^^^^^^^^^^^^^^^
-
-vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware
-acceleration on the Instinct MI300X. Quantization of models with FP8 allows for a 2x reduction
-in model memory requirements and up to a 1.6x improvement in throughput with minimal impact
-on accuracy.
-
-AMD publishes Quark Quantized OCP FP8 models on Hugging Face. For example:
-
-* `Llama-3.1-8B-Instruct-FP8-KV `__
-* `Llama-3.1-70B-Instruct-FP8-KV `__
-* `Llama-3.1-405B-Instruct-FP8-KV `__
-* `Mixtral-8x7B-Instruct-v0.1-FP8-KV `__
-* `Mixtral-8x22B-Instruct-v0.1-FP8-KV `__
-
-To enable vLLM benchmarking to run on FP8-quantized models, use the ``--quantization`` parameter
-with the value ``fp8`` (``--quantization fp8``).
-
-AWQ quantization
-^^^^^^^^^^^^^^^^
-
-You can quantize your own models by installing AutoAWQ or picking one of the 400+ models on
-Hugging Face. Be aware that AWQ support in vLLM is currently underoptimized.
-
-To enable vLLM to run on AWQ-quantized models, use the ``--quantization`` parameter with the
-value ``awq`` (``--quantization awq``).
-
-You can find more specifics in the `vLLM AutoAWQ documentation `_.
-
-FP8 kv-cache-dtype
-^^^^^^^^^^^^^^^^^^
-
-Using the FP8 ``kv-cache-dtype`` can improve performance because it reduces the size of the
-KV cache, which lowers the cost of reading and writing it.
-
-To use this feature, specify ``--kv-cache-dtype`` as ``fp8``.
-
-To specify the quantization scaling config, use the ``--quantization-param-path`` parameter. If
-the parameter is not specified, the default scaling factor of ``1`` is used, which can lead to
-less accurate results. To generate the ``kv-cache`` scaling JSON file, see `FP8 KV Cache `__
-in the vLLM GitHub repository.
-
-vLLM includes two sample Llama scaling configuration files, for ``llama2-70b`` and ``llama2-7b``.
-
-If you build vLLM using `Dockerfile.rocm `_, the ``llama2-70b`` scaling configuration file is
-available at ``/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json`` at runtime.
-
-Below is a sample command to run benchmarking with this feature enabled for the ``llama2-70b``
-model:
-
-..
code-block:: shell - - python3 /vllm-workspace/benchmarks/benchmark_throughput.py --model \ - /path/to/llama2-70b-model --kv-cache-dtype "fp8" \ - --quantization-param-path \ - "/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json" \ - --input-len 512 --output-len 256 --num-prompts 500 - - .. _mi300x-tunableop: PyTorch TunableOp @@ -946,33 +568,33 @@ for details. .. code-block:: shell - HIP_FORCE_DEV_KERNARG=1  hipblaslt-bench --alpha 1 --beta 0 -r f16_r \ + HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \ --a_type f16_r --b_type f8_r --compute_type f32_f16_r \ - --initialization trig_float  --cold_iters 100 --iters 1000 --rotating 256 + --initialization trig_float --cold_iters 100 --iters 1000 --rotating 256 * Example 2: Benchmark forward epilogues and backward epilogues - * ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";`` + * ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";`` - * ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";`` + * ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";`` - * ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";`` + * ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";`` - * ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";`` + * ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";`` * ``HIPBLASLT_EPILOGUE_DGELU": --activation_type gelu --gradient";`` - * ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";`` + * ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";`` - * ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";`` + * ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";`` - * ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";`` + * ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";`` - * ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";`` + * ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";`` - * ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";`` + * ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";`` - * ``HIPBLASLT_EPILOGUE_BGRADB:  "--bias_vector --gradient --bias_source b";`` + * ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";`` hipBLASLt auto-tuning using hipblaslt-bench @@ -1031,26 +653,26 @@ The tuning tool is a two-step tool. It first runs the benchmark, then it creates .. 
code-block:: python - defaultBenchOptions = {"ProblemType": { -     "TransposeA": 0, -     "TransposeB": 0, -     "ComputeInputDataType": "s", -     "ComputeDataType": "s", -     "DataTypeC": "s", -     "DataTypeD": "s", -     "UseBias": False - }, "TestConfig": { -     "ColdIter": 20, -     "Iter": 100, -     "AlgoMethod": "all", -     "RequestedSolutions": 2, # Only works in AlgoMethod heuristic -     "SolutionIndex": None, # Only works in AlgoMethod index -     "ApiMethod": "cpp", -     "RotatingBuffer": 0, - }, "TuningParameters": { -     "SplitK": [0] - }, "ProblemSizes": []} - defaultCreateLogicOptions = {}  # Currently unused + defaultBenchOptions = {"ProblemType": { + "TransposeA": 0, + "TransposeB": 0, + "ComputeInputDataType": "s", + "ComputeDataType": "s", + "DataTypeC": "s", + "DataTypeD": "s", + "UseBias": False + }, "TestConfig": { + "ColdIter": 20, + "Iter": 100, + "AlgoMethod": "all", + "RequestedSolutions": 2, # Only works in AlgoMethod heuristic + "SolutionIndex": None, # Only works in AlgoMethod index + "ApiMethod": "cpp", + "RotatingBuffer": 0, + }, "TuningParameters": { + "SplitK": [0] + }, "ProblemSizes": []} + defaultCreateLogicOptions = {} # Currently unused * ``TestConfig`` 1. ``ColdIter``: This is number the warm-up iterations before starting the kernel benchmark. @@ -1230,7 +852,7 @@ command: .. code-block:: shell - merge.py original_dir new_tuned_yaml_dir output_dir  + merge.py original_dir new_tuned_yaml_dir output_dir The following table describes the logic YAML files. @@ -1833,7 +1455,7 @@ de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``. From the IR snippet, you can see ``i32`` data is loaded from global memory to registers (``%190``). With a few element-wise operations in registers, it is -stored in shared memory (``%269``) for the transpose operation (``%270``), which +stored in shared memory (``%269``) for the transpose operation (``%270``), which needs data movement across different threads. With the transpose done, it is loaded from LDS to register again (``%276``), and with a few more element-wise operations, it is stored to LDS again (``%298``). The last step @@ -1967,7 +1589,7 @@ something similar to the following: loaded at: [0x7fd4f100c000-0x7fd4f100e070] The kernel name and the code object file should be listed. In the -example above, the kernel name is vector_add_assert_trap, but this might +example above, the kernel name is vector_add_assert_trap, but this might also look like: .. code-block:: text @@ -2081,3 +1703,8 @@ Hardware efficiency is maximized with 4 or fewer HIP streams. These environment configuration to two compute streams and two RCCL streams, aligning with this best practice. Additionally, RCCL is often pre-optimized for MI300 systems in production by querying the node topology during startup, reducing the need for extensive manual tuning. 
+ +Further reading +=============== + +* :doc:`vllm-optimization` diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index cbc2e7bce..a0a5084ff 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -134,6 +134,8 @@ subtrees: title: Profile and debug - file: how-to/rocm-for-ai/inference-optimization/workload.rst title: Workload optimization + - file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst + title: vLLM V1 performance optimization - url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/ title: AI tutorials From 90c1d9068f0f81ea2d8e71142db814d9a05b6781 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Wed, 22 Oct 2025 13:47:46 -0400 Subject: [PATCH 07/15] add xref to vllm v1 optimization guide in workload.rst (#5560) --- .../inference-optimization/workload.rst | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst index 7cd2c7fc6..9e5fe4697 100644 --- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst +++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst @@ -99,12 +99,14 @@ execution. .. seealso:: - See :doc:`vllm-optimization`. + See :doc:`vllm-optimization` to learn more about vLLM performance + optimization techniques. .. _mi300x-auto-tune: Auto-tunable configurations ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + Auto-tunable configurations can significantly streamline performance optimization by automatically adjusting parameters based on workload characteristics. For example: @@ -325,6 +327,22 @@ hardware counters are also included. ROCm Systems Profiler timeline trace example. +vLLM performance optimization +============================= + +vLLM is a high-throughput and memory efficient inference and serving engine for +large language models that has gained traction in the AI community for its +performance and ease of use. See :doc:`vllm-optimization`, where you'll learn +how to: + +* Enable AITER (AI Tensor Engine for ROCm) to speed up on LLM models. +* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance. +* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton). +* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments. +* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss. +* Tune engine arguments (batch size, memory utilization, graph modes) for your use case. +* Benchmark and scale across single-node and multi-node configurations. + .. 
_mi300x-tunableop: PyTorch TunableOp From 35ca027aa4cc13cfde6a280cf1e761c9a14d7d7c Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Thu, 23 Oct 2025 14:39:58 -0400 Subject: [PATCH 08/15] Fix broken links under rocm-for-ai/ (#5564) --- docs/how-to/rocm-for-ai/system-setup/system-health-check.rst | 4 ++-- .../rocm-for-ai/training/benchmark-docker/megatron-lm.rst | 2 +- .../rocm-for-ai/training/benchmark-docker/primus-megatron.rst | 4 ++-- .../rocm-for-ai/training/benchmark-docker/primus-pytorch.rst | 2 +- .../training/benchmark-docker/pytorch-training.rst | 2 +- docs/how-to/rocm-for-ai/training/scale-model-training.rst | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst b/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst index 79563b61f..ac0b4ebd2 100644 --- a/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst +++ b/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst @@ -92,7 +92,7 @@ GPUs, which can impact end-to-end latency. .. _healthcheck-install-transferbench: 1. To get started, use the instructions in the `TransferBench documentation - `_ + `__ or use the following commands: .. code:: shell @@ -102,5 +102,5 @@ GPUs, which can impact end-to-end latency. CC=hipcc make 2. Run the suggested TransferBench tests -- see `TransferBench benchmarking - `_ + `__ in the Instinct performance benchmarking documentation for instructions. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst index 86672adf5..6c8cf154f 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst @@ -14,7 +14,7 @@ Training a model with Megatron-LM on ROCm `__ Docker Hub registry will be deprecated soon in favor of `rocm/primus `__. The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, - including Megatron-LM, `torchtitan, and torchtune `__. + including Megatron-LM and :doc:`torchtitan `. Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow. To learn how to migrate workloads from Megatron-LM to Primus with Megatron, diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst index 596473912..06cca9ed6 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst @@ -18,7 +18,7 @@ model training. Performance acceleration is powered by `Primus Turbo `__ Docker Hub registry will be deprecated soon in favor of `rocm/primus `__. The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, - including Megatron-LM, `torchtitan, and torchtune `__. + including Megatron-LM and :doc:`torchtitan `. Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM training ` workflow. To learn how to migrate workloads from @@ -183,7 +183,7 @@ Configuration ============= Primus defines a training configuration in YAML for each model in -`examples/megatron/configs `__. +`examples/megatron/configs `__. .. 
datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst index b8ab934c6..d243800b8 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst @@ -17,7 +17,7 @@ Primus now supports the PyTorch torchtitan backend. `__ Docker Hub registry will be deprecated soon in favor of `rocm/primus `__. The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, - including `Megatron-LM `__, torchtitan, and torchtune. + including torchtitan and :doc:`Megatron-LM `. Primus with the PyTorch torchtitan backend is designed to replace the :doc:`ROCm PyTorch training ` workflow. See diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst index a9c4de9f0..782cc61b3 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst @@ -14,7 +14,7 @@ Training a model with PyTorch on ROCm `__ Docker Hub registry will be deprecated soon in favor of `rocm/primus `__. The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, - including `Megatron-LM `__, torchtitan, and torchtune. + including torchtitan and :doc:`Megatron-LM `. See :doc:`primus-pytorch` for details. diff --git a/docs/how-to/rocm-for-ai/training/scale-model-training.rst b/docs/how-to/rocm-for-ai/training/scale-model-training.rst index 2b2ce23e3..a74bb3e0f 100644 --- a/docs/how-to/rocm-for-ai/training/scale-model-training.rst +++ b/docs/how-to/rocm-for-ai/training/scale-model-training.rst @@ -46,7 +46,7 @@ In DDP training, each process or worker owns a replica of the model and processe See the following developer blogs for more in-depth explanations and examples. -* `Multi GPU training with DDP — PyTorch Tutorials `_ +* `Multi GPU training with DDP — PyTorch Tutorials `__ * `Building a decoder transformer model on AMD GPUs — ROCm Blogs `_ From 4132a2609c49ba2ef735539d816dcaa87b1ccae7 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Mon, 27 Oct 2025 14:56:55 +0100 Subject: [PATCH 09/15] Add xdit diffusion docs (#5576) * Add xdit video diffusion base page. * Update supported accelerators. * Remove dependency on mad-tags. * Update docker pull section. * Update container launch instructions. * Improve launch instruction options and layout. * Add benchmark result outputs. * Fix wrong HunyuanVideo path * Finalize instructions. * Consistent title. * Make page and side-bar titles the same. * Updated wordlist. Removed note container reg HF. * Remove fp8_gemms in command and add release notes. * Update accelerators naming. * Add note regarding OOB performance. * Fix admonition box. * Overall fixes. 
--- .wordlist.txt | 6 + docs/conf.py | 1 + .../inference/xdit-inference-models.yaml | 38 +++ docs/how-to/rocm-for-ai/inference/index.rst | 4 +- .../inference/xdit-video-diffusion.rst | 322 ++++++++++++++++++ docs/sphinx/_toc.yml.in | 2 + 6 files changed, 372 insertions(+), 1 deletion(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst diff --git a/.wordlist.txt b/.wordlist.txt index aed9dc1cc..68185fbe9 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -220,6 +220,7 @@ href Hyperparameters HybridEngine Huggingface +Hunyuan IB ICD ICT @@ -531,6 +532,7 @@ UAC UC UCC UCX +ud UE UIF UMC @@ -842,6 +844,7 @@ pallas parallelization parallelizing param +params parameterization passthrough pe @@ -888,6 +891,7 @@ querySelectorAll queueing qwen radeon +rc rccl rdc rdma @@ -1052,6 +1056,8 @@ writebacks wrreq wzo xargs +xdit +xDiT xGMI xPacked xz diff --git a/docs/conf.py b/docs/conf.py index 5a6298e04..85c6863ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -175,6 +175,7 @@ article_pages = [ {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/xdit-video-diffusion", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml new file mode 100644 index 000000000..60f52aae7 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml @@ -0,0 +1,38 @@ +xdit_video_diffusion: + docker: + pull_tag: amdsiloai/pytorch-xdit:v25.9 + docker_hub_url: https://hub.docker.com/r/amdsiloai/pytorch-xdit + ROCm: 7.0.0rc + components: + TheRock: 7afbe45 + rccl: 9b04b2a + composable_kernel: b7a806f + rocm-libraries: f104555 + rocm-systems: 25922d0 + torch: 2.10.0a0+git3caf6da + torchvision: 0.22.0a0+966da7e + triton: 3.5.0+gitea06d636 + + model_groups: + - group: Hunyuan Video + tag: hunyuan + models: + - model: Hunyuan Video + model_name: hunyuanvideo + model_repo: tencent/HunyuanVideo + revision: refs/pr/18 + url: https://huggingface.co/tencent/HunyuanVideo + github: https://github.com/Tencent-Hunyuan/HunyuanVideo + - group: Wan-AI + tag: wan + models: + - model: Wan2.1 + model_name: wan2.1_i2v_14b_720p + model_repo: Wan-AI/Wan2.1-I2V-14B-720P + url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P + github: https://github.com/Wan-Video/Wan2.1 + - model: Wan2.2 + model_name: wan2.2-i2v-a14b + model_repo: Wan-AI/Wan2.2-I2V-A14B + url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B + github: https://github.com/Wan-Video/Wan2.2 \ No newline at end of file diff --git a/docs/how-to/rocm-for-ai/inference/index.rst b/docs/how-to/rocm-for-ai/inference/index.rst index 6eb705141..4f66fd82f 100644 --- a/docs/how-to/rocm-for-ai/inference/index.rst +++ b/docs/how-to/rocm-for-ai/inference/index.rst @@ -26,4 +26,6 @@ training, fine-tuning, and inference. 
It leverages popular machine learning fram - :doc:`SGLang inference performance testing ` -- :doc:`Deploying your model ` +- :doc:`xDiT video inference ` + +- :doc:`Deploying your model ` \ No newline at end of file diff --git a/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst b/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst new file mode 100644 index 000000000..af98cc187 --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst @@ -0,0 +1,322 @@ +.. meta:: + :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using + prebuilt and optimized docker images. + :keywords: xDiT, diffusion, video, video generation, validate, benchmark + +******************** +xDiT video inference +******************** + +.. _xdit-video-diffusion: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_video_diffusion.docker %} + {% set model_groups = data.xdit_video_diffusion.model_groups%} + + The `amdsiloai/pytorch-xdit Docker <{{ docker.docker_hub_url }}>`_ image offers a prebuilt, optimized environment based on `xDiT `_ for + benchmarking diffusion model video generation on + AMD Instinct™ MI355X, MI350X (gfx950), and MI300X GPUs. + The image runs ROCm `{{docker.ROCm}}` based on `TheRock `_ + and includes the following components: + + .. tab-set:: + + .. tab-item:: {{ docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + +Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark. + +What's new +========== + +- Initial release +- ROCm: 7.0.0rc +- Added support for AMD Instinct™ MI355X, MI350X (gfx950), and MI300X (gfx942) GPUs. +- Added support for Wan 2.1, Wan 2.2 and Hunyuan Video models with MIOpen optimizations. + +.. _xdit-video-diffusion-supported-models: + +Supported models +================ + +The following models are supported for inference performance benchmarking. +Some instructions, commands, and recommendations in this documentation might +vary by model -- select one to get started. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_video_diffusion.docker %} + {% set model_groups = data.xdit_video_diffusion.model_groups%} + + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.model_name}} + + .. note:: + + To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ + or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an + external license agreement through a third party. + + {% endfor %} + {% endfor %} + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_video_diffusion.docker %} + + For this tutorial, it's recommended to use the ``{{ docker.pull_tag }}`` Docker image. + Pull the image using the following command: + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + +Validate and benchmark +====================== + +Once the image has been downloaded you can follow these steps to +run benchmarks and generate a video. + +.. warning:: + + If your host/OS ROCm installation is below 6.4.2 (see with ``apt show rocm-libs``) you need to export + the ``HSA_NO_SCRATCH_RECLAIM=1`` environment variable inside the container, or the workload will crash. + If possible, ask your system administrator to upgrade ROCm. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.model_name}} + + The following commands are written for {{ model.model }}. + See :ref:`xdit-video-diffusion-supported-models` to switch to another available model. + + {% endfor %} + {% endfor %} + +Choose your setup method +------------------------ + +You can either use an existing Hugging Face cache or download the model fresh inside the container. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_video_diffusion.docker %} + {% set model_groups = data.xdit_video_diffusion.model_groups%} + + {% for model_group in model_groups %} + {% for model in model_group.models %} + .. container:: model-doc {{model.model_name}} + + .. tab-set:: + + .. tab-item:: Option 1: Use existing Hugging Face cache + + If you already have models downloaded on your host system, you can mount your existing cache. + + 1. Set your Hugging Face cache location. + + .. code-block:: shell + + export HF_HOME=/your/hf_cache/location + + 2. Download the model (if not already cached). + + .. code-block:: shell + + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + 3. Launch the container with mounted cache. + + .. 
code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + -e HF_HOME=/app/huggingface_models \ + -v $HF_HOME:/app/huggingface_models \ + {{ docker.pull_tag }} + + .. tab-item:: Option 2: Download inside container + + If you prefer to keep the container self-contained or don't have an existing cache. + + 1. Launch the container + + .. code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + {{ docker.pull_tag }} + + 2. Inside the container, set the Hugging Face cache location and download the model. + + .. code-block:: shell + + export HF_HOME=/your/hf_cache/location + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + .. warning:: + + Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume. + {% endfor %} + {% endfor %} + +Run inference +============= + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set model_groups = data.xdit_video_diffusion.model_groups%} + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.model_name }} + + To run the benchmarks for {{ model.model }}, use the following command: + + .. code-block:: shell + {% if model.model == "Hunyuan Video" %} + cd /app/Hunyuanvideo + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --model tencent/HunyuanVideo \ + --prompt "In the large cage, two puppies were wagging their tails at each other." \ + --height 720 --width 1280 --num_frames 129 \ + --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ + --ulysses_degree 8 \ + --enable_tiling --enable_slicing \ + --use_torch_compile \ + --bench_output results + {% endif %} + {% if model.model == "Wan2.1" %} + cd Wan2.1 + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --task i2v-14B \ + --size 720*1280 --frame_num 81 \ + --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \ + --image "/app/Wan2.1/examples/i2v_input.JPG" \ + --ulysses_size 8 --ring_size 1 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." 
\ + --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ + --offload_model 0 \ + --vae_dtype bfloat16 + {% endif %} + {% if model.model == "Wan2.2" %} + cd Wan2.2 + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --task i2v-A14B \ + --size 720*1280 --frame_num 81 \ + --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \ + --image "/app/Wan2.2/examples/i2v_input.JPG" \ + --ulysses_size 8 --ring_size 1 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ + --offload_model 0 \ + --vae_dtype bfloat16 + {% endif %} + + {% if model.model in ["Wan2.1", "Wan2.2"] %} + For additional performance improvements, consider adding the ``--compile`` flag to the above command. Note that this can significantly increase startup time on the first call. + {% endif %} + + The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% endif %} + + {% endfor %} + {% endfor %} + +Known limitations +================= + +- OOB tuning: Currently only Instinct MI300X has been tuned for in the gfx942 + series. Other gfx942 GPUs might not perform optimally out-of-the-box. \ No newline at end of file diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index a0a5084ff..253f4416f 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -117,6 +117,8 @@ subtrees: title: SGLang inference performance testing - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst title: SGLang distributed inference with Mooncake + - file: how-to/rocm-for-ai/inference/xdit-video-diffusion.rst + title: xDiT video inference - file: how-to/rocm-for-ai/inference/deploy-your-model.rst title: Deploy your model From 248cbf8bc17656487ad8f70c4903b07fa647211c Mon Sep 17 00:00:00 2001 From: Joseph Macaranas <145489236+jayhawk-commits@users.noreply.github.com> Date: Mon, 27 Oct 2025 12:14:30 -0400 Subject: [PATCH 10/15] [External CI] rccl triggers rocprofiler-sdk downstream (#5420) - Update rccl component pipeline to include new additions made to projects already in super repos. - Also update rccl to trigger rocproifler-sdk job upon completion. - rocprofiler-sdk pipeline updated to include os parameter to enable future almalinux 8 job. 
--- .azuredevops/components/rccl.yml | 189 ++++++++++++++------ .azuredevops/components/rocprofiler-sdk.yml | 34 ++-- 2 files changed, 152 insertions(+), 71 deletions(-) diff --git a/.azuredevops/components/rccl.yml b/.azuredevops/components/rccl.yml index 02af4ed68..b8bfd8c4a 100644 --- a/.azuredevops/components/rccl.yml +++ b/.azuredevops/components/rccl.yml @@ -1,10 +1,35 @@ parameters: +- name: componentName + type: string + default: rccl - name: checkoutRepo type: string default: 'self' - name: checkoutRef type: string default: '' +- name: systemsRepo + type: string + default: systems_repo +- name: systemsSparseCheckoutDir + type: string + default: 'projects/rocprofiler-sdk' +# monorepo related parameters +- name: sparseCheckoutDir + type: string + default: '' +- name: triggerDownstreamJobs + type: boolean + default: false +- name: downstreamAggregateNames + type: string + default: '' +- name: buildDependsOn + type: object + default: null +- name: unifiedBuild + type: boolean + default: false # set to true if doing full build of ROCm stack # and dependencies are pulled from same pipeline - name: aggregatePipeline @@ -57,19 +82,28 @@ parameters: type: object default: buildJobs: - - gfx942: - target: gfx942 - - gfx90a: - target: gfx90a + - { os: ubuntu2204, packageManager: apt, target: gfx942 } + - { os: ubuntu2204, packageManager: apt, target: gfx90a } testJobs: - - gfx942: - target: gfx942 - - gfx90a: - target: gfx90a + - { os: ubuntu2204, packageManager: apt, target: gfx942 } + - { os: ubuntu2204, packageManager: apt, target: gfx90a } +- name: downstreamComponentMatrix + type: object + default: + - rocprofiler-sdk: + name: rocprofiler-sdk + sparseCheckoutDir: '' + skipUnifiedBuild: 'false' + buildDependsOn: + - rccl_build jobs: - ${{ each job in parameters.jobMatrix.buildJobs }}: - - job: rccl_build_${{ job.target }} + - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }} + ${{ if parameters.buildDependsOn }}: + dependsOn: + - ${{ each build in parameters.buildDependsOn }}: + - ${{ build }}_${{ job.os }}_${{ job.target }} timeoutInMinutes: 120 variables: - group: common @@ -77,17 +111,23 @@ jobs: - name: HIP_ROCCLR_HOME value: $(Build.BinariesDirectory)/rocm pool: ${{ variables.MEDIUM_BUILD_POOL }} + ${{ if eq(job.os, 'almalinux8') }}: + container: + image: rocmexternalcicd.azurecr.io/manylinux228:latest + endpoint: ContainerService3 workspace: clean: all steps: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml parameters: aptPackages: ${{ parameters.aptPackages }} + packageManager: ${{ job.packageManager }} - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml parameters: checkoutRepo: ${{ parameters.checkoutRepo }} + sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }} submoduleBehaviour: recursive - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml parameters: @@ -97,10 +137,14 @@ jobs: parameters: checkoutRef: ${{ parameters.checkoutRef }} dependencyList: ${{ parameters.rocmDependencies }} + os: ${{ job.os }} gpuTarget: ${{ job.target }} aggregatePipeline: ${{ parameters.aggregatePipeline }} + ${{ if parameters.triggerDownstreamJobs }}: + downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }} - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml parameters: + os: ${{ job.os }} extraBuildFlags: >- 
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc @@ -112,58 +156,87 @@ jobs: -GNinja - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml parameters: + componentName: ${{ parameters.componentName }} + sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }} + os: ${{ job.os }} gpuTarget: ${{ job.target }} - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml parameters: + componentName: ${{ parameters.componentName }} + os: ${{ job.os }} gpuTarget: ${{ job.target }} - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml - parameters: - aptPackages: ${{ parameters.aptPackages }} - gpuTarget: ${{ job.target }} - extraEnvVars: - - HIP_ROCCLR_HOME:::/home/user/workspace/rocm - installLatestCMake: true + - ${{ if eq(job.os, 'ubuntu2204') }}: + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml + parameters: + aptPackages: ${{ parameters.aptPackages }} + gpuTarget: ${{ job.target }} + extraEnvVars: + - HIP_ROCCLR_HOME:::/home/user/workspace/rocm + installLatestCMake: true -- ${{ each job in parameters.jobMatrix.testJobs }}: - - job: rccl_test_${{ job.target }} - timeoutInMinutes: 120 - dependsOn: rccl_build_${{ job.target }} - condition: - and(succeeded(), - eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'), - not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])), - eq(${{ parameters.aggregatePipeline }}, False) - ) - variables: - - group: common - - template: /.azuredevops/variables-global.yml - pool: ${{ job.target }}_test_pool - workspace: - clean: all - steps: - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml - parameters: - aptPackages: ${{ parameters.aptPackages }} - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml - parameters: - gpuTarget: ${{ job.target }} - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml - parameters: - checkoutRef: ${{ parameters.checkoutRef }} - dependencyList: ${{ parameters.rocmTestDependencies }} - gpuTarget: ${{ job.target }} - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml - parameters: - componentName: rccl - testDir: '$(Agent.BuildDirectory)/rocm/bin' - testExecutable: './rccl-UnitTests' - testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes' - - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml - parameters: - aptPackages: ${{ parameters.aptPackages }} - environment: test - gpuTarget: ${{ job.target }} +- ${{ if eq(parameters.unifiedBuild, False) }}: + - ${{ each job in parameters.jobMatrix.testJobs }}: + - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }} + timeoutInMinutes: 120 + dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }} + condition: + and(succeeded(), + eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'), + not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')), + eq(${{ parameters.aggregatePipeline }}, False) + ) + variables: + - group: common + - template: 
/.azuredevops/variables-global.yml + pool: ${{ job.target }}_test_pool + workspace: + clean: all + steps: + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml + parameters: + aptPackages: ${{ parameters.aptPackages }} + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml + parameters: + preTargetFilter: ${{ parameters.componentName }} + os: ${{ job.os }} + gpuTarget: ${{ job.target }} + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml + parameters: + checkoutRef: ${{ parameters.checkoutRef }} + dependencyList: ${{ parameters.rocmTestDependencies }} + os: ${{ job.os }} + gpuTarget: ${{ job.target }} + ${{ if parameters.triggerDownstreamJobs }}: + downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }} + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml + parameters: + componentName: ${{ parameters.componentName }} + os: ${{ job.os }} + testDir: '$(Agent.BuildDirectory)/rocm/bin' + testExecutable: './rccl-UnitTests' + testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes' + - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml + parameters: + aptPackages: ${{ parameters.aptPackages }} + environment: test + gpuTarget: ${{ job.target }} + +- ${{ if parameters.triggerDownstreamJobs }}: + - ${{ each component in parameters.downstreamComponentMatrix }}: + - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}: + - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo + parameters: + checkoutRepo: ${{ parameters.systemsRepo }} + sparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }} + triggerDownstreamJobs: true + unifiedBuild: ${{ parameters.unifiedBuild }} + ${{ if parameters.unifiedBuild }}: + buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }} + downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }} + ${{ else }}: + buildDependsOn: ${{ component.buildDependsOn }} + downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }} diff --git a/.azuredevops/components/rocprofiler-sdk.yml b/.azuredevops/components/rocprofiler-sdk.yml index 596400efb..b551f46fb 100644 --- a/.azuredevops/components/rocprofiler-sdk.yml +++ b/.azuredevops/components/rocprofiler-sdk.yml @@ -79,27 +79,27 @@ parameters: type: object default: buildJobs: - - gfx942: - target: gfx942 - - gfx90a: - target: gfx90a + - { os: ubuntu2204, packageManager: apt, target: gfx942 } + - { os: ubuntu2204, packageManager: apt, target: gfx90a } testJobs: - - gfx942: - target: gfx942 - - gfx90a: - target: gfx90a + - { os: ubuntu2204, packageManager: apt, target: gfx942 } + - { os: ubuntu2204, packageManager: apt, target: gfx90a } jobs: - ${{ each job in parameters.jobMatrix.buildJobs }}: - - job: rocprofiler_sdk_build_${{ job.target }} + - job: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }} ${{ if parameters.buildDependsOn }}: dependsOn: - ${{ each build in parameters.buildDependsOn }}: - - ${{ build }}_${{ job.target }} + - ${{ build }}_${{ job.os}}_${{ job.target }} variables: - group: common - template: /.azuredevops/variables-global.yml pool: ${{ variables.MEDIUM_BUILD_POOL }} + ${{ if eq(job.os, 
'almalinux8') }}: + container: + image: rocmexternalcicd.azurecr.io/manylinux228:latest + endpoint: ContainerService3 workspace: clean: all steps: @@ -107,6 +107,7 @@ jobs: parameters: aptPackages: ${{ parameters.aptPackages }} pipModules: ${{ parameters.pipModules }} + packageManager: ${{ job.packageManager }} registerROCmPackages: true - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml @@ -118,6 +119,7 @@ jobs: parameters: checkoutRef: ${{ parameters.checkoutRef }} dependencyList: ${{ parameters.rocmDependencies }} + os: ${{ job.os }} gpuTarget: ${{ job.target }} aggregatePipeline: ${{ parameters.aggregatePipeline }} ${{ if parameters.triggerDownstreamJobs }}: @@ -132,6 +134,7 @@ jobs: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml parameters: componentName: ${{ parameters.componentName }} + os: ${{ job.os }} extraBuildFlags: >- -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm -DROCPROFILER_BUILD_TESTS=ON @@ -143,6 +146,7 @@ jobs: parameters: componentName: ${{ parameters.componentName }} sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }} + os: ${{ job.os }} gpuTarget: ${{ job.target }} - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml parameters: @@ -158,8 +162,8 @@ jobs: - ${{ if eq(parameters.unifiedBuild, False) }}: - ${{ each job in parameters.jobMatrix.testJobs }}: - - job: rocprofiler_sdk_test_${{ job.target }} - dependsOn: rocprofiler_sdk_build_${{ job.target }} + - job: rocprofiler_sdk_test_${{ job.os }}_${{ job.target }} + dependsOn: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }} condition: and(succeeded(), eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'), @@ -177,6 +181,7 @@ jobs: parameters: aptPackages: ${{ parameters.aptPackages }} pipModules: ${{ parameters.pipModules }} + packageManager: ${{ job.packageManager }} registerROCmPackages: true - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml @@ -188,6 +193,7 @@ jobs: parameters: checkoutRef: ${{ parameters.checkoutRef }} dependencyList: ${{ parameters.rocmDependencies }} + os: ${{ job.os }} gpuTarget: ${{ job.target }} ${{ if parameters.triggerDownstreamJobs }}: downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }} @@ -202,6 +208,7 @@ jobs: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml parameters: componentName: ${{ parameters.componentName }} + os: ${{ job.os }} extraBuildFlags: >- -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm -DROCPROFILER_BUILD_TESTS=ON @@ -213,7 +220,8 @@ jobs: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml parameters: componentName: ${{ parameters.componentName }} - testDir: $(Agent.BuildDirectory)/s/build + os: ${{ job.os }} + testDir: $(Agent.BuildDirectory)/build - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml parameters: aptPackages: ${{ parameters.aptPackages }} From 1515fb3779d943a65dd509e6820b61a296fda6f8 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Mon, 27 Oct 2025 16:22:28 -0400 Subject: [PATCH 11/15] Revert "Add xdit diffusion docs (#5576)" (#5580) This reverts commit 4132a2609c49ba2ef735539d816dcaa87b1ccae7. 
--- .wordlist.txt | 6 - docs/conf.py | 1 - .../inference/xdit-inference-models.yaml | 38 --- docs/how-to/rocm-for-ai/inference/index.rst | 4 +- .../inference/xdit-video-diffusion.rst | 322 ------------------ docs/sphinx/_toc.yml.in | 2 - 6 files changed, 1 insertion(+), 372 deletions(-) delete mode 100644 docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml delete mode 100644 docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst diff --git a/.wordlist.txt b/.wordlist.txt index 68185fbe9..aed9dc1cc 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -220,7 +220,6 @@ href Hyperparameters HybridEngine Huggingface -Hunyuan IB ICD ICT @@ -532,7 +531,6 @@ UAC UC UCC UCX -ud UE UIF UMC @@ -844,7 +842,6 @@ pallas parallelization parallelizing param -params parameterization passthrough pe @@ -891,7 +888,6 @@ querySelectorAll queueing qwen radeon -rc rccl rdc rdma @@ -1056,8 +1052,6 @@ writebacks wrreq wzo xargs -xdit -xDiT xGMI xPacked xz diff --git a/docs/conf.py b/docs/conf.py index 85c6863ba..5a6298e04 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -175,7 +175,6 @@ article_pages = [ {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]}, - {"file": "how-to/rocm-for-ai/inference/xdit-video-diffusion", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml deleted file mode 100644 index 60f52aae7..000000000 --- a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml +++ /dev/null @@ -1,38 +0,0 @@ -xdit_video_diffusion: - docker: - pull_tag: amdsiloai/pytorch-xdit:v25.9 - docker_hub_url: https://hub.docker.com/r/amdsiloai/pytorch-xdit - ROCm: 7.0.0rc - components: - TheRock: 7afbe45 - rccl: 9b04b2a - composable_kernel: b7a806f - rocm-libraries: f104555 - rocm-systems: 25922d0 - torch: 2.10.0a0+git3caf6da - torchvision: 0.22.0a0+966da7e - triton: 3.5.0+gitea06d636 - - model_groups: - - group: Hunyuan Video - tag: hunyuan - models: - - model: Hunyuan Video - model_name: hunyuanvideo - model_repo: tencent/HunyuanVideo - revision: refs/pr/18 - url: https://huggingface.co/tencent/HunyuanVideo - github: https://github.com/Tencent-Hunyuan/HunyuanVideo - - group: Wan-AI - tag: wan - models: - - model: Wan2.1 - model_name: wan2.1_i2v_14b_720p - model_repo: Wan-AI/Wan2.1-I2V-14B-720P - url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P - github: https://github.com/Wan-Video/Wan2.1 - - model: Wan2.2 - model_name: wan2.2-i2v-a14b - model_repo: Wan-AI/Wan2.2-I2V-A14B - url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B - github: https://github.com/Wan-Video/Wan2.2 \ No newline at end of file diff --git a/docs/how-to/rocm-for-ai/inference/index.rst b/docs/how-to/rocm-for-ai/inference/index.rst index 4f66fd82f..6eb705141 100644 --- a/docs/how-to/rocm-for-ai/inference/index.rst +++ b/docs/how-to/rocm-for-ai/inference/index.rst @@ -26,6 +26,4 @@ training, fine-tuning, and inference. 
It leverages popular machine learning fram - :doc:`SGLang inference performance testing ` -- :doc:`xDiT video inference ` - -- :doc:`Deploying your model ` \ No newline at end of file +- :doc:`Deploying your model ` diff --git a/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst b/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst deleted file mode 100644 index af98cc187..000000000 --- a/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst +++ /dev/null @@ -1,322 +0,0 @@ -.. meta:: - :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using - prebuilt and optimized docker images. - :keywords: xDiT, diffusion, video, video generation, validate, benchmark - -******************** -xDiT video inference -******************** - -.. _xdit-video-diffusion: - -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml - - {% set docker = data.xdit_video_diffusion.docker %} - {% set model_groups = data.xdit_video_diffusion.model_groups%} - - The `amdsiloai/pytorch-xdit Docker <{{ docker.docker_hub_url }}>`_ image offers a prebuilt, optimized environment based on `xDiT `_ for - benchmarking diffusion model video generation on - AMD Instinct™ MI355X, MI350X (gfx950), and MI300X GPUs. - The image runs ROCm `{{docker.ROCm}}` based on `TheRock `_ - and includes the following components: - - .. tab-set:: - - .. tab-item:: {{ docker.pull_tag }} - - .. list-table:: - :header-rows: 1 - - * - Software component - - Version - - {% for component_name, component_version in docker.components.items() %} - * - {{ component_name }} - - {{ component_version }} - {% endfor %} - -Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark. - -What's new -========== - -- Initial release -- ROCm: 7.0.0rc -- Added support for AMD Instinct™ MI355X, MI350X (gfx950), and MI300X (gfx942) GPUs. -- Added support for Wan 2.1, Wan 2.2 and Hunyuan Video models with MIOpen optimizations. - -.. _xdit-video-diffusion-supported-models: - -Supported models -================ - -The following models are supported for inference performance benchmarking. -Some instructions, commands, and recommendations in this documentation might -vary by model -- select one to get started. - -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml - - {% set docker = data.xdit_video_diffusion.docker %} - {% set model_groups = data.xdit_video_diffusion.model_groups%} - - .. raw:: html - -
-      [model selector HTML elided: two drop-down lists, "Model" (one entry per
-      ``model_group.group``) and "Variant" (one entry per ``model.model``),
-      generated with Jinja loops over ``model_groups``]
- - {% for model_group in model_groups %} - {% for model in model_group.models %} - - .. container:: model-doc {{model.model_name}} - - .. note:: - - To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ - or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an - external license agreement through a third party. - - {% endfor %} - {% endfor %} - -System validation -================= - -Before running AI workloads, it's important to validate that your AMD hardware is configured -correctly and performing optimally. - -If you have already validated your system settings, including aspects like NUMA auto-balancing, you -can skip this step. Otherwise, complete the procedures in the :ref:`System validation and -optimization ` guide to properly configure your system settings -before starting. - -To test for optimal performance, consult the recommended :ref:`System health benchmarks -`. This suite of tests will help you verify and fine-tune your -system's configuration. - -Pull the Docker image -===================== - -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml - - {% set docker = data.xdit_video_diffusion.docker %} - - For this tutorial, it's recommended to use the ``{{ docker.pull_tag }}`` Docker image. - Pull the image using the following command: - - .. code-block:: shell - - docker pull {{ docker.pull_tag }} - -Validate and benchmark -====================== - -Once the image has been downloaded you can follow these steps to -run benchmarks and generate a video. - -.. warning:: - - If your host/OS ROCm installation is below 6.4.2 (see with ``apt show rocm-libs``) you need to export - the ``HSA_NO_SCRATCH_RECLAIM=1`` environment variable inside the container, or the workload will crash. - If possible, ask your system administrator to upgrade ROCm. - -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml - - {% for model_group in model_groups %} - {% for model in model_group.models %} - - .. container:: model-doc {{model.model_name}} - - The following commands are written for {{ model.model }}. - See :ref:`xdit-video-diffusion-supported-models` to switch to another available model. - - {% endfor %} - {% endfor %} - -Choose your setup method ------------------------- - -You can either use an existing Hugging Face cache or download the model fresh inside the container. - -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml - - {% set docker = data.xdit_video_diffusion.docker %} - {% set model_groups = data.xdit_video_diffusion.model_groups%} - - {% for model_group in model_groups %} - {% for model in model_group.models %} - .. container:: model-doc {{model.model_name}} - - .. tab-set:: - - .. tab-item:: Option 1: Use existing Hugging Face cache - - If you already have models downloaded on your host system, you can mount your existing cache. - - 1. Set your Hugging Face cache location. - - .. code-block:: shell - - export HF_HOME=/your/hf_cache/location - - 2. Download the model (if not already cached). - - .. code-block:: shell - - huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} - - 3. Launch the container with mounted cache. - - .. 
code-block:: shell - - docker run \ - -it --rm \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --user root \ - --device=/dev/kfd \ - --device=/dev/dri \ - --group-add video \ - --ipc=host \ - --network host \ - --privileged \ - --shm-size 128G \ - --name pytorch-xdit \ - -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ - -e HF_HOME=/app/huggingface_models \ - -v $HF_HOME:/app/huggingface_models \ - {{ docker.pull_tag }} - - .. tab-item:: Option 2: Download inside container - - If you prefer to keep the container self-contained or don't have an existing cache. - - 1. Launch the container - - .. code-block:: shell - - docker run \ - -it --rm \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --user root \ - --device=/dev/kfd \ - --device=/dev/dri \ - --group-add video \ - --ipc=host \ - --network host \ - --privileged \ - --shm-size 128G \ - --name pytorch-xdit \ - -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ - {{ docker.pull_tag }} - - 2. Inside the container, set the Hugging Face cache location and download the model. - - .. code-block:: shell - - export HF_HOME=/your/hf_cache/location - huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} - - .. warning:: - - Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume. - {% endfor %} - {% endfor %} - -Run inference -============= - -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml - - {% set model_groups = data.xdit_video_diffusion.model_groups%} - {% for model_group in model_groups %} - {% for model in model_group.models %} - - .. container:: model-doc {{ model.model_name }} - - To run the benchmarks for {{ model.model }}, use the following command: - - .. code-block:: shell - {% if model.model == "Hunyuan Video" %} - cd /app/Hunyuanvideo - mkdir results - - torchrun --nproc_per_node=8 run.py \ - --model tencent/HunyuanVideo \ - --prompt "In the large cage, two puppies were wagging their tails at each other." \ - --height 720 --width 1280 --num_frames 129 \ - --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ - --ulysses_degree 8 \ - --enable_tiling --enable_slicing \ - --use_torch_compile \ - --bench_output results - {% endif %} - {% if model.model == "Wan2.1" %} - cd Wan2.1 - mkdir results - - torchrun --nproc_per_node=8 run.py \ - --task i2v-14B \ - --size 720*1280 --frame_num 81 \ - --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \ - --image "/app/Wan2.1/examples/i2v_input.JPG" \ - --ulysses_size 8 --ring_size 1 \ - --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." 
\ - --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ - --offload_model 0 \ - --vae_dtype bfloat16 - {% endif %} - {% if model.model == "Wan2.2" %} - cd Wan2.2 - mkdir results - - torchrun --nproc_per_node=8 run.py \ - --task i2v-A14B \ - --size 720*1280 --frame_num 81 \ - --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \ - --image "/app/Wan2.2/examples/i2v_input.JPG" \ - --ulysses_size 8 --ring_size 1 \ - --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ - --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ - --offload_model 0 \ - --vae_dtype bfloat16 - {% endif %} - - {% if model.model in ["Wan2.1", "Wan2.2"] %} - For additional performance improvements, consider adding the ``--compile`` flag to the above command. Note that this can significantly increase startup time on the first call. - {% endif %} - - The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% endif %} - - {% endfor %} - {% endfor %} - -Known limitations -================= - -- OOB tuning: Currently only Instinct MI300X has been tuned for in the gfx942 - series. Other gfx942 GPUs might not perform optimally out-of-the-box. \ No newline at end of file diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 253f4416f..a0a5084ff 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -117,8 +117,6 @@ subtrees: title: SGLang inference performance testing - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst title: SGLang distributed inference with Mooncake - - file: how-to/rocm-for-ai/inference/xdit-video-diffusion.rst - title: xDiT video inference - file: how-to/rocm-for-ai/inference/deploy-your-model.rst title: Deploy your model From 43ccfbbe80d7ef309691d4d174670cb6d160be77 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:06:22 -0400 Subject: [PATCH 12/15] Bump rocm-docs-core from 1.26.0 to 1.27.0 in /docs/sphinx (#5570) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.26.0 to 1.27.0. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.26.0...v1.27.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-version: 1.27.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/sphinx/requirements.in | 2 +- docs/sphinx/requirements.txt | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 713f8e931..4b700d8b7 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,4 +1,4 @@ -rocm-docs-core==1.26.0 +rocm-docs-core==1.27.0 sphinx-reredirects sphinx-sitemap sphinxcontrib.datatemplates==0.11.0 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 3b8d22771..832b43108 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -187,8 +187,8 @@ requests==2.32.5 # via # pygithub # sphinx -rocm-docs-core==1.26.0 - # via -r docs/sphinx/requirements.in +rocm-docs-core==1.27.0 + # via -r requirements.in rpds-py==0.27.1 # via # jsonschema @@ -230,13 +230,13 @@ sphinx-last-updated-by-git==0.3.8 sphinx-notfound-page==1.1.0 # via rocm-docs-core sphinx-reredirects==0.1.6 - # via -r docs/sphinx/requirements.in + # via -r requirements.in sphinx-sitemap==2.9.0 - # via -r docs/sphinx/requirements.in + # via -r requirements.in sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-datatemplates==0.11.0 - # via -r docs/sphinx/requirements.in + # via -r requirements.in sphinxcontrib-devhelp==2.0.0 # via sphinx sphinxcontrib-htmlhelp==2.1.0 From 61fffe3250246317406980c3312a483f0077261f Mon Sep 17 00:00:00 2001 From: Pratik Basyal Date: Tue, 28 Oct 2025 15:16:15 -0400 Subject: [PATCH 13/15] 7.0.2 Broken link, version and known issue update (#5591) * Version and known issue update * Historical compatibility updated --- RELEASE.md | 4 ++-- docs/compatibility/compatibility-matrix-historical-6.0.csv | 2 +- docs/compatibility/compatibility-matrix.rst | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 08ea37b53..e32483119 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -190,7 +190,7 @@ ROCm-LS provides the following tools to build a complete workflow for life scien * The hipCIM library provides powerful support for GPU-accelerated I/O operations, coupled with an array of computer vision and image processing primitives designed for N-dimensional image data in fields such as biomedical imaging. For more information, see the [hipCIM documentation](https://rocm.docs.amd.com/projects/hipCIM/en/latest/). -* MONAI for AMD ROCm, a ROCm-enabled version of [MONAI](https://monai.io/), is built on top of [PyTorch for AMD ROCm](https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/), helping healthcare and life science innovators to leverage GPU acceleration with AMD Instinct GPUs for high-performance inference and training of medical AI applications. For more information, see the [MONAI for AMD ROCm documentation](https://rocm.docs.amd.com/projects/monai/en/latest/). +* MONAI for AMD ROCm, a ROCm-enabled version of {fab}`github` [MONAI](https://github.com/Project-MONAI/MONAI), is built on top of [PyTorch for AMD ROCm](https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/), helping healthcare and life science innovators to leverage GPU acceleration with AMD Instinct GPUs for high-performance inference and training of medical AI applications. For more information, see the [MONAI for AMD ROCm documentation](https://rocm.docs.amd.com/projects/monai/en/latest/). 
### Deep learning and AI framework updates @@ -712,7 +712,7 @@ The issue will be resolved in a future ROCm release. See [GitHub issue #5500](ht ### Applications using OpenCV might fail due to package incompatibility between the OS -OpenCV packages built on Ubuntu 24.04 are incompatible with Debian 13 due to a version conflict. As a result, applications, tests, and samples that use OpenCV might fail. To avoid the version conflict, rebuild OpenCV with the version corresponding to Debian 13, then rebuild MIVisionX on top of it. As a workaround, rebuild OpenCV from source, followed by the application that uses OpenCV. This issue will be fixed in a future ROCm release. See [GitHub issue #5501](https://github.com/ROCm/ROCm/issues/5501). +OpenCV packages built on Ubuntu 24.04 are incompatible with Debian 13 due to a version conflict. As a result, applications, tests, and samples that use OpenCV might fail. As a workaround, rebuild OpenCV with the version corresponding to Debian 13 from source, followed by the application that uses OpenCV. This issue will be fixed in a future ROCm release. See [GitHub issue #5501](https://github.com/ROCm/ROCm/issues/5501). ## ROCm upcoming changes diff --git a/docs/compatibility/compatibility-matrix-historical-6.0.csv b/docs/compatibility/compatibility-matrix-historical-6.0.csv index 4d143f54c..ea21da05a 100644 --- a/docs/compatibility/compatibility-matrix-historical-6.0.csv +++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv @@ -96,7 +96,7 @@ ROCm Version,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6 :doc:`rocThrust `,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0 ,,,,,,,,,,,,,,,,,,,, SUPPORT LIBS,,,,,,,,,,,,,,,,,,,, - `hipother `_,7.0.51830,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830 + `hipother `_,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830 `rocm-core `_,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0 `ROCT-Thunk-Interface `_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245 ,,,,,,,,,,,,,,,,,,,, diff --git a/docs/compatibility/compatibility-matrix.rst b/docs/compatibility/compatibility-matrix.rst index 661c935dd..e0c6fa727 100644 --- a/docs/compatibility/compatibility-matrix.rst +++ b/docs/compatibility/compatibility-matrix.rst @@ -113,7 +113,7 @@ compatibility and system requirements. 
:doc:`rocThrust `,4.0.0,4.0.0,3.3.0 ,,, SUPPORT LIBS,,, - `hipother `_,7.0.51830,7.0.51830,6.4.43482 + `hipother `_,7.0.51831,7.0.51830,6.4.43482 `rocm-core `_,7.0.2,7.0.1/7.0.0,6.4.0 `ROCT-Thunk-Interface `_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_ ,,, From 9b3138cffadafa47a4b10b65561f7b547ee91aa9 Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Tue, 21 Oct 2025 16:23:27 +0200 Subject: [PATCH 14/15] [Ex CI] Add aomp, aomp-extras, composable_kernel and rocALUTION Remove libomp-dev Signed-off-by: Jan Stephan --- .azuredevops/components/rocm-examples.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.azuredevops/components/rocm-examples.yml b/.azuredevops/components/rocm-examples.yml index cd9343bb3..63dc4fdde 100644 --- a/.azuredevops/components/rocm-examples.yml +++ b/.azuredevops/components/rocm-examples.yml @@ -17,7 +17,6 @@ parameters: - libdw-dev - libglfw3-dev - libmsgpack-dev - - libomp-dev - libopencv-dev - libtbb-dev - libtiff-dev @@ -31,7 +30,10 @@ parameters: type: object default: - AMDMIGraphX + - aomp + - aomp-extras - clr + - composable_kernel - hipBLAS - hipBLAS-common - hipBLASLt @@ -45,6 +47,7 @@ parameters: - llvm-project - MIOpen - MIVisionX + - rocALUTION - rocBLAS - rocDecode - rocFFT @@ -63,7 +66,10 @@ parameters: type: object default: - AMDMIGraphX + - aomp + - aomp-extras - clr + - composable_kernel - hipBLAS - hipBLAS-common - hipBLASLt @@ -77,6 +83,7 @@ parameters: - llvm-project - MIOpen - MIVisionX + - rocALUTION - rocBLAS - rocDecode - rocFFT From 36c879b7e00443eac5ea8882d45747d7b0f0c449 Mon Sep 17 00:00:00 2001 From: Alex Xu Date: Wed, 29 Oct 2025 17:07:57 -0400 Subject: [PATCH 15/15] resolve merge conflict --- RELEASE.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index affff5d39..843ce4fdd 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1490,7 +1490,6 @@ The issue will be resolved in a future ROCm release. See [GitHub issue #5500](ht ## ROCm resolved issues -<<<<<<< HEAD The following are previously known issues resolved in this release. For resolved issues related to individual components, review the [Detailed component changes](#detailed-component-changes). @@ -1501,9 +1500,6 @@ An issue of segmentation fault in ROCprofiler-SDK that uses `std::regex` has bee ### Clang compilation failure might occur due to incorrectly installed GNU C++ runtime An issue of Clang compilation failing with the error `fatal error: 'cmath' file not found` if the GNU C++ runtime was not installed correctly has been resolved. The error indicated that the `libstdc++-dev` package, compatible with the latest installed GNU Compiler Collection (GCC) version, was missing. This issue was a result of Clang being unable to find the newest GNU C++ runtimes it recognizes and the associated header files. See [GitHub issue #4612](https://github.com/ROCm/ROCm/issues/4612). -======= -OpenCV packages built on Ubuntu 24.04 are incompatible with Debian 13 due to a version conflict. As a result, applications, tests, and samples that use OpenCV might fail. As a workaround, rebuild OpenCV with the version corresponding to Debian 13 from source, followed by the application that uses OpenCV. This issue will be fixed in a future ROCm release. See [GitHub issue #5501](https://github.com/ROCm/ROCm/issues/5501). ->>>>>>> external/develop ## ROCm upcoming changes
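
For local review, a mailbox-format series like the one above is normally applied with `git am`. The commands below are a minimal sketch and are not part of the series itself: the `*.patch` file names and the `review/rocm-patches` branch are illustrative assumptions, and they presume the patches were exported into the working directory of a ROCm/ROCm checkout.

    # Create a scratch branch and apply the 15 patches in order.
    # --3way falls back to a three-way merge when the target tree has
    # drifted, as with the RELEASE.md conflict resolved in patch 15/15.
    git checkout -b review/rocm-patches
    git am --3way *.patch

    # If a patch fails to apply cleanly:
    git am --show-current-patch=diff   # inspect the failing patch
    git am --continue                  # after resolving and staging fixes
    git am --abort                     # or drop the series entirely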