Update vLLM inference benchmark doc for 0715 release (#5058)

This commit is contained in:
Peter Park
2025-07-17 15:00:02 -04:00
committed by GitHub
parent 09460f7332
commit b437a625b3
5 changed files with 780 additions and 143 deletions

View File

@@ -0,0 +1,163 @@
vllm_benchmark:
unified_docker:
latest:
# TODO: update me
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
rocm_version: 6.4.1
vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16

View File

@@ -2,10 +2,10 @@ vllm_benchmark:
unified_docker:
latest:
# TODO: update me
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
rocm_version: 6.4.1
vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:

View File

@@ -0,0 +1,353 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-20250702>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models-20250702:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-20250702:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/5486e7bc8523be0324ccd68f221959445b56cc2a/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models-20250702>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal`{{model.precision}}` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -11,72 +11,98 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - ROCm version
- vLLM version
- PyTorch version
* - Docker image tag
- Components
- Resources
* - 6.4.1
- 0.9.1
- 2.7.0
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
-
* ROCm 6.4.1
* vLLM 0.9.1
* PyTorch 2.7.0
-
* :doc:`Documentation <../vllm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
-
* ROCm 6.4.1
* vLLM 0.9.1
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.9.1-20250702>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab>`__
* - 6.4.1
- 0.9.0.1
- 2.7.0
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605``
-
* ROCm 6.4.1
* vLLM 0.9.0.1
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.9.0.1-20250605>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c>`__
* - 6.3.1
- 0.8.5 (0.8.6.dev)
- 2.7.0
* - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521``
-
* ROCm 6.3.1
* 0.8.5 vLLM (0.8.6.dev)
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250521>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`__
* - 6.3.1
- 0.8.5
- 2.7.0
* - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513``
-
* ROCm 6.3.1
* vLLM 0.8.5
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250513>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`__
* - 6.3.1
- 0.8.3
- 2.7.0
* - ``rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415``
-
* ROCm 6.3.1
* vLLM 0.8.3
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.8.3-20250415>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`__
* - 6.3.1
- 0.7.3
- 2.7.0
* - ``rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325``
-
* ROCm 6.3.1
* vLLM 0.7.3
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.7.3-20250325>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`__
* - 6.3.1
- 0.6.6
- 2.7.0
* - ``rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6``
-
* ROCm 6.3.1
* vLLM 0.6.6
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.6.6>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`__
* - 6.2.1
- 0.6.4
- 2.5.0
* - ``rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4``
-
* ROCm 6.2.1
* vLLM 0.6.4
* PyTorch 2.5.0
-
* :doc:`Documentation <vllm-0.6.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`__
* - 6.2.0
- 0.4.3
- 2.4.0
* - ``rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50``
-
* ROCm 6.2.0
* vLLM 0.4.3
* PyTorch 2.4.0
-
* :doc:`Documentation <vllm-0.4.3>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`__

View File

@@ -20,23 +20,55 @@ vLLM inference performance testing
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
.. list-table::
:header-rows: 1
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* - Software component
- Version
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* - `ROCm <https://github.com/ROCm/ROCm>`__
- {{ unified_docker.rocm_version }}
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
* - `vLLM <https://docs.vllm.ai/en/latest>`__
- {{ unified_docker.vllm_version }}
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
* - `PyTorch <https://github.com/ROCm/pytorch>`__
- {{ unified_docker.pytorch_version }}
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
- {{ unified_docker.hipblaslt_version }}
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
What's new
==========
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
This parameter has been removed from the benchmarking script.
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
This parameter has been removed from the benchmarking script.
* Fixed a ``+rms_norm`` custom kernel issue.
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
Supported models
================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
.. _vllm-benchmark-available-models:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
@@ -85,56 +117,48 @@ vLLM inference performance testing
{% endfor %}
{% endfor %}
.. note::
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
.. _vllm-benchmark-performance-measurements:
Performance measurements
========================
Performance measurements
========================
To evaluate performance, the
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
.. important::
System validation
=================
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
Advanced features and known issues
==================================
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/5486e7bc8523be0324ccd68f221959445b56cc2a/docs/dev-docker>`__.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
System validation
=================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
Pull the Docker image
=====================
@@ -163,22 +187,26 @@ vLLM inference performance testing
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
@@ -209,75 +237,99 @@ vLLM inference performance testing
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. rubric:: Download the Docker image and required scripts
.. code-block::
1. Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
.. code-block:: shell
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
docker pull {{ unified_docker.pull_tag }}
docker run -it \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--shm-size 16G \
--security-opt seccomp=unconfined \
--security-opt apparmor=unconfined \
--cap-add=SYS_PTRACE \
-v $(pwd):/workspace \
--env HUGGINGFACE_HUB_CACHE=/workspace \
--name test \
{{ unified_docker.pull_tag }}
.. code-block::
2. In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
.. code-block:: shell
To start the benchmark, use the following command with the appropriate options.
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
.. code-block::
3. To start the benchmark, use the following command with the appropriate options.
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. dropdown:: Benchmark options
:open:
.. list-table::
:header-rows: 1
:align: center
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
Command:
.. code-block::
OSError: You are trying to access a gated repo.
./vllm_benchmark_report.sh \
-s $test_option \
-m {{model.model_repo}} \
-g $num_gpu \
-d {{model.precision}}
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. note::
Here are some examples of running the benchmark with various options.
For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. rubric:: Benchmarking examples
Here are some examples of running the benchmark with various options:
* Latency benchmark
@@ -285,7 +337,11 @@ vLLM inference performance testing
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
./vllm_benchmark_report.sh \
-s latency \
-m {{model.model_repo}} \
-g 8 \
-d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
@@ -295,7 +351,11 @@ vLLM inference performance testing
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
./vllm_benchmark_report.sh \
-s throughput \
-m {{model.model_repo}} \
-g 8 \
-d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
@@ -318,6 +378,41 @@ vLLM inference performance testing
{% endfor %}
{% endfor %}
Advanced usage
==============
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
Reproducing the Docker image
----------------------------
To reproduce this ROCm/vLLM Docker image release, follow these steps:
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
.. code-block:: shell
git clone https://github.com/ROCm/vllm.git
2. Checkout the specific release commit.
.. code-block:: shell
cd vllm
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
.. code-block:: shell
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
Known issues and workarounds
============================
AITER does not support FP8 KV cache yet.
Further reading
===============