From 1b4f25733d45e40f8294f36b30805b69e0b3e9f8 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Wed, 17 Dec 2025 09:21:57 -0500 Subject: [PATCH] vLLM inference benchmark 1210 (#5776) * Archive previous ver fix anchors * Update vllm.rst and data yaml for 20251210 --- ...vllm_0.11.1_20251103-benchmark-models.yaml | 316 ++++++++++++ .../inference/vllm-benchmark-models.yaml | 8 +- .../vllm-0.11.1-20251103.rst | 472 ++++++++++++++++++ .../previous-versions/vllm-history.rst | 16 +- .../inference/benchmark-docker/vllm.rst | 46 +- 5 files changed, 834 insertions(+), 24 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103.rst diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml new file mode 100644 index 000000000..e25149c0b --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml @@ -0,0 +1,316 @@ +dockers: + - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506 + components: + ROCm: 7.0.0 + vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700) + PyTorch: 2.9.0a0+git1c57644 + hipBLASLt: 1.0.0 + dockerfile: + commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 2 70B + mad_tag: pyt_vllm_llama-2-70b + model_repo: meta-llama/Llama-2-70b-chat-hf + url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 4096 + max_model_len: 4096 + - model: Llama 3.1 8B + mad_tag: pyt_vllm_llama-3.1-8b + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 8B FP8 + mad_tag: pyt_vllm_llama-3.1-8b_fp8 + model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV + precision: float8 + config: + tp: 1 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B + mad_tag: pyt_vllm_llama-3.1-405b + model_repo: meta-llama/Llama-3.1-405B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B FP8 + mad_tag: pyt_vllm_llama-3.1-405b_fp8 + model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B MXFP4 + mad_tag: pyt_vllm_llama-3.1-405b_fp4 + model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview + precision: float4 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.3 
70B + mad_tag: pyt_vllm_llama-3.3-70b + model_repo: meta-llama/Llama-3.3-70B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.3 70B FP8 + mad_tag: pyt_vllm_llama-3.3-70b_fp8 + model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.3 70B MXFP4 + mad_tag: pyt_vllm_llama-3.3-70b_fp4 + model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview + url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview + precision: float4 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 4 Scout 17Bx16E + mad_tag: pyt_vllm_llama-4-scout-17b-16e + model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct + url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Llama 4 Maverick 17Bx128E + mad_tag: pyt_vllm_llama-4-maverick-17b-128e + model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct + url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Llama 4 Maverick 17Bx128E FP8 + mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8 + model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek R1 0528 FP8 + mad_tag: pyt_vllm_deepseek-r1 + model_repo: deepseek-ai/DeepSeek-R1-0528 + url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528 + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_seqs: 1024 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - group: OpenAI GPT OSS + tag: gpt-oss + models: + - model: GPT OSS 20B + mad_tag: pyt_vllm_gpt-oss-20b + model_repo: openai/gpt-oss-20b + url: https://huggingface.co/openai/gpt-oss-20b + precision: bfloat16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 8192 + max_model_len: 8192 + - model: GPT OSS 120B + mad_tag: pyt_vllm_gpt-oss-120b + model_repo: openai/gpt-oss-120b + url: https://huggingface.co/openai/gpt-oss-120b + precision: bfloat16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 8192 + max_model_len: 8192 + - group: Mistral AI + tag: mistral + models: + - model: Mixtral MoE 8x7B + mad_tag: pyt_vllm_mixtral-8x7b + model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Mixtral MoE 8x7B FP8 + mad_tag: pyt_vllm_mixtral-8x7b_fp8 + model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 
32768 + max_model_len: 8192 + - model: Mixtral MoE 8x22B + mad_tag: pyt_vllm_mixtral-8x22b + model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 65536 + max_model_len: 8192 + - model: Mixtral MoE 8x22B FP8 + mad_tag: pyt_vllm_mixtral-8x22b_fp8 + model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 65536 + max_model_len: 8192 + - group: Qwen + tag: qwen + models: + - model: Qwen3 8B + mad_tag: pyt_vllm_qwen3-8b + model_repo: Qwen/Qwen3-8B + url: https://huggingface.co/Qwen/Qwen3-8B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 32B + mad_tag: pyt_vllm_qwen3-32b + model_repo: Qwen/Qwen3-32b + url: https://huggingface.co/Qwen/Qwen3-32B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 30B A3B + mad_tag: pyt_vllm_qwen3-30b-a3b + model_repo: Qwen/Qwen3-30B-A3B + url: https://huggingface.co/Qwen/Qwen3-30B-A3B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 30B A3B FP8 + mad_tag: pyt_vllm_qwen3-30b-a3b_fp8 + model_repo: Qwen/Qwen3-30B-A3B-FP8 + url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8 + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 235B A22B + mad_tag: pyt_vllm_qwen3-235b-a22b + model_repo: Qwen/Qwen3-235B-A22B + url: https://huggingface.co/Qwen/Qwen3-235B-A22B + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 235B A22B FP8 + mad_tag: pyt_vllm_qwen3-235b-a22b_fp8 + model_repo: Qwen/Qwen3-235B-A22B-FP8 + url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8 + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 40960 + max_model_len: 8192 + - group: Microsoft Phi + tag: phi + models: + - model: Phi-4 + mad_tag: pyt_vllm_phi-4 + model_repo: microsoft/phi-4 + url: https://huggingface.co/microsoft/phi-4 + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 16384 + max_model_len: 8192 diff --git a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml index e25149c0b..189644efc 100644 --- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml @@ -1,13 +1,13 @@ dockers: - - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103 - docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506 + - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.2_20251210/images/sha256-e7f02dd2ce3824959658bc0391296f6158638e3ebce164f6c019c4eca8150ec7 components: ROCm: 7.0.0 - vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700) + vLLM: 0.11.2 (0.11.2.dev673+g839868462.rocm700) 
PyTorch: 2.9.0a0+git1c57644 hipBLASLt: 1.0.0 dockerfile: - commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72 + commit: 8398684622109c806a35d660647060b0b9910663 model_groups: - group: Meta Llama tag: llama diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103.rst new file mode 100644 index 000000000..29378f55a --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103.rst @@ -0,0 +1,472 @@ +:orphan: + +.. meta:: + :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image. + :keywords: model, MAD, automation, dashboarding, validate + +********************************** +vLLM inference performance testing +********************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm vLLM + inference performance documentation. See :doc:`../vllm` for the latest version. + +.. _vllm-benchmark-unified-docker-1103: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + + The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a + prebuilt, optimized environment for validating large language model (LLM) + inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X + GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored + specifically for AMD data center GPUs and includes the following components: + + .. tab-set:: + + .. tab-item:: {{ docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + +With this Docker image, you can quickly test the :ref:`expected +inference performance numbers ` for +AMD Instinct GPUs. + +What's new +========== + +The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release `. + +* Enabled :ref:`AITER ` by default. + +* Fixed ``rms_norm`` segfault issue with Qwen 3 235B. + +* Known performance degradation on Llama 4 models due to `an upstream vLLM issue `_. + +.. _vllm-benchmark-supported-models-1103: + +Supported models +================ + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} + + .. _vllm-benchmark-available-models-1103: + + The following models are supported for inference performance benchmarking + with vLLM and ROCm. Some instructions, commands, and recommendations in this + documentation might vary by model -- select one to get started. MXFP4 models + are only supported on MI355X and MI350X GPUs. + + .. raw:: html + +
+         <!-- Model selector: the "Model" dropdown lists each model group and
+              the "Variant" dropdown lists each model in the selected group.
+              Selecting a variant shows the matching per-model instructions below. -->
+ + .. _vllm-benchmark-vllm-1103: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + + {% if model.precision == "float4" %} + .. important:: + + MXFP4 is supported only on MI355X and MI350X GPUs. + {% endif %} + + .. note:: + + See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. + Some models require access authorization prior to use via an external license agreement through a third party. + {% if model.precision == "float8" and model.model_repo.startswith("amd") %} + This model uses FP8 quantization via `AMD Quark `__ for efficient inference on AMD GPUs. + {% endif %} + {% if model.precision == "float4" and model.model_repo.startswith("amd") %} + This model uses FP4 quantization via `AMD Quark `__ for efficient inference on AMD GPUs. + {% endif %} + + {% endfor %} + {% endfor %} + +.. _vllm-benchmark-performance-measurements-1103: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `_ +page provides reference throughput and serving measurements for inferencing popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `_ + only reflects the latest version of this inference benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + + Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_. + Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + +Benchmarking +============ + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} + + Once the setup is complete, choose between two options to reproduce the + benchmark results: + + .. _vllm-benchmark-mad-1103: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + The following run command is tailored to {{ model.model }}. + See :ref:`vllm-benchmark-supported-models-1103` to switch to another available model. + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. 
code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. On the host machine, use this command to run the performance benchmark test on + the `{{model.model}} <{{ model.url }}>`_ model using one node with the + :literal:`{{model.precision}}` data type. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{model.mad_tag}} \ + --keep-model-dir \ + --live-output + + MAD launches a Docker container with the name + ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the + model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` + and ``{{ model.mad_tag }}_serving.csv``. + + Although the :ref:`available models + ` are preconfigured to collect + offline throughput and online serving performance data, you can + also change the benchmarking parameters. See the standalone + benchmarking tab for more information. + + {% if model.tunableop %} + + .. note:: + + For improved performance, consider enabling :ref:`PyTorch TunableOp `. + TunableOp automatically explores different implementations and configurations of certain PyTorch + operators to find the fastest one for your hardware. + + By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see + ``__). To enable it, include + the ``--tunableop on`` argument in your run. + + Enabling TunableOp triggers a two-pass run -- a warm-up followed by the + performance-collection run. + + {% endif %} + + .. tab-item:: Standalone benchmarking + + The following commands are optimized for {{ model.model }}. + See :ref:`vllm-benchmark-supported-models-1103` to switch to another available model. + + .. seealso:: + + For more information on configuration, see the `config files + `__ + in the MAD repository. Refer to the `vLLM engine `__ + for descriptions of available configuration options + and `Benchmarking vLLM `__ for + additional benchmarking information. + + .. rubric:: Launch the container + + You can run the vLLM benchmark tool independently by starting the + `Docker container <{{ docker.docker_hub_url }}>`_ as shown + in the following snippet. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + docker run -it \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --shm-size 16G \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --cap-add=SYS_PTRACE \ + -v $(pwd):/workspace \ + --env HUGGINGFACE_HUB_CACHE=/workspace \ + --name test \ + {{ docker.pull_tag }} + + .. rubric:: Throughput command + + Use the following command to start the throughput benchmark. + + .. 
code-block:: shell + + model={{ model.model_repo }} + tp={{ model.config.tp }} + num_prompts={{ model.config.num_prompts | default(1024) }} + in={{ model.config.in | default(128) }} + out={{ model.config.in | default(128) }} + dtype={{ model.config.dtype | default("auto") }} + kv_cache_dtype={{ model.config.kv_cache_dtype }} + max_num_seqs={{ model.config.max_num_seqs | default(1024) }} + max_num_batched_tokens={{ model.config.max_num_batched_tokens }} + max_model_len={{ model.config.max_model_len }} + + vllm bench throughput --model $model \ + -tp $tp \ + --num-prompts $num_prompts \ + --input-len $in \ + --output-len $out \ + --dtype $dtype \ + --kv-cache-dtype $kv_cache_dtype \ + --max-num-seqs $max_num_seqs \ + --max-num-batched-tokens $max_num_batched_tokens \ + --max-model-len $max_model_len \ + --trust-remote-code \ + --output-json ${model}_throughput.json \ + --gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }} + + .. rubric:: Serving command + + 1. Start the server using the following command: + + .. code-block:: shell + + model={{ model.model_repo }} + tp={{ model.config.tp }} + dtype={{ model.config.dtype }} + kv_cache_dtype={{ model.config.kv_cache_dtype }} + max_num_seqs=256 + max_num_batched_tokens={{ model.config.max_num_batched_tokens }} + max_model_len={{ model.config.max_model_len }} + + vllm serve $model \ + -tp $tp \ + --dtype $dtype \ + --kv-cache-dtype $kv_cache_dtype \ + --max-num-seqs $max_num_seqs \ + --max-num-batched-tokens $max_num_batched_tokens \ + --max-model-len $max_model_len \ + --no-enable-prefix-caching \ + --swap-space 16 \ + --disable-log-requests \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 + + Wait until the model has loaded and the server is ready to accept requests. + + 2. On another terminal on the same machine, run the benchmark: + + .. code-block:: shell + + # Connect to the container + docker exec -it test bash + + # Wait for the server to start + until curl -s http://localhost:8000/v1/models; do sleep 30; done + + # Run the benchmark + model={{ model.model_repo }} + max_concurrency=1 + num_prompts=10 + in=128 + out=128 + vllm bench serve --model $model \ + --percentile-metrics "ttft,tpot,itl,e2el" \ + --dataset-name random \ + --ignore-eos \ + --max-concurrency $max_concurrency \ + --num-prompts $num_prompts \ + --random-input-len $in \ + --random-output-len $out \ + --trust-remote-code \ + --save-result \ + --result-filename ${model}_serving.json + + .. note:: + + For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B, + try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands. + + If you encounter the following error, pass your access-authorized Hugging + Face token to the gated models. + + .. code-block:: + + OSError: You are trying to access a gated repo. + + # pass your HF_TOKEN + export HF_TOKEN=$your_personal_hf_token + + .. raw:: html + + + + .. note:: + + Throughput is calculated as: + + - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time + + - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time + {% endfor %} + {% endfor %} + +Advanced usage +============== + +For information on experimental features and known issues related to ROCm optimization efforts on vLLM, +see the developer's guide at ``__. + +.. 
note:: + + If you’re using this Docker image on other AMD GPUs such as the AMD Instinct MI200 Series or Radeon, add ``export VLLM_ROCM_USE_AITER=0`` to your command, since AITER is only supported on gfx942 and gfx950 architectures. + +Reproducing the Docker image +---------------------------- + +To reproduce this ROCm-enabled vLLM Docker image release, follow these steps: + +1. Clone the `vLLM repository `__. + + .. code-block:: shell + + git clone https://github.com/vllm-project/vllm.git + cd vllm + +2. Use the following command to build the image directly from the specified commit. + + .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + .. code-block:: shell + + docker build -f docker/Dockerfile.rocm \ + --build-arg REMOTE_VLLM=1 \ + --build-arg VLLM_REPO=https://github.com/ROCm/vllm \ + --build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \ + -t vllm-rocm . + + .. tip:: + + Replace ``vllm-rocm`` with your desired image tag. + +Further reading +=============== + +- To learn more about the options for latency and throughput benchmark scripts, + see ``_. + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization `_. + +- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for + a brief introduction to vLLM and optimization strategies. + +- For application performance optimization strategies for HPC and AI workloads, + including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`vllm-history` to find documentation for previous releases +of the ``ROCm/vllm`` Docker image. diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst index 9ee280d6c..c1e50d7e3 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst @@ -16,15 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub ` + * `Docker Hub `__ + + * - ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103`` - * ROCm 7.0.0 * vLLM 0.11.1 * PyTorch 2.9.0 - - * :doc:`Documentation <../vllm>` - * `Docker Hub `__ + * :doc:`Documentation ` + * `Docker Hub `__ * - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006`` - diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst index 2258c3417..2d5dca5c4 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst @@ -6,7 +6,7 @@ vLLM inference performance testing ********************************** -.. _vllm-benchmark-unified-docker-1024: +.. _vllm-benchmark-unified-docker-1210: .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml @@ -34,21 +34,18 @@ vLLM inference performance testing {% endfor %} With this Docker image, you can quickly test the :ref:`expected -inference performance numbers ` for +inference performance numbers ` for AMD Instinct GPUs. 
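
For a quick sanity check before running the full benchmark suites, a minimal
single-GPU throughput run along the lines of the following sketch can be used.
It assumes the Llama 3.1 8B settings from the supported-models table, reduces
the prompt count for speed, and expects a Hugging Face token for gated models;
its numbers are not comparable to the published benchmark results.

.. code-block:: shell

   docker pull rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video \
     --shm-size 16G --security-opt seccomp=unconfined \
     -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace \
     rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210

   # Inside the container: a short offline throughput run on one GPU.
   export HF_TOKEN=$your_personal_hf_token
   vllm bench throughput --model meta-llama/Llama-3.1-8B-Instruct \
     -tp 1 --num-prompts 64 --input-len 128 --output-len 128 \
     --dtype auto --kv-cache-dtype auto --max-model-len 8192 \
     --trust-remote-code
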
What's new ========== -The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release `. +The following is summary of notable changes since the :doc:`previous ROCm/vLLM +Docker release `. -* Enabled :ref:`AITER ` by default. +- Improved performance on Llama 3 MXFP4 through AITER optimizations and improved kernel fusion. -* Fixed ``rms_norm`` segfault issue with Qwen 3 235B. - -* Known performance degradation on Llama 4 models due to `an upstream vLLM issue `_. - -.. _vllm-benchmark-supported-models-1024: +.. _vllm-benchmark-supported-models-1210: Supported models ================ @@ -58,7 +55,7 @@ Supported models {% set docker = data.dockers[0] %} {% set model_groups = data.model_groups %} - .. _vllm-benchmark-available-models-1024: + .. _vllm-benchmark-available-models-1210: The following models are supported for inference performance benchmarking with vLLM and ROCm. Some instructions, commands, and recommendations in this @@ -94,7 +91,7 @@ Supported models - .. _vllm-benchmark-vllm-1024: + .. _vllm-benchmark-vllm-1210: {% for model_group in model_groups %} {% for model in model_group.models %} @@ -108,6 +105,15 @@ Supported models MXFP4 is supported only on MI355X and MI350X GPUs. {% endif %} + {% if model.mad_tag in ["pyt_vllm_mixtral-8x7b", "pyt_vllm_mixtral-8x7b_fp8", "pyt_vllm_mixtral-8x22b", "pyt_vllm_mixtral-8x22b_fp8", "pyt_vllm_deepseek-r1"] %} + .. caution:: + + There is a known regression with AITER for MoE models such as Mixtral and + DeepSeek-R1. Consider using the :doc:`previous release + ` + ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103`` for better performance. + {% endif %} + .. note:: See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. @@ -122,7 +128,7 @@ Supported models {% endfor %} {% endfor %} -.. _vllm-benchmark-performance-measurements-1024: +.. _vllm-benchmark-performance-measurements-1210: Performance measurements ======================== @@ -178,7 +184,7 @@ Benchmarking Once the setup is complete, choose between two options to reproduce the benchmark results: - .. _vllm-benchmark-mad-1024: + .. _vllm-benchmark-mad-1210: {% for model_group in model_groups %} {% for model in model_group.models %} @@ -190,7 +196,7 @@ Benchmarking .. tab-item:: MAD-integrated benchmarking The following run command is tailored to {{ model.model }}. - See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model. + See :ref:`vllm-benchmark-supported-models-1210` to switch to another available model. 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local directory and install the required packages on the host machine. @@ -219,7 +225,7 @@ Benchmarking and ``{{ model.mad_tag }}_serving.csv``. Although the :ref:`available models - ` are preconfigured to collect + ` are preconfigured to collect offline throughput and online serving performance data, you can also change the benchmarking parameters. See the standalone benchmarking tab for more information. @@ -244,7 +250,7 @@ Benchmarking .. tab-item:: Standalone benchmarking The following commands are optimized for {{ model.model }}. - See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model. + See :ref:`vllm-benchmark-supported-models-1210` to switch to another available model. .. seealso:: @@ -438,6 +444,14 @@ To reproduce this ROCm-enabled vLLM Docker image release, follow these steps: Replace ``vllm-rocm`` with your desired image tag. 
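
As a quick check that a locally built image starts correctly (a sketch; the
``vllm-rocm`` tag and the version probe are illustrative), run it with the same
device flags used elsewhere in this guide and print the installed vLLM version:

.. code-block:: shell

   docker run -it --rm \
     --device=/dev/kfd --device=/dev/dri --group-add video \
     --security-opt seccomp=unconfined \
     vllm-rocm \
     python3 -c "import vllm; print(vllm.__version__)"
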
+Known issues
+============
+
+There is a known regression with AITER for MoE models such as Mixtral and
+DeepSeek-R1. Consider using the :doc:`previous release
+<previous-versions/vllm-0.11.1-20251103>`
+(``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``) for better performance.
+
 Further reading
 ===============