From 7ee22790ce2889c3e310a40988e0e0fe721a6fe9 Mon Sep 17 00:00:00 2001 From: Peter Park Date: Thu, 14 Aug 2025 15:43:36 -0400 Subject: [PATCH] docs: Update vLLM benchmark doc for 20250812 Docker release (#5196) --- .wordlist.txt | 1 + docs/conf.py | 2 + .../vllm_0.9.1_20250715-benchmark_models.yaml | 163 +++++++ .../inference/vllm-benchmark-models.yaml | 83 +--- .../previous-versions/vllm-0.9.1-20250702.rst | 6 +- .../previous-versions/vllm-0.9.1-20250715.rst | 450 ++++++++++++++++++ .../previous-versions/vllm-history.rst | 13 +- .../inference/benchmark-docker/vllm.rst | 149 +++--- 8 files changed, 706 insertions(+), 161 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst diff --git a/.wordlist.txt b/.wordlist.txt index c32752d7c..7b592fc91 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -5,6 +5,7 @@ ACEs ACS AccVGPR AccVGPRs +AITER ALU AllReduce AMD diff --git a/docs/conf.py b/docs/conf.py index c4753c4b7..17ed6810c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -147,6 +147,8 @@ article_pages = [ {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml new file mode 100644 index 000000000..5682828ce --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml @@ -0,0 +1,163 @@ +vllm_benchmark: + unified_docker: + latest: + # TODO: update me + pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea + rocm_version: 6.4.1 + vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641) + pytorch_version: 2.7.0+gitf717b2a + hipblaslt_version: 0.15 + model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.1 8B + mad_tag: pyt_vllm_llama-3.1-8b + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: float16 + - model: Llama 3.1 70B + mad_tag: pyt_vllm_llama-3.1-70b + model_repo: meta-llama/Llama-3.1-70B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct + precision: float16 + - model: Llama 3.1 405B + mad_tag: pyt_vllm_llama-3.1-405b + model_repo: meta-llama/Llama-3.1-405B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct + precision: float16 + - model: Llama 2 7B + mad_tag: pyt_vllm_llama-2-7b + model_repo: meta-llama/Llama-2-7b-chat-hf + url: 
https://huggingface.co/meta-llama/Llama-2-7b-chat-hf + precision: float16 + - model: Llama 2 70B + mad_tag: pyt_vllm_llama-2-70b + model_repo: meta-llama/Llama-2-70b-chat-hf + url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + precision: float16 + - model: Llama 3.1 8B FP8 + mad_tag: pyt_vllm_llama-3.1-8b_fp8 + model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV + precision: float8 + - model: Llama 3.1 70B FP8 + mad_tag: pyt_vllm_llama-3.1-70b_fp8 + model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV + precision: float8 + - model: Llama 3.1 405B FP8 + mad_tag: pyt_vllm_llama-3.1-405b_fp8 + model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV + precision: float8 + - group: Mistral AI + tag: mistral + models: + - model: Mixtral MoE 8x7B + mad_tag: pyt_vllm_mixtral-8x7b + model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + precision: float16 + - model: Mixtral MoE 8x22B + mad_tag: pyt_vllm_mixtral-8x22b + model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 + precision: float16 + - model: Mistral 7B + mad_tag: pyt_vllm_mistral-7b + model_repo: mistralai/Mistral-7B-Instruct-v0.3 + url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3 + precision: float16 + - model: Mixtral MoE 8x7B FP8 + mad_tag: pyt_vllm_mixtral-8x7b_fp8 + model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + precision: float8 + - model: Mixtral MoE 8x22B FP8 + mad_tag: pyt_vllm_mixtral-8x22b_fp8 + model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + precision: float8 + - model: Mistral 7B FP8 + mad_tag: pyt_vllm_mistral-7b_fp8 + model_repo: amd/Mistral-7B-v0.1-FP8-KV + url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV + precision: float8 + - group: Qwen + tag: qwen + models: + - model: Qwen2 7B + mad_tag: pyt_vllm_qwen2-7b + model_repo: Qwen/Qwen2-7B-Instruct + url: https://huggingface.co/Qwen/Qwen2-7B-Instruct + precision: float16 + - model: Qwen2 72B + mad_tag: pyt_vllm_qwen2-72b + model_repo: Qwen/Qwen2-72B-Instruct + url: https://huggingface.co/Qwen/Qwen2-72B-Instruct + precision: float16 + - model: QwQ-32B + mad_tag: pyt_vllm_qwq-32b + model_repo: Qwen/QwQ-32B + url: https://huggingface.co/Qwen/QwQ-32B + precision: float16 + tunableop: true + - group: Databricks DBRX + tag: dbrx + models: + - model: DBRX Instruct + mad_tag: pyt_vllm_dbrx-instruct + model_repo: databricks/dbrx-instruct + url: https://huggingface.co/databricks/dbrx-instruct + precision: float16 + - model: DBRX Instruct FP8 + mad_tag: pyt_vllm_dbrx_fp8 + model_repo: amd/dbrx-instruct-FP8-KV + url: https://huggingface.co/amd/dbrx-instruct-FP8-KV + precision: float8 + - group: Google Gemma + tag: gemma + models: + - model: Gemma 2 27B + mad_tag: pyt_vllm_gemma-2-27b + model_repo: google/gemma-2-27b + url: https://huggingface.co/google/gemma-2-27b + precision: float16 + - group: Cohere + tag: cohere + models: + - model: C4AI Command R+ 08-2024 + mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024 + model_repo: CohereForAI/c4ai-command-r-plus-08-2024 + url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024 + precision: float16 + - model: C4AI Command R+ 08-2024 FP8 + mad_tag: 
pyt_vllm_command-r-plus_fp8 + model_repo: amd/c4ai-command-r-plus-FP8-KV + url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV + precision: float8 + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek MoE 16B + mad_tag: pyt_vllm_deepseek-moe-16b-chat + model_repo: deepseek-ai/deepseek-moe-16b-chat + url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat + precision: float16 + - group: Microsoft Phi + tag: phi + models: + - model: Phi-4 + mad_tag: pyt_vllm_phi-4 + model_repo: microsoft/phi-4 + url: https://huggingface.co/microsoft/phi-4 + - group: TII Falcon + tag: falcon + models: + - model: Falcon 180B + mad_tag: pyt_vllm_falcon-180b + model_repo: tiiuae/falcon-180B + url: https://huggingface.co/tiiuae/falcon-180B + precision: float16 diff --git a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml index 5682828ce..5c3b1b51e 100644 --- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml @@ -2,11 +2,11 @@ vllm_benchmark: unified_docker: latest: # TODO: update me - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715 - docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea + pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812 + docker_hub_url: rocm_version: 6.4.1 - vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641) - pytorch_version: 2.7.0+gitf717b2a + vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641) + pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a) hipblaslt_version: 0.15 model_groups: - group: Meta Llama @@ -27,11 +27,6 @@ vllm_benchmark: model_repo: meta-llama/Llama-3.1-405B-Instruct url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct precision: float16 - - model: Llama 2 7B - mad_tag: pyt_vllm_llama-2-7b - model_repo: meta-llama/Llama-2-7b-chat-hf - url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf - precision: float16 - model: Llama 2 70B mad_tag: pyt_vllm_llama-2-70b model_repo: meta-llama/Llama-2-70b-chat-hf @@ -65,11 +60,6 @@ vllm_benchmark: model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 precision: float16 - - model: Mistral 7B - mad_tag: pyt_vllm_mistral-7b - model_repo: mistralai/Mistral-7B-Instruct-v0.3 - url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3 - precision: float16 - model: Mixtral MoE 8x7B FP8 mad_tag: pyt_vllm_mixtral-8x7b_fp8 model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV @@ -80,72 +70,15 @@ vllm_benchmark: model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV precision: float8 - - model: Mistral 7B FP8 - mad_tag: pyt_vllm_mistral-7b_fp8 - model_repo: amd/Mistral-7B-v0.1-FP8-KV - url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV - precision: float8 - group: Qwen tag: qwen models: - - model: Qwen2 7B - mad_tag: pyt_vllm_qwen2-7b - model_repo: Qwen/Qwen2-7B-Instruct - url: https://huggingface.co/Qwen/Qwen2-7B-Instruct - precision: float16 - - model: Qwen2 72B - mad_tag: pyt_vllm_qwen2-72b - model_repo: Qwen/Qwen2-72B-Instruct - url: https://huggingface.co/Qwen/Qwen2-72B-Instruct - precision: float16 - model: QwQ-32B mad_tag: pyt_vllm_qwq-32b model_repo: Qwen/QwQ-32B url: https://huggingface.co/Qwen/QwQ-32B precision: float16 tunableop: true - - group: Databricks DBRX - tag: dbrx - models: 
- - model: DBRX Instruct - mad_tag: pyt_vllm_dbrx-instruct - model_repo: databricks/dbrx-instruct - url: https://huggingface.co/databricks/dbrx-instruct - precision: float16 - - model: DBRX Instruct FP8 - mad_tag: pyt_vllm_dbrx_fp8 - model_repo: amd/dbrx-instruct-FP8-KV - url: https://huggingface.co/amd/dbrx-instruct-FP8-KV - precision: float8 - - group: Google Gemma - tag: gemma - models: - - model: Gemma 2 27B - mad_tag: pyt_vllm_gemma-2-27b - model_repo: google/gemma-2-27b - url: https://huggingface.co/google/gemma-2-27b - precision: float16 - - group: Cohere - tag: cohere - models: - - model: C4AI Command R+ 08-2024 - mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024 - model_repo: CohereForAI/c4ai-command-r-plus-08-2024 - url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024 - precision: float16 - - model: C4AI Command R+ 08-2024 FP8 - mad_tag: pyt_vllm_command-r-plus_fp8 - model_repo: amd/c4ai-command-r-plus-FP8-KV - url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV - precision: float8 - - group: DeepSeek - tag: deepseek - models: - - model: DeepSeek MoE 16B - mad_tag: pyt_vllm_deepseek-moe-16b-chat - model_repo: deepseek-ai/deepseek-moe-16b-chat - url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat - precision: float16 - group: Microsoft Phi tag: phi models: @@ -153,11 +86,3 @@ vllm_benchmark: mad_tag: pyt_vllm_phi-4 model_repo: microsoft/phi-4 url: https://huggingface.co/microsoft/phi-4 - - group: TII Falcon - tag: falcon - models: - - model: Falcon 180B - mad_tag: pyt_vllm_falcon-180b - model_repo: tiiuae/falcon-180B - url: https://huggingface.co/tiiuae/falcon-180B - precision: float16 diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst index 80cd9b9c2..a482c27c7 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst @@ -14,7 +14,7 @@ vLLM inference performance testing This documentation does not reflect the latest version of ROCm vLLM inference performance documentation. See :doc:`../vllm` for the latest version. -.. _vllm-benchmark-unified-docker: +.. _vllm-benchmark-unified-docker-702: .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml @@ -77,7 +77,7 @@ vLLM inference performance testing - .. _vllm-benchmark-vllm: + .. _vllm-benchmark-vllm-702: {% for model_group in model_groups %} {% for model in model_group.models %} @@ -159,7 +159,7 @@ vLLM inference performance testing Once the setup is complete, choose between two options to reproduce the benchmark results: - .. _vllm-benchmark-mad: + .. _vllm-benchmark-mad-702: {% for model_group in model_groups %} {% for model in model_group.models %} diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst new file mode 100644 index 000000000..f2850b09c --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst @@ -0,0 +1,450 @@ +:orphan: + +.. meta:: + :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the + ROCm vLLM Docker image. 
+ :keywords: model, MAD, automation, dashboarding, validate + +********************************** +vLLM inference performance testing +********************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm vLLM + inference performance documentation. See :doc:`../vllm` for the latest version. + +.. _vllm-benchmark-unified-docker-715: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml + + {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} + {% set model_groups = data.vllm_benchmark.model_groups %} + + The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers + a prebuilt, optimized environment for validating large language model (LLM) + inference performance on AMD Instinctâ„¢ MI300X series accelerators. This ROCm vLLM + Docker image integrates vLLM and PyTorch tailored specifically for MI300X series + accelerators and includes the following components: + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + * - `ROCm `__ + - {{ unified_docker.rocm_version }} + + * - `vLLM `__ + - {{ unified_docker.vllm_version }} + + * - `PyTorch `__ + - {{ unified_docker.pytorch_version }} + + * - `hipBLASLt `__ + - {{ unified_docker.hipblaslt_version }} + +With this Docker image, you can quickly test the :ref:`expected +inference performance numbers ` for +MI300X series accelerators. + +What's new +========== + +The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release `. + +* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default. + This parameter has been removed from the benchmarking script. + +* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``. + This parameter has been removed from the benchmarking script. + +* Fixed a ``+rms_norm`` custom kernel issue. + +* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``. + +* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix. + +Supported models +================ + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml + + {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} + {% set model_groups = data.vllm_benchmark.model_groups %} + + .. _vllm-benchmark-available-models-715: + + The following models are supported for inference performance benchmarking + with vLLM and ROCm. Some instructions, commands, and recommendations in this + documentation might vary by model -- select one to get started. + + .. raw:: html + +
+      <div class="model-select">
+        <label>Model group</label>
+        <select id="model-group-selector">
+          {% for model_group in model_groups %}
+          <option value="{{ model_group.tag }}">{{ model_group.group }}</option>
+          {% endfor %}
+        </select>
+      </div>
+
+      <div class="model-select">
+        <label>Model</label>
+        <select id="model-selector">
+          {% for model_group in model_groups %}
+          {% set models = model_group.models %}
+          {% for model in models %}
+          {% if models|length % 3 == 0 %}
+          <option class="{{ model_group.tag }}" value="{{ model.mad_tag }}">{{ model.model }}</option>
+          {% else %}
+          <option class="{{ model_group.tag }}" value="{{ model.mad_tag }}">{{ model.model }}</option>
+          {% endif %}
+          {% endfor %}
+          {% endfor %}
+        </select>
+      </div>
+ + .. _vllm-benchmark-vllm-715: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + .. note:: + + See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. + Some models require access authorization prior to use via an external license agreement through a third party. + + {% endfor %} + {% endfor %} + +.. note:: + + vLLM is a toolkit and library for LLM inference and serving. AMD implements + high-performance custom kernels and modules in vLLM to enhance performance. + See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for + more information. + +.. _vllm-benchmark-performance-measurements-715: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `_ +page provides reference throughput and latency measurements for inferencing popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `_ + only reflects the latest version of this inference benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml + + {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} + {% set model_groups = data.vllm_benchmark.model_groups %} + + Pull the Docker image + ===================== + + Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_. + Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ unified_docker.pull_tag }} + + Benchmarking + ============ + + Once the setup is complete, choose between two options to reproduce the + benchmark results: + + .. _vllm-benchmark-mad-715: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model + using one GPU with the :literal:`{{model.precision}}` data type on the host machine. + + .. 
code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{model.mad_tag}} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the + model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``. + + Although the :ref:`available models ` are preconfigured + to collect latency and throughput performance data, you can also change the benchmarking + parameters. See the standalone benchmarking tab for more information. + + {% if model.tunableop %} + + .. note:: + + For improved performance, consider enabling :ref:`PyTorch TunableOp `. + TunableOp automatically explores different implementations and configurations of certain PyTorch + operators to find the fastest one for your hardware. + + By default, ``{{model.mad_tag}}`` runs with TunableOp disabled + (see + ``__). + To enable it, include the ``--tunableop on`` argument in your + run. + + Enabling TunableOp triggers a two-pass run -- a warm-up followed + by the performance-collection run. + + {% endif %} + + .. tab-item:: Standalone benchmarking + + .. rubric:: Download the Docker image and required scripts + + 1. Run the vLLM benchmark tool independently by starting the + `Docker container <{{ unified_docker.docker_hub_url }}>`_ + as shown in the following snippet. + + .. code-block:: shell + + docker pull {{ unified_docker.pull_tag }} + docker run -it \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --shm-size 16G \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --cap-add=SYS_PTRACE \ + -v $(pwd):/workspace \ + --env HUGGINGFACE_HUB_CACHE=/workspace \ + --name test \ + {{ unified_docker.pull_tag }} + + 2. In the Docker container, clone the ROCm MAD repository and navigate to the + benchmark scripts directory at ``~/MAD/scripts/vllm``. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD/scripts/vllm + + 3. To start the benchmark, use the following command with the appropriate options. + + .. dropdown:: Benchmark options + :open: + + .. list-table:: + :header-rows: 1 + :align: center + + * - Name + - Options + - Description + + * - ``$test_option`` + - latency + - Measure decoding token latency + + * - + - throughput + - Measure token generation throughput + + * - + - all + - Measure both throughput and latency + + * - ``$num_gpu`` + - 1 or 8 + - Number of GPUs + + * - ``$datatype`` + - ``float16`` or ``float8`` + - Data type + + The input sequence length, output sequence length, and tensor parallel (TP) are + already configured. You don't need to specify them with this script. + + Command: + + .. code-block:: + + ./vllm_benchmark_report.sh \ + -s $test_option \ + -m {{model.model_repo}} \ + -g $num_gpu \ + -d {{model.precision}} + + .. note:: + + For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``. + + If you encounter the following error, pass your access-authorized Hugging + Face token to the gated models. + + .. code-block:: + + OSError: You are trying to access a gated repo. + + # pass your HF_TOKEN + export HF_TOKEN=$your_personal_hf_token + + .. 
rubric:: Benchmarking examples + + Here are some examples of running the benchmark with various options: + + * Latency benchmark + + Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. + + .. code-block:: + + ./vllm_benchmark_report.sh \ + -s latency \ + -m {{model.model_repo}} \ + -g 8 \ + -d {{model.precision}} + + Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``. + + * Throughput benchmark + + Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. + + .. code-block:: shell + + ./vllm_benchmark_report.sh \ + -s throughput \ + -m {{model.model_repo}} \ + -g 8 \ + -d {{model.precision}} + + Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``. + + .. raw:: html + + + + .. note:: + + Throughput is calculated as: + + - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time + + - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time + {% endfor %} + {% endfor %} + +Advanced usage +============== + +For information on experimental features and known issues related to ROCm optimization efforts on vLLM, +see the developer's guide at ``__. + +Reproducing the Docker image +---------------------------- + +To reproduce this ROCm/vLLM Docker image release, follow these steps: + +1. Clone the `vLLM repository `__. + + .. code-block:: shell + + git clone https://github.com/ROCm/vllm.git + +2. Checkout the specific release commit. + + .. code-block:: shell + + cd vllm + git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f + +3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag. + + .. code-block:: shell + + docker build -f docker/Dockerfile.rocm -t vllm-rocm . + +Known issues and workarounds +============================ + +AITER does not support FP8 KV cache yet. + +Further reading +=============== + +- To learn more about the options for latency and throughput benchmark scripts, + see ``_. + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. + +- For application performance optimization strategies for HPC and AI workloads, + including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`. + +- To learn how to run community models from Hugging Face on AMD GPUs, see + :doc:`Running models from Hugging Face `. + +- To learn how to fine-tune LLMs and optimize inference, see + :doc:`Fine-tuning LLMs and inference optimization `. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`vllm-history` to find documentation for previous releases +of the ``ROCm/vllm`` Docker image. 
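+
+As a quick sanity check of the throughput formulas used by the benchmark reports in
+this guide, consider an illustrative run with 100 requests, an input length of 128
+tokens, an output length of 128 tokens, and an elapsed time of 10 seconds (example
+values only, not measured results):
+
+.. math:: throughput\_tot = 100 \times (128 + 128) / 10 = 2560 \text{ tokens/s}
+
+.. math:: throughput\_gen = 100 \times 128 / 10 = 1280 \text{ tokens/s}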
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst index b26cc522a..6f87670ec 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst @@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub ` + * `Docker Hub `__ + + * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715`` - * ROCm 6.4.1 * vLLM 0.9.1 * PyTorch 2.7.0 - - * :doc:`Documentation <../vllm>` + * :doc:`Documentation ` * `Docker Hub `__ * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702`` diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst index 58c5dc6bd..02c992620 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst @@ -7,7 +7,7 @@ vLLM inference performance testing ********************************** -.. _vllm-benchmark-unified-docker: +.. _vllm-benchmark-unified-docker-812: .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml @@ -47,17 +47,11 @@ What's new The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release `. -* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default. - This parameter has been removed from the benchmarking script. +* Upgraded to vLLM v0.10. -* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``. - This parameter has been removed from the benchmarking script. +* FP8 KV cache support via AITER. -* Fixed a ``+rms_norm`` custom kernel issue. - -* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``. - -* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix. +* Full graph capture support via AITER. Supported models ================ @@ -67,7 +61,7 @@ Supported models {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} {% set model_groups = data.vllm_benchmark.model_groups %} - .. _vllm-benchmark-available-models: + .. _vllm-benchmark-available-models-812: The following models are supported for inference performance benchmarking with vLLM and ROCm. Some instructions, commands, and recommendations in this @@ -102,7 +96,7 @@ Supported models - .. _vllm-benchmark-vllm: + .. _vllm-benchmark-vllm-812: {% for model_group in model_groups %} {% for model in model_group.models %} @@ -124,14 +118,14 @@ Supported models See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for more information. -.. _vllm-benchmark-performance-measurements: +.. _vllm-benchmark-performance-measurements-812: Performance measurements ======================== To evaluate performance, the `Performance results with AMD ROCm software `_ -page provides reference throughput and latency measurements for inferencing popular AI models. +page provides reference throughput and serving measurements for inferencing popular AI models. .. important:: @@ -176,7 +170,7 @@ system's configuration. Once the setup is complete, choose between two options to reproduce the benchmark results: - .. _vllm-benchmark-mad: + .. 
_vllm-benchmark-mad-812: {% for model_group in model_groups %} {% for model in model_group.models %} @@ -209,12 +203,15 @@ system's configuration. --timeout 28800 MAD launches a Docker container with the name - ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the - model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``. + ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the + model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` + and ``{{ model.mad_tag }}_serving.csv``. - Although the :ref:`available models ` are preconfigured - to collect latency and throughput performance data, you can also change the benchmarking - parameters. See the standalone benchmarking tab for more information. + Although the :ref:`available models + ` are preconfigured to collect + offline throughput and online serving performance data, you can + also change the benchmarking parameters. See the standalone + benchmarking tab for more information. {% if model.tunableop %} @@ -224,14 +221,12 @@ system's configuration. TunableOp automatically explores different implementations and configurations of certain PyTorch operators to find the fastest one for your hardware. - By default, ``{{model.mad_tag}}`` runs with TunableOp disabled - (see - ``__). - To enable it, include the ``--tunableop on`` argument in your - run. + By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see + ``__). To enable it, include + the ``--tunableop on`` argument in your run. - Enabling TunableOp triggers a two-pass run -- a warm-up followed - by the performance-collection run. + Enabling TunableOp triggers a two-pass run -- a warm-up followed by the + performance-collection run. {% endif %} @@ -269,6 +264,13 @@ system's configuration. 3. To start the benchmark, use the following command with the appropriate options. + .. code-block:: + + ./run.sh \ + --config $CONFIG_CSV \ + --model_repo {{ model.model_repo }} \ + + .. dropdown:: Benchmark options :open: @@ -280,42 +282,40 @@ system's configuration. - Options - Description - * - ``$test_option`` - - latency - - Measure decoding token latency + * - ``--config`` + - ``configs/default.csv`` + - Run configs from the CSV for the chosen model repo and benchmark. * - - - throughput - - Measure token generation throughput + - ``configs/extended.csv`` + - * - - - all - - Measure both throughput and latency + - ``configs/performance.csv`` + - - * - ``$num_gpu`` - - 1 or 8 - - Number of GPUs + * - ``--benchmark`` + - ``throughput`` + - Measure offline end-to-end throughput. - * - ``$datatype`` - - ``float16`` or ``float8`` - - Data type + * - + - ``serving`` + - Measure online serving performance. + + * - + - ``all`` + - Measure both throughput and serving. + + * - `` + - See `run.sh `__ for more info. + - Additional overrides to the config CSV. The input sequence length, output sequence length, and tensor parallel (TP) are already configured. You don't need to specify them with this script. - Command: - - .. code-block:: - - ./vllm_benchmark_report.sh \ - -s $test_option \ - -m {{model.model_repo}} \ - -g $num_gpu \ - -d {{model.precision}} - .. note:: - For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``. + For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``. If you encounter the following error, pass your access-authorized Hugging Face token to the gated models. 
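+
+                        If you prefer to make the token available when the container starts,
+                        rather than exporting it inside the container, one possible approach
+                        (a sketch, not part of the benchmarking script) is to pass it through
+                        ``docker run``:
+
+                        .. code-block:: shell
+
+                           # Pass an access-authorized Hugging Face token into the container
+                           # at startup so gated model downloads can authenticate.
+                           export HF_TOKEN=$your_personal_hf_token
+                           docker run -it \
+                             --device=/dev/kfd \
+                             --device=/dev/dri \
+                             --group-add video \
+                             -e HF_TOKEN \
+                             -v $(pwd):/workspace \
+                             --env HUGGINGFACE_HUB_CACHE=/workspace \
+                             rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812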
@@ -331,33 +331,33 @@ system's configuration. Here are some examples of running the benchmark with various options: - * Latency benchmark - - Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. - - .. code-block:: - - ./vllm_benchmark_report.sh \ - -s latency \ - -m {{model.model_repo}} \ - -g 8 \ - -d {{model.precision}} - - Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``. - * Throughput benchmark Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. .. code-block:: shell - ./vllm_benchmark_report.sh \ - -s throughput \ - -m {{model.model_repo}} \ - -g 8 \ - -d {{model.precision}} + export MAD_MODEL_NAME={{ model.mad_tag }} + ./run.sh \ + --config configs/default.csv \ + --model_repo {{model.model_repo}} \ + --benchmark throughput - Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``. + Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``. + + * Serving benchmark + + Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. + + .. code-block:: + + export MAD_MODEL_NAME={{ model.mad_tag }} + ./run.sh \ + --config configs/default.csv \ + --model_repo {{model.model_repo}} \ + --benchmark serving + + Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``. .. raw:: html @@ -400,7 +400,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps: .. code-block:: shell cd vllm - git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f + git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978 3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag. @@ -408,11 +408,6 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps: docker build -f docker/Dockerfile.rocm -t vllm-rocm . -Known issues and workarounds -============================ - -AITER does not support FP8 KV cache yet. - Further reading ===============
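+
+As an optional follow-up to the build step above (a suggested check, not part of the
+official procedure), you can smoke-test the freshly built ``vllm-rocm`` image by
+printing the PyTorch and vLLM versions it ships; adjust the device flags to match
+your system.
+
+.. code-block:: shell
+
+   # Optional check: confirm the locally built image starts and reports its versions.
+   docker run --rm \
+     --device=/dev/kfd \
+     --device=/dev/dri \
+     vllm-rocm \
+     python3 -c "import torch, vllm; print(torch.__version__, vllm.__version__)"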