From 55f95adc7cb09884382a230f67ea3255c2e1b2b2 Mon Sep 17 00:00:00 2001 From: yugang-amd Date: Fri, 20 Jun 2025 08:41:37 -0400 Subject: [PATCH] Update for vllm -06/10 (#4943) --- .../vllm_0.8.5_20250521-benchmark-models.yaml | 167 +++++++++ .../inference/vllm-benchmark-models.yaml | 13 +- .../previous-versions/vllm-0.8.5-20250521.rst | 341 ++++++++++++++++++ .../inference/benchmark-docker/vllm.rst | 7 + 4 files changed, 519 insertions(+), 9 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml new file mode 100644 index 000000000..8746859d6 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml @@ -0,0 +1,167 @@ +vllm_benchmark: + unified_docker: + latest: + pull_tag: rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11 + rocm_version: 6.3.1 + vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631) + pytorch_version: 2.7.0+gitf717b2a + hipblaslt_version: 0.15 + model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.1 8B + mad_tag: pyt_vllm_llama-3.1-8b + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: float16 + - model: Llama 3.1 70B + mad_tag: pyt_vllm_llama-3.1-70b + model_repo: meta-llama/Llama-3.1-70B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct + precision: float16 + - model: Llama 3.1 405B + mad_tag: pyt_vllm_llama-3.1-405b + model_repo: 
meta-llama/Llama-3.1-405B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct + precision: float16 + - model: Llama 3.2 11B Vision + mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct + model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct + url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct + precision: float16 + - model: Llama 2 7B + mad_tag: pyt_vllm_llama-2-7b + model_repo: meta-llama/Llama-2-7b-chat-hf + url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf + precision: float16 + - model: Llama 2 70B + mad_tag: pyt_vllm_llama-2-70b + model_repo: meta-llama/Llama-2-70b-chat-hf + url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + precision: float16 + - model: Llama 3.1 8B FP8 + mad_tag: pyt_vllm_llama-3.1-8b_fp8 + model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV + precision: float8 + - model: Llama 3.1 70B FP8 + mad_tag: pyt_vllm_llama-3.1-70b_fp8 + model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV + precision: float8 + - model: Llama 3.1 405B FP8 + mad_tag: pyt_vllm_llama-3.1-405b_fp8 + model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV + precision: float8 + - group: Mistral AI + tag: mistral + models: + - model: Mixtral MoE 8x7B + mad_tag: pyt_vllm_mixtral-8x7b + model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + precision: float16 + - model: Mixtral MoE 8x22B + mad_tag: pyt_vllm_mixtral-8x22b + model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 + precision: float16 + - model: Mistral 7B + mad_tag: pyt_vllm_mistral-7b + model_repo: mistralai/Mistral-7B-Instruct-v0.3 + url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3 + precision: float16 + - model: Mixtral MoE 8x7B FP8 + mad_tag: 
pyt_vllm_mixtral-8x7b_fp8 + model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + precision: float8 + - model: Mixtral MoE 8x22B FP8 + mad_tag: pyt_vllm_mixtral-8x22b_fp8 + model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + precision: float8 + - model: Mistral 7B FP8 + mad_tag: pyt_vllm_mistral-7b_fp8 + model_repo: amd/Mistral-7B-v0.1-FP8-KV + url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV + precision: float8 + - group: Qwen + tag: qwen + models: + - model: Qwen2 7B + mad_tag: pyt_vllm_qwen2-7b + model_repo: Qwen/Qwen2-7B-Instruct + url: https://huggingface.co/Qwen/Qwen2-7B-Instruct + precision: float16 + - model: Qwen2 72B + mad_tag: pyt_vllm_qwen2-72b + model_repo: Qwen/Qwen2-72B-Instruct + url: https://huggingface.co/Qwen/Qwen2-72B-Instruct + precision: float16 + - model: QwQ-32B + mad_tag: pyt_vllm_qwq-32b + model_repo: Qwen/QwQ-32B + url: https://huggingface.co/Qwen/QwQ-32B + precision: float16 + tunableop: true + - group: Databricks DBRX + tag: dbrx + models: + - model: DBRX Instruct + mad_tag: pyt_vllm_dbrx-instruct + model_repo: databricks/dbrx-instruct + url: https://huggingface.co/databricks/dbrx-instruct + precision: float16 + - model: DBRX Instruct FP8 + mad_tag: pyt_vllm_dbrx_fp8 + model_repo: amd/dbrx-instruct-FP8-KV + url: https://huggingface.co/amd/dbrx-instruct-FP8-KV + precision: float8 + - group: Google Gemma + tag: gemma + models: + - model: Gemma 2 27B + mad_tag: pyt_vllm_gemma-2-27b + model_repo: google/gemma-2-27b + url: https://huggingface.co/google/gemma-2-27b + precision: float16 + - group: Cohere + tag: cohere + models: + - model: C4AI Command R+ 08-2024 + mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024 + model_repo: CohereForAI/c4ai-command-r-plus-08-2024 + url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024 + precision: float16 + - model: C4AI Command R+ 08-2024 FP8 + mad_tag: 
pyt_vllm_command-r-plus_fp8 + model_repo: amd/c4ai-command-r-plus-FP8-KV + url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV + precision: float8 + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek MoE 16B + mad_tag: pyt_vllm_deepseek-moe-16b-chat + model_repo: deepseek-ai/deepseek-moe-16b-chat + url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat + precision: float16 + - group: Microsoft Phi + tag: phi + models: + - model: Phi-4 + mad_tag: pyt_vllm_phi-4 + model_repo: microsoft/phi-4 + url: https://huggingface.co/microsoft/phi-4 + - group: TII Falcon + tag: falcon + models: + - model: Falcon 180B + mad_tag: pyt_vllm_falcon-180b + model_repo: tiiuae/falcon-180B + url: https://huggingface.co/tiiuae/falcon-180B + precision: float16 diff --git a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml index 8746859d6..b4b0a2a41 100644 --- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml @@ -1,10 +1,10 @@ vllm_benchmark: unified_docker: latest: - pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521 - docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11 - rocm_version: 6.3.1 - vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631) + pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c + rocm_version: 6.4.1 + vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641) pytorch_version: 2.7.0+gitf717b2a hipblaslt_version: 0.15 model_groups: @@ -26,11 +26,6 @@ vllm_benchmark: model_repo: meta-llama/Llama-3.1-405B-Instruct url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct precision: float16 - - 
model: Llama 3.2 11B Vision - mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct - model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct - url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct - precision: float16 - model: Llama 2 7B mad_tag: pyt_vllm_llama-2-7b model_repo: meta-llama/Llama-2-7b-chat-hf diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst new file mode 100644 index 000000000..0f719b7a9 --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst @@ -0,0 +1,341 @@ +.. meta:: + :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the + ROCm vLLM Docker image. + :keywords: model, MAD, automation, dashboarding, validate + +********************************** +vLLM inference performance testing +********************************** + +.. _vllm-benchmark-unified-docker: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml + + {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} + {% set model_groups = data.vllm_benchmark.model_groups %} + + The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers + a prebuilt, optimized environment for validating large language model (LLM) + inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM + Docker image integrates vLLM and PyTorch tailored specifically for MI300X series + accelerators and includes the following components: + + * `ROCm {{ unified_docker.rocm_version }} `_ + + * `vLLM {{ unified_docker.vllm_version }} `_ + + * `PyTorch {{ unified_docker.pytorch_version }} `_ + + * `hipBLASLt {{ unified_docker.hipblaslt_version }} `_ + + With this Docker image, you can quickly test the :ref:`expected + inference performance numbers ` for + MI300X series accelerators. 
+ + .. _vllm-benchmark-available-models: + + Supported models + ================ + + The following models are supported for inference performance benchmarking + with vLLM and ROCm. Some instructions, commands, and recommendations in this + documentation might vary by model -- select one to get started. + + .. raw:: html + +
+
+
Model group
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Model
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ + .. _vllm-benchmark-vllm: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + .. note:: + + See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. + Some models require access authorization prior to use via an external license agreement through a third party. + + {% endfor %} + {% endfor %} + + .. note:: + + vLLM is a toolkit and library for LLM inference and serving. AMD implements + high-performance custom kernels and modules in vLLM to enhance performance. + See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for + more information. + + .. _vllm-benchmark-performance-measurements: + + Performance measurements + ======================== + + To evaluate performance, the + `Performance results with AMD ROCm software `_ + page provides reference throughput and latency measurements for inferencing + popular AI models. + + .. note:: + + The performance data presented in + `Performance results with AMD ROCm software `_ + should not be interpreted as the peak performance achievable by AMD + Instinct MI325X and MI300X accelerators or ROCm software. + + Advanced features and known issues + ================================== + + For information on experimental features and known issues related to ROCm optimization efforts on vLLM, + see the developer's guide at ``__. + + System validation + ================= + + Before running AI workloads, it's important to validate that your AMD hardware is configured + correctly and performing optimally. + + To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU + might hang until the periodic balancing is finalized. For more information, + see the :ref:`system validation steps `. + + .. 
code-block:: shell + + # disable automatic NUMA balancing + sh -c 'echo 0 > /proc/sys/kernel/numa_balancing' + # check if NUMA balancing is disabled (returns 0 if disabled) + cat /proc/sys/kernel/numa_balancing + 0 + + To test for optimal performance, consult the recommended :ref:`System health benchmarks + `. This suite of tests will help you verify and fine-tune your + system's configuration. + + Pull the Docker image + ===================== + + Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_. + Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ unified_docker.pull_tag }} + + Benchmarking + ============ + + Once the setup is complete, choose between two options to reproduce the + benchmark results: + + .. _vllm-benchmark-mad: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model + using one GPU with the ``{{model.precision}}`` data type on the host machine. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the + model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``. 
+ + Although the :ref:`available models ` are preconfigured + to collect latency and throughput performance data, you can also change the benchmarking + parameters. See the standalone benchmarking tab for more information. + + {% if model.tunableop %} + + .. note:: + + For improved performance, consider enabling :ref:`PyTorch TunableOp `. + TunableOp automatically explores different implementations and configurations of certain PyTorch + operators to find the fastest one for your hardware. + + By default, ``{{model.mad_tag}}`` runs with TunableOp disabled + (see + ``__). To + enable it, edit the default run behavior in the ``models.json`` + configuration before running inference -- update the model's run + ``args`` by changing ``--tunableop off`` to ``--tunableop on``. + + Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run. + + {% endif %} + + .. tab-item:: Standalone benchmarking + + Run the vLLM benchmark tool independently by starting the + `Docker container <{{ unified_docker.docker_hub_url }}>`_ + as shown in the following snippet. + + .. code-block:: + + docker pull {{ unified_docker.pull_tag }} + docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }} + + In the Docker container, clone the ROCm MAD repository and navigate to the + benchmark scripts directory at ``~/MAD/scripts/vllm``. + + .. code-block:: + + git clone https://github.com/ROCm/MAD + cd MAD/scripts/vllm + + To start the benchmark, use the following command with the appropriate options. + + .. code-block:: + + ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}} + + .. 
list-table:: + :header-rows: 1 + :align: center + + * - Name + - Options + - Description + + * - ``$test_option`` + - latency + - Measure decoding token latency + + * - + - throughput + - Measure token generation throughput + + * - + - all + - Measure both throughput and latency + + * - ``$num_gpu`` + - 1 or 8 + - Number of GPUs + + * - ``$datatype`` + - ``float16`` or ``float8`` + - Data type + + .. note:: + + The input sequence length, output sequence length, and tensor parallel (TP) are + already configured. You don't need to specify them with this script. + + .. note:: + + If you encounter the following error, pass your access-authorized Hugging + Face token to the gated models. + + .. code-block:: + + OSError: You are trying to access a gated repo. + + # pass your HF_TOKEN + export HF_TOKEN=$your_personal_hf_token + + Here are some examples of running the benchmark with various options. + + * Latency benchmark + + Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision. + + .. code-block:: + + ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}} + + Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``. + + * Throughput benchmark + + Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision. + + .. code-block:: shell + + ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}} + + Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``. + + .. raw:: html + + + + .. note:: + + Throughput is calculated as: + + - .. 
math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time + + - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time + {% endfor %} + {% endfor %} + +Further reading +=============== + +- To learn more about the options for latency and throughput benchmark scripts, + see ``_. + +- To learn more about system settings and management practices to configure your system for + MI300X accelerators, see `AMD Instinct MI300X system optimization `_. + +- For application performance optimization strategies for HPC and AI workloads, + including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`. + +- To learn how to run LLM models from Hugging Face or your own model, see + :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`. + +- To learn how to optimize inference on LLMs, see + :doc:`Inference optimization </how-to/rocm-for-ai/inference-optimization/index>`. + +- To learn how to fine-tune LLMs, see + :doc:`Fine-tuning LLMs </how-to/rocm-for-ai/fine-tuning/index>`. diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst index 6ac59e4f7..bfae70c4c 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst @@ -356,6 +356,13 @@ for benchmarking, see the version-specific documentation. - PyTorch version - Resources + * - 6.3.1 + - 0.8.5 (0.8.6.dev315+g91a560098.rocm631) + - 2.7.0 + - + * :doc:`Documentation ` + * `Docker Hub `_ + * - 6.3.1 - 0.8.5 - 2.7.0