Update vLLM inference benchmark doc for 0909 release (and Sphinx fixes) (#5289)

This commit is contained in:
Peter Park
2025-09-11 15:01:17 -04:00
committed by GitHub
parent 10f6086819
commit 7098bdc03b
17 changed files with 1041 additions and 434 deletions

View File

@@ -673,6 +673,7 @@ github
globals
gnupg
grayscale
gx
gzip
heterogenous
hipBLAS
@@ -783,6 +784,7 @@ parallelizing
param
parameterization
passthrough
pe
perfcounter
performant
perl
@@ -812,6 +814,7 @@ profiler
profilers
protobuf
pseudorandom
px
py
pytorch
recommender

View File

@@ -0,0 +1,91 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
rocm_version: 6.4.1
vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
- model: Qwen3 30B A3B
mad_tag: pyt_vllm_qwen3-30b-a3b
model_repo: Qwen/Qwen3-30B-A3B
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4

View File

@@ -1,12 +1,11 @@
sglang_benchmark:
unified_docker:
latest:
pull_tag: lmsysorg/sglang:v0.4.5-rocm630
dockers:
- pull_tag: lmsysorg/sglang:v0.4.5-rocm630
docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
rocm_version: 6.3.0
sglang_version: 0.4.5 (0.4.5-rocm)
pytorch_version: 2.6.0a0+git8d4926e
model_groups:
components:
ROCm: 6.3.0
SGLang: 0.4.5 (0.4.5-rocm)
PyTorch: 2.6.0a0+git8d4926e
model_groups:
- group: DeepSeek
tag: deepseek
models:

View File

@@ -1,14 +1,12 @@
vllm_benchmark:
unified_docker:
latest:
# TODO: update me
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
rocm_version: 6.4.1
vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
hipblaslt_version: 0.15
model_groups:
dockers:
- pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
components:
ROCm: 6.4.1
vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
PyTorch: 2.7.0+gitf717b2a
hipBLASLt: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
@@ -17,36 +15,85 @@ vllm_benchmark:
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
precision: float16
config:
tp: 1
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 131072
max_num_batched_tokens: 131072
max_model_len: 8192
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
config:
tp: 8
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 131072
max_num_batched_tokens: 131072
max_model_len: 8192
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
config:
tp: 8
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 131072
max_num_batched_tokens: 131072
max_model_len: 8192
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
config:
tp: 8
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 4096
max_num_batched_tokens: 4096
max_model_len: 4096
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
config:
tp: 1
dtype: auto
kv_cache_dtype: fp8
max_seq_len_to_capture: 131072
max_num_batched_tokens: 131072
max_model_len: 8192
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
config:
tp: 8
dtype: auto
kv_cache_dtype: fp8
max_seq_len_to_capture: 131072
max_num_batched_tokens: 131072
max_model_len: 8192
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
config:
tp: 8
dtype: auto
kv_cache_dtype: fp8
max_seq_len_to_capture: 131072
max_num_batched_tokens: 131072
max_model_len: 8192
- group: Mistral AI
tag: mistral
models:
@@ -55,21 +102,49 @@ vllm_benchmark:
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
config:
tp: 8
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 32768
max_num_batched_tokens: 32768
max_model_len: 8192
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
config:
tp: 8
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 65536
max_num_batched_tokens: 65536
max_model_len: 8192
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
config:
tp: 8
dtype: auto
kv_cache_dtype: fp8
max_seq_len_to_capture: 32768
max_num_batched_tokens: 32768
max_model_len: 8192
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
config:
tp: 8
dtype: auto
kv_cache_dtype: fp8
max_seq_len_to_capture: 65536
max_num_batched_tokens: 65536
max_model_len: 8192
- group: Qwen
tag: qwen
models:
@@ -78,11 +153,25 @@ vllm_benchmark:
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
config:
tp: 1
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 131072
max_num_batched_tokens: 131072
max_model_len: 8192
- model: Qwen3 30B A3B
mad_tag: pyt_vllm_qwen3-30b-a3b
model_repo: Qwen/Qwen3-30B-A3B
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
precision: float16
config:
tp: 1
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 32768
max_num_batched_tokens: 32768
max_model_len: 8192
- group: Microsoft Phi
tag: phi
models:
@@ -90,3 +179,10 @@ vllm_benchmark:
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
config:
tp: 1
dtype: auto
kv_cache_dtype: auto
max_seq_len_to_capture: 16384
max_num_batched_tokens: 16384
max_model_len: 8192

View File

@@ -0,0 +1,445 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker-812:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
.. list-table::
:header-rows: 1
* - Software component
- Version
* - `ROCm <https://github.com/ROCm/ROCm>`__
- {{ unified_docker.rocm_version }}
* - `vLLM <https://docs.vllm.ai/en/latest>`__
- {{ unified_docker.vllm_version }}
* - `PyTorch <https://github.com/ROCm/pytorch>`__
- {{ unified_docker.pytorch_version }}
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
- {{ unified_docker.hipblaslt_version }}
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
MI300X series accelerators.
What's new
==========
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
* Upgraded to vLLM v0.10.
* FP8 KV cache support via AITER.
* Full graph capture support via AITER.
Supported models
================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
.. _vllm-benchmark-available-models-812:
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm-812:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-812:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and serving measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before running inference benchmarks.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad-812:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
and ``{{ model.mad_tag }}_serving.csv``.
Although the :ref:`available models
<vllm-benchmark-available-models-812>` are preconfigured to collect
offline throughput and online serving performance data, you can
also change the benchmarking parameters. See the standalone
benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
the ``--tunableop on`` argument in your run.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
performance-collection run.
{% endif %}
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required scripts
1. Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
docker run -it \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--shm-size 16G \
--security-opt seccomp=unconfined \
--security-opt apparmor=unconfined \
--cap-add=SYS_PTRACE \
-v $(pwd):/workspace \
--env HUGGINGFACE_HUB_CACHE=/workspace \
--name test \
{{ unified_docker.pull_tag }}
2. In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
3. To start the benchmark, use the following command with the appropriate options.
.. code-block::
./run.sh \
--config $CONFIG_CSV \
--model_repo {{ model.model_repo }} \
<overrides>
.. dropdown:: Benchmark options
:open:
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``--config``
- ``configs/default.csv``
- Run configs from the CSV for the chosen model repo and benchmark.
* -
- ``configs/extended.csv``
-
* -
- ``configs/performance.csv``
-
* - ``--benchmark``
- ``throughput``
- Measure offline end-to-end throughput.
* -
- ``serving``
- Measure online serving performance.
* -
- ``all``
- Measure both throughput and serving.
* - ``<overrides>``
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
- Additional overrides to the config CSV.
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. rubric:: Benchmarking examples
Here are some examples of running the benchmark with various options:
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
export MAD_MODEL_NAME={{ model.mad_tag }}
./run.sh \
--config configs/default.csv \
--model_repo {{model.model_repo}} \
--benchmark throughput
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
* Serving benchmark
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block::
export MAD_MODEL_NAME={{ model.mad_tag }}
./run.sh \
--config configs/default.csv \
--model_repo {{model.model_repo}} \
--benchmark serving
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Advanced usage
==============
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
Reproducing the Docker image
----------------------------
To reproduce this ROCm/vLLM Docker image release, follow these steps:
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
.. code-block:: shell
git clone https://github.com/ROCm/vllm.git
2. Check out the specific release commit.
.. code-block:: shell
cd vllm
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
.. code-block:: shell
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -16,7 +16,7 @@ vLLM inference performance testing
.. _vllm-benchmark-unified-docker-715:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
@@ -69,7 +69,7 @@ The following is summary of notable changes since the :doc:`previous ROCm/vLLM D
Supported models
================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
@@ -162,7 +162,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}

View File

@@ -7,7 +7,7 @@ vLLM inference performance testing version history
This table lists previous versions of the ROCm vLLM inference Docker image for
inference performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation. You can find tagged
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__.
.. list-table::
:header-rows: 1

View File

@@ -31,22 +31,26 @@ PyTorch inference performance testing
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1" style="display: none;">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
<div class="row gx-0 pt-1" style="display: none;">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
<div class="col-12 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% if models|length % 3 == 0 %}
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>

View File

@@ -2,19 +2,19 @@
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
:keywords: model, MAD, automation, dashboarding, validate
************************************
SGLang inference performance testing
************************************
*****************************************************************
SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
*****************************************************************
.. _sglang-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
{% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
{% set docker = data.dockers[0] %}
`SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
serving engine for large language models (LLMs) and vision models. The
ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
accelerators. It includes the following software components:
@@ -24,14 +24,10 @@ SGLang inference performance testing
* - Software component
- Version
* - `ROCm <https://github.com/ROCm/ROCm>`__
- {{ unified_docker.rocm_version }}
* - `SGLang <https://docs.sglang.ai/index.html>`__
- {{ unified_docker.sglang_version }}
* - `PyTorch <https://github.com/pytorch/pytorch>`__
- {{ unified_docker.pytorch_version }}
{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
System validation
=================
@@ -50,8 +46,8 @@ system's configuration.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
{% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
{% set model_groups = data.sglang_benchmark.model_groups %}
{% set unified_docker = data.dockers[0] %}
{% set model_groups = data.model_groups %}
Pull the Docker image
=====================

View File

@@ -7,14 +7,13 @@
vLLM inference performance testing
**********************************
.. _vllm-benchmark-unified-docker-812:
.. _vllm-benchmark-unified-docker-909:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
{% set docker = data.dockers[0] %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
@@ -26,20 +25,13 @@ vLLM inference performance testing
* - Software component
- Version
* - `ROCm <https://github.com/ROCm/ROCm>`__
- {{ unified_docker.rocm_version }}
* - `vLLM <https://docs.vllm.ai/en/latest>`__
- {{ unified_docker.vllm_version }}
* - `PyTorch <https://github.com/ROCm/pytorch>`__
- {{ unified_docker.pytorch_version }}
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
- {{ unified_docker.hipblaslt_version }}
{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
inference performance numbers <vllm-benchmark-performance-measurements-909>` for
MI300X series accelerators.
What's new
@@ -47,21 +39,23 @@ What's new
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
* Upgraded to vLLM v0.10.
* Upgraded to vLLM v0.10.1.
* FP8 KV cache support via AITER.
* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.
* Full graph capture support via AITER.
* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.
.. _vllm-benchmark-supported-models-909:
Supported models
================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
{% set docker = data.dockers[0] %}
{% set model_groups = data.model_groups %}
.. _vllm-benchmark-available-models-812:
.. _vllm-benchmark-available-models-909:
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
@@ -70,25 +64,25 @@ Supported models
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
@@ -96,29 +90,25 @@ Supported models
</div>
</div>
.. _vllm-benchmark-vllm-812:
.. _vllm-benchmark-vllm-909:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. container:: model-doc {{ model.mad_tag }}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
{% endif %}
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-812:
.. _vllm-benchmark-performance-measurements-909:
Performance measurements
========================
@@ -151,18 +141,18 @@ system's configuration.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
{% set docker = data.dockers[0] %}
{% set model_groups = data.model_groups %}
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
docker pull {{ docker.pull_tag }}
Benchmarking
============
@@ -170,7 +160,7 @@ system's configuration.
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad-812:
.. _vllm-benchmark-mad-909:
{% for model_group in model_groups %}
{% for model in model_group.models %}
@@ -181,6 +171,9 @@ system's configuration.
.. tab-item:: MAD-integrated benchmarking
The following run command is tailored to {{ model.model }}.
See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
@@ -208,7 +201,7 @@ system's configuration.
and ``{{ model.mad_tag }}_serving.csv``.
Although the :ref:`available models
<vllm-benchmark-available-models-812>` are preconfigured to collect
<vllm-benchmark-available-models-909>` are preconfigured to collect
offline throughput and online serving performance data, you can
also change the benchmarking parameters. See the standalone
benchmarking tab for more information.
@@ -232,15 +225,27 @@ system's configuration.
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required scripts
The following commands are optimized for {{ model.model }}.
See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
1. Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. seealso::
For more information on configuration, see the `config files
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
in the MAD repository. Refer to the `vLLM engine arguments <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
for descriptions of available configuration options
and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
additional benchmarking information.
.. rubric:: Launch the container
You can run the vLLM benchmark tool independently by starting the
`Docker container <{{ docker.docker_hub_url }}>`_ as shown
in the following snippet.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
docker pull {{ docker.pull_tag }}
docker run -it \
--device=/dev/kfd \
--device=/dev/dri \
@@ -252,71 +257,102 @@ system's configuration.
-v $(pwd):/workspace \
--env HUGGINGFACE_HUB_CACHE=/workspace \
--name test \
{{ unified_docker.pull_tag }}
{{ docker.pull_tag }}
2. In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. rubric:: Throughput command
Use the following command to start the throughput benchmark.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
model={{ model.model_repo }}
tp={{ model.config.tp }}
num_prompts=1024
in=128
out=128
dtype={{ model.config.dtype }}
kv_cache_dtype={{ model.config.kv_cache_dtype }}
max_num_seqs=1024
max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
max_model_len={{ model.config.max_model_len }}
3. To start the benchmark, use the following command with the appropriate options.
vllm bench throughput --model $model \
-tp $tp \
--num-prompts $num_prompts \
--input-len $in \
--output-len $out \
--dtype $dtype \
--kv-cache-dtype $kv_cache_dtype \
--max-num-seqs $max_num_seqs \
--max-seq-len-to-capture $max_seq_len_to_capture \
--max-num-batched-tokens $max_num_batched_tokens \
--max-model-len $max_model_len \
--trust-remote-code \
--output-json ${model}_throughput.json \
--gpu-memory-utilization 0.9
.. code-block::
.. rubric:: Serving command
./run.sh \
--config $CONFIG_CSV \
--model_repo {{ model.model_repo }} \
<overrides>
1. Start the server using the following command:
.. dropdown:: Benchmark options
:open:
.. code-block:: shell
.. list-table::
:header-rows: 1
:align: center
model={{ model.model_repo }}
tp={{ model.config.tp }}
dtype={{ model.config.dtype }}
kv_cache_dtype={{ model.config.kv_cache_dtype }}
max_num_seqs=256
max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
max_model_len={{ model.config.max_model_len }}
* - Name
- Options
- Description
vllm serve $model \
-tp $tp \
--dtype $dtype \
--kv-cache-dtype $kv_cache_dtype \
--max-num-seqs $max_num_seqs \
--max-seq-len-to-capture $max_seq_len_to_capture \
--max-num-batched-tokens $max_num_batched_tokens \
--max-model-len $max_model_len \
--no-enable-prefix-caching \
--swap-space 16 \
--disable-log-requests \
--trust-remote-code \
--gpu-memory-utilization 0.9
* - ``--config``
- ``configs/default.csv``
- Run configs from the CSV for the chosen model repo and benchmark.
Wait until the model has loaded and the server is ready to accept requests.
* -
- ``configs/extended.csv``
-
2. In another terminal on the same machine, run the benchmark:
* -
- ``configs/performance.csv``
-
.. code-block:: shell
* - ``--benchmark``
- ``throughput``
- Measure offline end-to-end throughput.
# Connect to the container
docker exec -it test bash
* -
- ``serving``
- Measure online serving performance.
# Wait for the server to start
until curl -s http://localhost:8000/v1/models; do sleep 30; done
* -
- ``all``
- Measure both throughput and serving.
* - `<overrides>`
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
- Additional overrides to the config CSV.
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
# Run the benchmark
model={{ model.model_repo }}
max_concurrency=1
num_prompts=10
in=128
out=128
vllm bench serve --model $model \
--percentile-metrics "ttft,tpot,itl,e2el" \
--dataset-name random \
--ignore-eos \
--max-concurrency $max_concurrency \
--num-prompts $num_prompts \
--random-input-len $in \
--random-output-len $out \
--trust-remote-code \
--save-result \
--result-filename ${model}_serving.json
.. note::
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
@@ -327,38 +363,6 @@ system's configuration.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. rubric:: Benchmarking examples
Here are some examples of running the benchmark with various options:
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
export MAD_MODEL_NAME={{ model.mad_tag }}
./run.sh \
--config configs/default.csv \
--model_repo {{model.model_repo}} \
--benchmark throughput
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
* Serving benchmark
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block::
export MAD_MODEL_NAME={{ model.mad_tag }}
./run.sh \
--config configs/default.csv \
--model_repo {{model.model_repo}} \
--benchmark serving
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
.. raw:: html
<style>
@@ -382,7 +386,7 @@ Advanced usage
==============
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
Reproducing the Docker image
----------------------------
@@ -400,7 +404,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
.. code-block:: shell
cd vllm
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
git checkout 6663000a391911eba96d7864a26ac42b07f6ef29
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
@@ -419,15 +423,12 @@ Further reading
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
a brief introduction to vLLM and optimization strategies.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

View File

@@ -83,25 +83,25 @@ started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
@@ -208,7 +208,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.
e. RDMA interface
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
Then, set the RDMA interfaces to use for communication.
.. code-block:: bash

View File

@@ -71,25 +71,25 @@ workloads:
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}

View File

@@ -112,7 +112,7 @@ Multi-node setup
For multi-node environments, ensure you have all the necessary packages for
your network device, such as RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
1. Install the following packages to build and install the RDMA driver.
@@ -177,7 +177,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
e. RDMA interface
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
Then, set the RDMA interfaces to use for communication.
.. code-block:: bash
@@ -199,7 +199,7 @@ Pull the Docker image
docker pull rocm/jax-training:maxtext-v25.5
2. Use the following command to launch the Docker container. Note that the benchmarking scripts
used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
and execute the benchmark.
.. code-block:: shell

View File

@@ -56,25 +56,25 @@ vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}

View File

@@ -45,25 +45,25 @@ vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}

View File

@@ -7,15 +7,14 @@ html {
--compat-head-color: var(--pst-color-surface);
--compat-param-hover-color: var(--pst-color-link-hover);
--compat-param-selected-color: var(--pst-color-primary);
--compat-border-color: var(--pst-color-border);
}
html[data-theme="light"] {
--compat-border-color: var(--pst-gray-500);
--compat-param-disabled-color: var(--pst-gray-300);
}
html[data-theme="dark"] {
--compat-border-color: var(--pst-gray-600);
--compat-param-disabled-color: var(--pst-gray-600);
}
@@ -23,6 +22,7 @@ div#vllm-benchmark-ud-params-picker.container-fluid {
padding: 0 0 1rem 0;
}
div[data-param-k="model-group"],
div[data-param-k="model"] {
background-color: var(--compat-bg-color);
padding: 2px;
@@ -31,40 +31,19 @@ div[data-param-k="model"] {
cursor: pointer;
}
div[data-param-k="model-group"][data-param-state="selected"],
div[data-param-k="model"][data-param-state="selected"] {
background-color: var(--compat-param-selected-color);
color: var(--compat-fg-color);
}
div[data-param-k="model"][data-param-state="latest-version"] {
background-color: var(--compat-param-selected-color);
color: var(--compat-fg-color);
}
div[data-param-k="model"][data-param-state="disabled"] {
background-color: var(--compat-param-disabled-color);
text-decoration: line-through;
/* text-decoration-color: var(--pst-color-danger); */
cursor: auto;
}
div[data-param-k="model"]:not([data-param-state]):hover {
div[data-param-k="model-group"]:hover,
div[data-param-k="model"]:hover {
background-color: var(--compat-param-hover-color);
}
div[data-param-k="model-group"] {
background-color: var(--compat-bg-color);
padding: 2px;
border: solid 1px var(--compat-border-color);
font-weight: 500;
cursor: pointer;
}
div[data-param-k="model-group"][data-param-state="selected"] {
background-color: var(--compat-param-selected-color);
color: var(--compat-fg-color);
}
/*
div[data-param-k="model-group"][data-param-state="latest-version"] {
background-color: var(--compat-param-selected-color);
color: var(--compat-fg-color);
@@ -73,26 +52,19 @@ div[data-param-k="model-group"][data-param-state="latest-version"] {
div[data-param-k="model-group"][data-param-state="disabled"] {
background-color: var(--compat-param-disabled-color);
text-decoration: line-through;
/* text-decoration-color: var(--pst-color-danger); */
text-decoration-color: var(--pst-color-danger);
cursor: auto;
}
div[data-param-k="model-group"]:not([data-param-state]):hover {
background-color: var(--compat-param-hover-color);
}
*/
.model-param-head {
background-color: var(--compat-head-color);
padding: 0.15rem 0.15rem 0.15rem 0.67rem;
/* margin: 2px; */
border-right: solid 2px var(--compat-accent-color);
border-right: solid 4px var(--compat-accent-color);
font-weight: 600;
}
.model-param {
/* padding: 2px; */
/* margin: 0 2px 0 2px; */
/* margin: 2px; */
border: solid 1px var(--compat-border-color);
font-weight: 500;
}