diff --git a/.wordlist.txt b/.wordlist.txt index 8cc6399b6..cf9f990d4 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -673,6 +673,7 @@ github globals gnupg grayscale +gx gzip heterogenous hipBLAS @@ -783,6 +784,7 @@ parallelizing param parameterization passthrough +pe perfcounter performant perl @@ -812,6 +814,7 @@ profiler profilers protobuf pseudorandom +px py pytorch recommender diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml new file mode 100644 index 000000000..418415319 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml @@ -0,0 +1,91 @@ +vllm_benchmark: + unified_docker: + latest: + pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa + rocm_version: 6.4.1 + vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641) + pytorch_version: 2.7.0+gitf717b2a + hipblaslt_version: 0.15 + model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.1 8B + mad_tag: pyt_vllm_llama-3.1-8b + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: float16 + - model: Llama 3.1 70B + mad_tag: pyt_vllm_llama-3.1-70b + model_repo: meta-llama/Llama-3.1-70B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct + precision: float16 + - model: Llama 3.1 405B + mad_tag: pyt_vllm_llama-3.1-405b + model_repo: meta-llama/Llama-3.1-405B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct + precision: float16 + - model: Llama 2 70B + mad_tag: pyt_vllm_llama-2-70b + model_repo: meta-llama/Llama-2-70b-chat-hf + url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + precision: float16 + - model: Llama 3.1 8B FP8 + mad_tag: pyt_vllm_llama-3.1-8b_fp8 + model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV + precision: float8 + - model: Llama 3.1 70B FP8 + mad_tag: pyt_vllm_llama-3.1-70b_fp8 + model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV + precision: float8 + - model: Llama 3.1 405B FP8 + mad_tag: pyt_vllm_llama-3.1-405b_fp8 + model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV + precision: float8 + - group: Mistral AI + tag: mistral + models: + - model: Mixtral MoE 8x7B + mad_tag: pyt_vllm_mixtral-8x7b + model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + precision: float16 + - model: Mixtral MoE 8x22B + mad_tag: pyt_vllm_mixtral-8x22b + model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 + precision: float16 + - model: Mixtral MoE 8x7B FP8 + mad_tag: pyt_vllm_mixtral-8x7b_fp8 + model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + precision: float8 + - model: Mixtral MoE 8x22B FP8 + mad_tag: pyt_vllm_mixtral-8x22b_fp8 + model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + precision: float8 + - group: Qwen + tag: qwen + models: + - model: QwQ-32B + mad_tag: 
pyt_vllm_qwq-32b + model_repo: Qwen/QwQ-32B + url: https://huggingface.co/Qwen/QwQ-32B + precision: float16 + - model: Qwen3 30B A3B + mad_tag: pyt_vllm_qwen3-30b-a3b + model_repo: Qwen/Qwen3-30B-A3B + url: https://huggingface.co/Qwen/Qwen3-30B-A3B + precision: float16 + - group: Microsoft Phi + tag: phi + models: + - model: Phi-4 + mad_tag: pyt_vllm_phi-4 + model_repo: microsoft/phi-4 + url: https://huggingface.co/microsoft/phi-4 diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml similarity index 100% rename from docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml rename to docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml diff --git a/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml index cc832dffb..8f80424d3 100644 --- a/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml @@ -1,17 +1,16 @@ -sglang_benchmark: - unified_docker: - latest: - pull_tag: lmsysorg/sglang:v0.4.5-rocm630 - docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951 - rocm_version: 6.3.0 - sglang_version: 0.4.5 (0.4.5-rocm) - pytorch_version: 2.6.0a0+git8d4926e - model_groups: - - group: DeepSeek - tag: deepseek - models: - - model: DeepSeek-R1-Distill-Qwen-32B - mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b - model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B - url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B - precision: bfloat16 +dockers: + - pull_tag: lmsysorg/sglang:v0.4.5-rocm630 + docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951 + components: + ROCm: 6.3.0 + SGLang: 0.4.5 (0.4.5-rocm) + PyTorch: 2.6.0a0+git8d4926e +model_groups: + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-R1-Distill-Qwen-32B + mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b + model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B + url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B + precision: bfloat16 diff --git a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml index a522e61a6..99d9b773b 100644 --- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml @@ -1,92 +1,188 @@ -vllm_benchmark: - unified_docker: - latest: - # TODO: update me - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812 - docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa - rocm_version: 6.4.1 - vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641) - pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a) - hipblaslt_version: 0.15 - model_groups: - - group: Meta Llama - tag: llama - models: - - model: Llama 3.1 8B - mad_tag: pyt_vllm_llama-3.1-8b - model_repo: meta-llama/Llama-3.1-8B-Instruct - url: https://huggingface.co/meta-llama/Llama-3.1-8B - precision: float16 - - model: Llama 3.1 70B - mad_tag: 
pyt_vllm_llama-3.1-70b - model_repo: meta-llama/Llama-3.1-70B-Instruct - url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct - precision: float16 - - model: Llama 3.1 405B - mad_tag: pyt_vllm_llama-3.1-405b - model_repo: meta-llama/Llama-3.1-405B-Instruct - url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct - precision: float16 - - model: Llama 2 70B - mad_tag: pyt_vllm_llama-2-70b - model_repo: meta-llama/Llama-2-70b-chat-hf - url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf - precision: float16 - - model: Llama 3.1 8B FP8 - mad_tag: pyt_vllm_llama-3.1-8b_fp8 - model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV - url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV - precision: float8 - - model: Llama 3.1 70B FP8 - mad_tag: pyt_vllm_llama-3.1-70b_fp8 - model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV - url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV - precision: float8 - - model: Llama 3.1 405B FP8 - mad_tag: pyt_vllm_llama-3.1-405b_fp8 - model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV - url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV - precision: float8 - - group: Mistral AI - tag: mistral - models: - - model: Mixtral MoE 8x7B - mad_tag: pyt_vllm_mixtral-8x7b - model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 - url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 - precision: float16 - - model: Mixtral MoE 8x22B - mad_tag: pyt_vllm_mixtral-8x22b - model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 - url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 - precision: float16 - - model: Mixtral MoE 8x7B FP8 - mad_tag: pyt_vllm_mixtral-8x7b_fp8 - model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV - url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV - precision: float8 - - model: Mixtral MoE 8x22B FP8 - mad_tag: pyt_vllm_mixtral-8x22b_fp8 - model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV - url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV - precision: float8 - - group: Qwen - tag: qwen - models: - - model: QwQ-32B - mad_tag: pyt_vllm_qwq-32b - model_repo: Qwen/QwQ-32B - url: https://huggingface.co/Qwen/QwQ-32B - precision: float16 - - model: Qwen3 30B A3B - mad_tag: pyt_vllm_qwen3-30b-a3b - model_repo: Qwen/Qwen3-30B-A3B - url: https://huggingface.co/Qwen/Qwen3-30B-A3B - precision: float16 - - group: Microsoft Phi - tag: phi - models: - - model: Phi-4 - mad_tag: pyt_vllm_phi-4 - model_repo: microsoft/phi-4 - url: https://huggingface.co/microsoft/phi-4 +dockers: + - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c + components: + ROCm: 6.4.1 + vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641) + PyTorch: 2.7.0+gitf717b2a + hipBLASLt: 0.15 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.1 8B + mad_tag: pyt_vllm_llama-3.1-8b + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 70B + mad_tag: pyt_vllm_llama-3.1-70b + model_repo: meta-llama/Llama-3.1-70B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 
131072 + max_model_len: 8192 + - model: Llama 3.1 405B + mad_tag: pyt_vllm_llama-3.1-405b + model_repo: meta-llama/Llama-3.1-405B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 2 70B + mad_tag: pyt_vllm_llama-2-70b + model_repo: meta-llama/Llama-2-70b-chat-hf + url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 4096 + max_num_batched_tokens: 4096 + max_model_len: 4096 + - model: Llama 3.1 8B FP8 + mad_tag: pyt_vllm_llama-3.1-8b_fp8 + model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV + precision: float8 + config: + tp: 1 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 70B FP8 + mad_tag: pyt_vllm_llama-3.1-70b_fp8 + model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B FP8 + mad_tag: pyt_vllm_llama-3.1-405b_fp8 + model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - group: Mistral AI + tag: mistral + models: + - model: Mixtral MoE 8x7B + mad_tag: pyt_vllm_mixtral-8x7b + model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 32768 + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Mixtral MoE 8x22B + mad_tag: pyt_vllm_mixtral-8x22b + model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 65536 + max_num_batched_tokens: 65536 + max_model_len: 8192 + - model: Mixtral MoE 8x7B FP8 + mad_tag: pyt_vllm_mixtral-8x7b_fp8 + model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 32768 + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Mixtral MoE 8x22B FP8 + mad_tag: pyt_vllm_mixtral-8x22b_fp8 + model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 65536 + max_num_batched_tokens: 65536 + max_model_len: 8192 + - group: Qwen + tag: qwen + models: + - model: QwQ-32B + mad_tag: pyt_vllm_qwq-32b + model_repo: Qwen/QwQ-32B + url: https://huggingface.co/Qwen/QwQ-32B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Qwen3 30B A3B + mad_tag: pyt_vllm_qwen3-30b-a3b + model_repo: 
Qwen/Qwen3-30B-A3B + url: https://huggingface.co/Qwen/Qwen3-30B-A3B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 32768 + max_num_batched_tokens: 32768 + max_model_len: 8192 + - group: Microsoft Phi + tag: phi + models: + - model: Phi-4 + mad_tag: pyt_vllm_phi-4 + model_repo: microsoft/phi-4 + url: https://huggingface.co/microsoft/phi-4 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 16384 + max_num_batched_tokens: 16384 + max_model_len: 8192 diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst new file mode 100644 index 000000000..68d7f66e7 --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst @@ -0,0 +1,445 @@ +:orphan: + +.. meta:: + :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the + ROCm vLLM Docker image. + :keywords: model, MAD, automation, dashboarding, validate + +********************************** +vLLM inference performance testing +********************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm vLLM + inference performance documentation. See :doc:`../vllm` for the latest version. + +.. _vllm-benchmark-unified-docker-812: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml + + {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} + {% set model_groups = data.vllm_benchmark.model_groups %} + + The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers + a prebuilt, optimized environment for validating large language model (LLM) + inference performance on AMD Instinctâ„¢ MI300X series accelerators. This ROCm vLLM + Docker image integrates vLLM and PyTorch tailored specifically for MI300X series + accelerators and includes the following components: + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + * - `ROCm `__ + - {{ unified_docker.rocm_version }} + + * - `vLLM `__ + - {{ unified_docker.vllm_version }} + + * - `PyTorch `__ + - {{ unified_docker.pytorch_version }} + + * - `hipBLASLt `__ + - {{ unified_docker.hipblaslt_version }} + +With this Docker image, you can quickly test the :ref:`expected +inference performance numbers ` for +MI300X series accelerators. + +What's new +========== + +The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release `. + +* Upgraded to vLLM v0.10. + +* FP8 KV cache support via AITER. + +* Full graph capture support via AITER. + +Supported models +================ + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml + + {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} + {% set model_groups = data.vllm_benchmark.model_groups %} + + .. _vllm-benchmark-available-models-812: + + The following models are supported for inference performance benchmarking + with vLLM and ROCm. Some instructions, commands, and recommendations in this + documentation might vary by model -- select one to get started. + + .. raw:: html + +
+      [Raw HTML for the model picker in this new file, garbled in extraction: a "Model group" selector row and a "Model" selector row, each rendered by Jinja loops over model_groups and their models (column width chosen from models|length % 3).]
+
+ + .. _vllm-benchmark-vllm-812: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + .. note:: + + See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. + Some models require access authorization prior to use via an external license agreement through a third party. + + {% endfor %} + {% endfor %} + +.. note:: + + vLLM is a toolkit and library for LLM inference and serving. AMD implements + high-performance custom kernels and modules in vLLM to enhance performance. + See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for + more information. + +.. _vllm-benchmark-performance-measurements-812: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `_ +page provides reference throughput and serving measurements for inferencing popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `_ + only reflects the latest version of this inference benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml + + {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} + {% set model_groups = data.vllm_benchmark.model_groups %} + + Pull the Docker image + ===================== + + Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_. + Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ unified_docker.pull_tag }} + + Benchmarking + ============ + + Once the setup is complete, choose between two options to reproduce the + benchmark results: + + .. _vllm-benchmark-mad-812: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model + using one GPU with the :literal:`{{model.precision}}` data type on the host machine. + + .. 
code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{model.mad_tag}} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the + model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` + and ``{{ model.mad_tag }}_serving.csv``. + + Although the :ref:`available models + ` are preconfigured to collect + offline throughput and online serving performance data, you can + also change the benchmarking parameters. See the standalone + benchmarking tab for more information. + + {% if model.tunableop %} + + .. note:: + + For improved performance, consider enabling :ref:`PyTorch TunableOp `. + TunableOp automatically explores different implementations and configurations of certain PyTorch + operators to find the fastest one for your hardware. + + By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see + ``__). To enable it, include + the ``--tunableop on`` argument in your run. + + Enabling TunableOp triggers a two-pass run -- a warm-up followed by the + performance-collection run. + + {% endif %} + + .. tab-item:: Standalone benchmarking + + .. rubric:: Download the Docker image and required scripts + + 1. Run the vLLM benchmark tool independently by starting the + `Docker container <{{ unified_docker.docker_hub_url }}>`_ + as shown in the following snippet. + + .. code-block:: shell + + docker pull {{ unified_docker.pull_tag }} + docker run -it \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --shm-size 16G \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --cap-add=SYS_PTRACE \ + -v $(pwd):/workspace \ + --env HUGGINGFACE_HUB_CACHE=/workspace \ + --name test \ + {{ unified_docker.pull_tag }} + + 2. In the Docker container, clone the ROCm MAD repository and navigate to the + benchmark scripts directory at ``~/MAD/scripts/vllm``. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD/scripts/vllm + + 3. To start the benchmark, use the following command with the appropriate options. + + .. code-block:: + + ./run.sh \ + --config $CONFIG_CSV \ + --model_repo {{ model.model_repo }} \ + + + .. dropdown:: Benchmark options + :open: + + .. list-table:: + :header-rows: 1 + :align: center + + * - Name + - Options + - Description + + * - ``--config`` + - ``configs/default.csv`` + - Run configs from the CSV for the chosen model repo and benchmark. + + * - + - ``configs/extended.csv`` + - + + * - + - ``configs/performance.csv`` + - + + * - ``--benchmark`` + - ``throughput`` + - Measure offline end-to-end throughput. + + * - + - ``serving`` + - Measure online serving performance. + + * - + - ``all`` + - Measure both throughput and serving. + + * - `` + - See `run.sh `__ for more info. + - Additional overrides to the config CSV. + + The input sequence length, output sequence length, and tensor parallel (TP) are + already configured. You don't need to specify them with this script. + + .. note:: + + For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``. + + If you encounter the following error, pass your access-authorized Hugging + Face token to the gated models. + + .. code-block:: + + OSError: You are trying to access a gated repo. + + # pass your HF_TOKEN + export HF_TOKEN=$your_personal_hf_token + + .. 
rubric:: Benchmarking examples + + Here are some examples of running the benchmark with various options: + + * Throughput benchmark + + Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. + + .. code-block:: shell + + export MAD_MODEL_NAME={{ model.mad_tag }} + ./run.sh \ + --config configs/default.csv \ + --model_repo {{model.model_repo}} \ + --benchmark throughput + + Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``. + + * Serving benchmark + + Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. + + .. code-block:: + + export MAD_MODEL_NAME={{ model.mad_tag }} + ./run.sh \ + --config configs/default.csv \ + --model_repo {{model.model_repo}} \ + --benchmark serving + + Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``. + + .. raw:: html + + + + .. note:: + + Throughput is calculated as: + + - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time + + - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time + {% endfor %} + {% endfor %} + +Advanced usage +============== + +For information on experimental features and known issues related to ROCm optimization efforts on vLLM, +see the developer's guide at ``__. + +Reproducing the Docker image +---------------------------- + +To reproduce this ROCm/vLLM Docker image release, follow these steps: + +1. Clone the `vLLM repository `__. + + .. code-block:: shell + + git clone https://github.com/ROCm/vllm.git + +2. Checkout the specific release commit. + + .. code-block:: shell + + cd vllm + git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978 + +3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag. + + .. code-block:: shell + + docker build -f docker/Dockerfile.rocm -t vllm-rocm . + +Further reading +=============== + +- To learn more about the options for latency and throughput benchmark scripts, + see ``_. + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. + +- For application performance optimization strategies for HPC and AI workloads, + including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`. + +- To learn how to run community models from Hugging Face on AMD GPUs, see + :doc:`Running models from Hugging Face `. + +- To learn how to fine-tune LLMs and optimize inference, see + :doc:`Fine-tuning LLMs and inference optimization `. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`vllm-history` to find documentation for previous releases +of the ``ROCm/vllm`` Docker image. 
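For reference, the throughput formulas defined above work out as follows for an assumed example run (1,024 prompts, 128 input and 128 output tokens per prompt, 100 seconds elapsed; illustrative values only, not measured results):

.. math:: throughput\_tot = 1024 \times (128 + 128) / 100 \approx 2621 \text{ tokens/s}

.. math:: throughput\_gen = 1024 \times 128 / 100 \approx 1311 \text{ tokens/s}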
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst index 34df0359d..9f6d001ad 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst @@ -16,7 +16,7 @@ vLLM inference performance testing .. _vllm-benchmark-unified-docker-715: -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} {% set model_groups = data.vllm_benchmark.model_groups %} @@ -69,7 +69,7 @@ The following is summary of notable changes since the :doc:`previous ROCm/vLLM D Supported models ================ -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} {% set model_groups = data.vllm_benchmark.model_groups %} @@ -162,7 +162,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben `. This suite of tests will help you verify and fine-tune your system's configuration. -.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} {% set model_groups = data.vllm_benchmark.model_groups %} diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst index 6f87670ec..857a1ee0b 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst @@ -7,7 +7,7 @@ vLLM inference performance testing version history This table lists previous versions of the ROCm vLLM inference Docker image for inference performance testing. For detailed information about available models for benchmarking, see the version-specific documentation. You can find tagged -previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub `__. +previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub `__. .. list-table:: :header-rows: 1 diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst index b9e22bf33..ad8db53c4 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst @@ -31,26 +31,30 @@ PyTorch inference performance testing .. raw:: html
[Raw HTML model picker hunk in pytorch-inference.rst, garbled in extraction: the old single "Model" selector row is removed and replaced with updated picker markup.]
- - + +
{% for model_group in model_groups %} diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst index 340ef975e..1722b2018 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst @@ -2,19 +2,19 @@ :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang :keywords: model, MAD, automation, dashboarding, validate -************************************ -SGLang inference performance testing -************************************ +***************************************************************** +SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B +***************************************************************** .. _sglang-benchmark-unified-docker: .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml - {% set unified_docker = data.sglang_benchmark.unified_docker.latest %} + {% set docker = data.dockers[0] %} `SGLang `__ is a high-performance inference and serving engine for large language models (LLMs) and vision models. The - ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__ + ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__ bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series accelerators. It includes the following software components: @@ -24,14 +24,10 @@ SGLang inference performance testing * - Software component - Version - * - `ROCm `__ - - {{ unified_docker.rocm_version }} - - * - `SGLang `__ - - {{ unified_docker.sglang_version }} - - * - `PyTorch `__ - - {{ unified_docker.pytorch_version }} + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} System validation ================= @@ -50,8 +46,8 @@ system's configuration. .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml - {% set unified_docker = data.sglang_benchmark.unified_docker.latest %} - {% set model_groups = data.sglang_benchmark.model_groups %} + {% set unified_docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} Pull the Docker image ===================== diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst index 9f3bd608d..f2b060ebd 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst @@ -7,14 +7,13 @@ vLLM inference performance testing ********************************** -.. _vllm-benchmark-unified-docker-812: +.. _vllm-benchmark-unified-docker-909: .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml - {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} - {% set model_groups = data.vllm_benchmark.model_groups %} + {% set docker = data.dockers[0] %} - The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers + The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a prebuilt, optimized environment for validating large language model (LLM) inference performance on AMD Instinctâ„¢ MI300X series accelerators. 
This ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for MI300X series @@ -26,20 +25,13 @@ vLLM inference performance testing * - Software component - Version - * - `ROCm `__ - - {{ unified_docker.rocm_version }} - - * - `vLLM `__ - - {{ unified_docker.vllm_version }} - - * - `PyTorch `__ - - {{ unified_docker.pytorch_version }} - - * - `hipBLASLt `__ - - {{ unified_docker.hipblaslt_version }} + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} With this Docker image, you can quickly test the :ref:`expected -inference performance numbers ` for +inference performance numbers ` for MI300X series accelerators. What's new @@ -47,21 +39,23 @@ What's new The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release `. -* Upgraded to vLLM v0.10. +* Upgraded to vLLM v0.10.1. -* FP8 KV cache support via AITER. +* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance. -* Full graph capture support via AITER. +* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile. + +.. _vllm-benchmark-supported-models-909: Supported models ================ .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml - {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} - {% set model_groups = data.vllm_benchmark.model_groups %} + {% set docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} - .. _vllm-benchmark-available-models-812: + .. _vllm-benchmark-available-models-909: The following models are supported for inference performance benchmarking with vLLM and ROCm. Some instructions, commands, and recommendations in this @@ -70,55 +64,51 @@ Supported models .. raw:: html
[Raw HTML model picker hunk in vllm.rst, garbled in extraction: the old "Model group" and "Model" selector rows are replaced with "Model" and "Variant" rows, still rendered by Jinja loops over model_groups and their models.]
-
- .. _vllm-benchmark-vllm-812: + .. _vllm-benchmark-vllm-909: {% for model_group in model_groups %} {% for model in model_group.models %} - .. container:: model-doc {{model.mad_tag}} + .. container:: model-doc {{ model.mad_tag }} .. note:: See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. Some models require access authorization prior to use via an external license agreement through a third party. + {% if model.precision == "float8" and model.model_repo.startswith("amd") %} + This model uses FP8 quantization via `AMD Quark `__ for efficient inference on AMD accelerators. + {% endif %} {% endfor %} {% endfor %} -.. note:: - - vLLM is a toolkit and library for LLM inference and serving. AMD implements - high-performance custom kernels and modules in vLLM to enhance performance. - See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for - more information. - -.. _vllm-benchmark-performance-measurements-812: +.. _vllm-benchmark-performance-measurements-909: Performance measurements ======================== @@ -151,18 +141,18 @@ system's configuration. .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml - {% set unified_docker = data.vllm_benchmark.unified_docker.latest %} - {% set model_groups = data.vllm_benchmark.model_groups %} + {% set docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} Pull the Docker image ===================== - Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_. + Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_. Use the following command to pull the Docker image from Docker Hub. .. code-block:: shell - docker pull {{ unified_docker.pull_tag }} + docker pull {{ docker.pull_tag }} Benchmarking ============ @@ -170,7 +160,7 @@ system's configuration. Once the setup is complete, choose between two options to reproduce the benchmark results: - .. _vllm-benchmark-mad-812: + .. _vllm-benchmark-mad-909: {% for model_group in model_groups %} {% for model in model_group.models %} @@ -181,6 +171,9 @@ system's configuration. .. tab-item:: MAD-integrated benchmarking + The following run command is tailored to {{ model.model }}. + See :ref:`vllm-benchmark-supported-models-909` to switch to another available model. + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local directory and install the required packages on the host machine. @@ -208,7 +201,7 @@ system's configuration. and ``{{ model.mad_tag }}_serving.csv``. Although the :ref:`available models - ` are preconfigured to collect + ` are preconfigured to collect offline throughput and online serving performance data, you can also change the benchmarking parameters. See the standalone benchmarking tab for more information. @@ -232,132 +225,143 @@ system's configuration. .. tab-item:: Standalone benchmarking - .. rubric:: Download the Docker image and required scripts + The following commands are optimized for {{ model.model }}. + See :ref:`vllm-benchmark-supported-models-909` to switch to another available model. - 1. Run the vLLM benchmark tool independently by starting the - `Docker container <{{ unified_docker.docker_hub_url }}>`_ - as shown in the following snippet. + .. seealso:: + + For more information on configuration, see the `config files + `__ + in the MAD repository. 
Refer to the `vLLM engine `__ + for descriptions of available configuration options + and `Benchmarking vLLM `__ for + additional benchmarking information. + + .. rubric:: Launch the container + + You can run the vLLM benchmark tool independently by starting the + `Docker container <{{ docker.docker_hub_url }}>`_ as shown + in the following snippet. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + docker run -it \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --shm-size 16G \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --cap-add=SYS_PTRACE \ + -v $(pwd):/workspace \ + --env HUGGINGFACE_HUB_CACHE=/workspace \ + --name test \ + {{ docker.pull_tag }} + + .. rubric:: Throughput command + + Use the following command to start the throughput benchmark. + + .. code-block:: shell + + model={{ model.model_repo }} + tp={{ model.config.tp }} + num_prompts=1024 + in=128 + out=128 + dtype={{ model.config.dtype }} + kv_cache_dtype={{ model.config.kv_cache_dtype }} + max_num_seqs=1024 + max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }} + max_num_batched_tokens={{ model.config.max_num_batched_tokens }} + max_model_len={{ model.config.max_model_len }} + + vllm bench throughput --model $model \ + -tp $tp \ + --num-prompts $num_prompts \ + --input-len $in \ + --output-len $out \ + --dtype $dtype \ + --kv-cache-dtype $kv_cache_dtype \ + --max-num-seqs $max_num_seqs \ + --max-seq-len-to-capture $max_seq_len_to_capture \ + --max-num-batched-tokens $max_num_batched_tokens \ + --max-model-len $max_model_len \ + --trust-remote-code \ + --output-json ${model}_throughput.json \ + --gpu-memory-utilization 0.9 + + .. rubric:: Serving command + + 1. Start the server using the following command: .. code-block:: shell - docker pull {{ unified_docker.pull_tag }} - docker run -it \ - --device=/dev/kfd \ - --device=/dev/dri \ - --group-add video \ - --shm-size 16G \ - --security-opt seccomp=unconfined \ - --security-opt apparmor=unconfined \ - --cap-add=SYS_PTRACE \ - -v $(pwd):/workspace \ - --env HUGGINGFACE_HUB_CACHE=/workspace \ - --name test \ - {{ unified_docker.pull_tag }} + model={{ model.model_repo }} + tp={{ model.config.tp }} + dtype={{ model.config.dtype }} + kv_cache_dtype={{ model.config.kv_cache_dtype }} + max_num_seqs=256 + max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }} + max_num_batched_tokens={{ model.config.max_num_batched_tokens }} + max_model_len={{ model.config.max_model_len }} - 2. In the Docker container, clone the ROCm MAD repository and navigate to the - benchmark scripts directory at ``~/MAD/scripts/vllm``. + vllm serve $model \ + -tp $tp \ + --dtype $dtype \ + --kv-cache-dtype $kv_cache_dtype \ + --max-num-seqs $max_num_seqs \ + --max-seq-len-to-capture $max_seq_len_to_capture \ + --max-num-batched-tokens $max_num_batched_tokens \ + --max-model-len $max_model_len \ + --no-enable-prefix-caching \ + --swap-space 16 \ + --disable-log-requests \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 + + Wait until the model has loaded and the server is ready to accept requests. + + 2. On another terminal on the same machine, run the benchmark: .. code-block:: shell - git clone https://github.com/ROCm/MAD - cd MAD/scripts/vllm + # Connect to the container + docker exec -it test bash - 3. To start the benchmark, use the following command with the appropriate options. 
+ # Wait for the server to start + until curl -s http://localhost:8000/v1/models; do sleep 30; done + + # Run the benchmark + model={{ model.model_repo }} + max_concurrency=1 + num_prompts=10 + in=128 + out=128 + vllm bench serve --model $model \ + --percentile-metrics "ttft,tpot,itl,e2el" \ + --dataset-name random \ + --ignore-eos \ + --max-concurrency $max_concurrency \ + --num-prompts $num_prompts \ + --random-input-len $in \ + --random-output-len $out \ + --trust-remote-code \ + --save-result \ + --result-filename ${model}_serving.json + + .. note:: + + If you encounter the following error, pass your access-authorized Hugging + Face token to the gated models. .. code-block:: - ./run.sh \ - --config $CONFIG_CSV \ - --model_repo {{ model.model_repo }} \ - + OSError: You are trying to access a gated repo. - .. dropdown:: Benchmark options - :open: - - .. list-table:: - :header-rows: 1 - :align: center - - * - Name - - Options - - Description - - * - ``--config`` - - ``configs/default.csv`` - - Run configs from the CSV for the chosen model repo and benchmark. - - * - - - ``configs/extended.csv`` - - - - * - - - ``configs/performance.csv`` - - - - * - ``--benchmark`` - - ``throughput`` - - Measure offline end-to-end throughput. - - * - - - ``serving`` - - Measure online serving performance. - - * - - - ``all`` - - Measure both throughput and serving. - - * - `` - - See `run.sh `__ for more info. - - Additional overrides to the config CSV. - - The input sequence length, output sequence length, and tensor parallel (TP) are - already configured. You don't need to specify them with this script. - - .. note:: - - For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``. - - If you encounter the following error, pass your access-authorized Hugging - Face token to the gated models. - - .. code-block:: - - OSError: You are trying to access a gated repo. - - # pass your HF_TOKEN - export HF_TOKEN=$your_personal_hf_token - - .. rubric:: Benchmarking examples - - Here are some examples of running the benchmark with various options: - - * Throughput benchmark - - Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. - - .. code-block:: shell - - export MAD_MODEL_NAME={{ model.mad_tag }} - ./run.sh \ - --config configs/default.csv \ - --model_repo {{model.model_repo}} \ - --benchmark throughput - - Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``. - - * Serving benchmark - - Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision. - - .. code-block:: - - export MAD_MODEL_NAME={{ model.mad_tag }} - ./run.sh \ - --config configs/default.csv \ - --model_repo {{model.model_repo}} \ - --benchmark serving - - Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``. + # pass your HF_TOKEN + export HF_TOKEN=$your_personal_hf_token .. raw:: html @@ -382,7 +386,7 @@ Advanced usage ============== For information on experimental features and known issues related to ROCm optimization efforts on vLLM, -see the developer's guide at ``__. +see the developer's guide at ``__. Reproducing the Docker image ---------------------------- @@ -400,7 +404,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps: .. code-block:: shell cd vllm - git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978 + git checkout 6663000a391911eba96d7864a26ac42b07f6ef29 3. 
Build the Docker image. Replace ``vllm-rocm`` with your desired image tag. @@ -419,15 +423,12 @@ Further reading - To learn more about system settings and management practices to configure your system for AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. +- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for + a brief introduction to vLLM and optimization strategies. + - For application performance optimization strategies for HPC and AI workloads, including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`. -- To learn how to run community models from Hugging Face on AMD GPUs, see - :doc:`Running models from Hugging Face `. - -- To learn how to fine-tune LLMs and optimize inference, see - :doc:`Fine-tuning LLMs and inference optimization `. - - For a list of other ready-made Docker images for AI with ROCm, see `AMD Infinity Hub `_. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst index a85f5af56..76c3582e7 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst @@ -82,32 +82,32 @@ started. {% set model_groups = data.model_groups %} .. raw:: html -
[Raw HTML model picker hunk in jax-maxtext.rst, garbled in extraction: the "Model variant" selector heading is renamed to "Variant"; the "Model" group row and per-model loop are otherwise retained.]
+
.. note:: @@ -208,7 +208,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`. e. RDMA interface - Ensure the :ref:`required packages ` are installed on all nodes. + Ensure the :ref:`required packages ` are installed on all nodes. Then, set the RDMA interfaces to use for communication. .. code-block:: bash diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst index 687cc514f..4df1da960 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst @@ -70,32 +70,32 @@ workloads: {% set model_groups = data.model_groups %} .. raw:: html -
[Raw HTML model picker hunk in megatron-lm.rst, garbled in extraction: the "Model variant" selector heading is renamed to "Variant".]
+
.. note:: diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst index d5051d28c..9bd7081d2 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst @@ -112,7 +112,7 @@ Multi-node setup For multi-node environments, ensure you have all the necessary packages for your network device, such as, RDMA. If you're not using a multi-node setup -with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`. +with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`. 1. Install the following packages to build and install the RDMA driver. @@ -177,7 +177,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`. e. RDMA interface - Ensure the :ref:`required packages ` are installed on all nodes. + Ensure the :ref:`required packages ` are installed on all nodes. Then, set the RDMA interfaces to use for communication. .. code-block:: bash @@ -199,7 +199,7 @@ Pull the Docker image docker pull rocm/jax-training:maxtext-v25.5 2. Use the following command to launch the Docker container. Note that the benchmarking scripts - used in the :ref:`following section ` automatically launch the Docker container + used in the :ref:`following section ` automatically launch the Docker container and execute the benchmark. .. code-block:: shell diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst index 0a80c7c9b..81ec4ed50 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst @@ -55,32 +55,32 @@ vary by model -- select one to get started. {% set model_groups = data.model_groups %} .. raw:: html -
[Raw HTML model picker hunk in primus-megatron.rst, garbled in extraction: the "Model variant" selector heading is renamed to "Variant".]
+
.. note:: diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst index e7258e07b..d8ab01318 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst @@ -45,30 +45,30 @@ vary by model -- select one to get started. .. raw:: html
[Raw HTML model picker hunk in pytorch-training.rst, garbled in extraction: the "Model group" and "Model variant" selector headings are renamed to "Model" and "Variant".]
+
diff --git a/docs/sphinx/static/css/vllm-benchmark.css b/docs/sphinx/static/css/vllm-benchmark.css index 4c10b1ffb..231bb2cac 100644 --- a/docs/sphinx/static/css/vllm-benchmark.css +++ b/docs/sphinx/static/css/vllm-benchmark.css @@ -7,15 +7,14 @@ html { --compat-head-color: var(--pst-color-surface); --compat-param-hover-color: var(--pst-color-link-hover); --compat-param-selected-color: var(--pst-color-primary); + --compat-border-color: var(--pst-color-border); } html[data-theme="light"] { - --compat-border-color: var(--pst-gray-500); --compat-param-disabled-color: var(--pst-gray-300); } html[data-theme="dark"] { - --compat-border-color: var(--pst-gray-600); --compat-param-disabled-color: var(--pst-gray-600); } @@ -23,6 +22,7 @@ div#vllm-benchmark-ud-params-picker.container-fluid { padding: 0 0 1rem 0; } +div[data-param-k="model-group"], div[data-param-k="model"] { background-color: var(--compat-bg-color); padding: 2px; @@ -31,40 +31,19 @@ div[data-param-k="model"] { cursor: pointer; } +div[data-param-k="model-group"][data-param-state="selected"], div[data-param-k="model"][data-param-state="selected"] { background-color: var(--compat-param-selected-color); color: var(--compat-fg-color); } -div[data-param-k="model"][data-param-state="latest-version"] { - background-color: var(--compat-param-selected-color); - color: var(--compat-fg-color); -} - -div[data-param-k="model"][data-param-state="disabled"] { - background-color: var(--compat-param-disabled-color); - text-decoration: line-through; - /* text-decoration-color: var(--pst-color-danger); */ - cursor: auto; -} - -div[data-param-k="model"]:not([data-param-state]):hover { +div[data-param-k="model-group"]:hover, +div[data-param-k="model"]:hover { background-color: var(--compat-param-hover-color); -} - -div[data-param-k="model-group"] { - background-color: var(--compat-bg-color); - padding: 2px; - border: solid 1px var(--compat-border-color); - font-weight: 500; - cursor: pointer; -} - -div[data-param-k="model-group"][data-param-state="selected"] { - background-color: var(--compat-param-selected-color); color: var(--compat-fg-color); } +/* div[data-param-k="model-group"][data-param-state="latest-version"] { background-color: var(--compat-param-selected-color); color: var(--compat-fg-color); @@ -73,26 +52,19 @@ div[data-param-k="model-group"][data-param-state="latest-version"] { div[data-param-k="model-group"][data-param-state="disabled"] { background-color: var(--compat-param-disabled-color); text-decoration: line-through; - /* text-decoration-color: var(--pst-color-danger); */ + text-decoration-color: var(--pst-color-danger); cursor: auto; } - -div[data-param-k="model-group"]:not([data-param-state]):hover { - background-color: var(--compat-param-hover-color); -} +*/ .model-param-head { background-color: var(--compat-head-color); padding: 0.15rem 0.15rem 0.15rem 0.67rem; - /* margin: 2px; */ - border-right: solid 2px var(--compat-accent-color); + border-right: solid 4px var(--compat-accent-color); font-weight: 600; } .model-param { - /* padding: 2px; */ - /* margin: 0 2px 0 2px; */ - /* margin: 2px; */ border: solid 1px var(--compat-border-color); font-weight: 500; }