Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-09 22:58:17 -05:00)

Commit: Update vLLM inference benchmark doc for 0909 release (and Sphinx fixes) (#5289)
@@ -673,6 +673,7 @@ github
 globals
 gnupg
 grayscale
+gx
 gzip
 heterogenous
 hipBLAS
@@ -783,6 +784,7 @@ parallelizing
 param
 parameterization
 passthrough
+pe
 perfcounter
 performant
 perl
@@ -812,6 +814,7 @@ profiler
 profilers
 protobuf
 pseudorandom
+px
 py
 pytorch
 recommender
@@ -0,0 +1,91 @@
vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
      rocm_version: 6.4.1
      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
      pytorch_version: 2.7.0+gitf717b2a
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
        - model: Llama 3.1 8B
          mad_tag: pyt_vllm_llama-3.1-8b
          model_repo: meta-llama/Llama-3.1-8B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-8B
          precision: float16
        - model: Llama 3.1 70B
          mad_tag: pyt_vllm_llama-3.1-70b
          model_repo: meta-llama/Llama-3.1-70B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
          precision: float16
        - model: Llama 3.1 405B
          mad_tag: pyt_vllm_llama-3.1-405b
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
          precision: float16
        - model: Llama 3.1 8B FP8
          mad_tag: pyt_vllm_llama-3.1-8b_fp8
          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 70B FP8
          mad_tag: pyt_vllm_llama-3.1-70b_fp8
          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 405B FP8
          mad_tag: pyt_vllm_llama-3.1-405b_fp8
          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
          precision: float8
    - group: Mistral AI
      tag: mistral
      models:
        - model: Mixtral MoE 8x7B
          mad_tag: pyt_vllm_mixtral-8x7b
          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x22B
          mad_tag: pyt_vllm_mixtral-8x22b
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mixtral MoE 8x22B FP8
          mad_tag: pyt_vllm_mixtral-8x22b_fp8
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: QwQ-32B
          mad_tag: pyt_vllm_qwq-32b
          model_repo: Qwen/QwQ-32B
          url: https://huggingface.co/Qwen/QwQ-32B
          precision: float16
        - model: Qwen3 30B A3B
          mad_tag: pyt_vllm_qwen3-30b-a3b
          model_repo: Qwen/Qwen3-30B-A3B
          url: https://huggingface.co/Qwen/Qwen3-30B-A3B
          precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
        - model: Phi-4
          mad_tag: pyt_vllm_phi-4
          model_repo: microsoft/phi-4
          url: https://huggingface.co/microsoft/phi-4
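
The documentation templates read this file through paths such as ``data.vllm_benchmark.unified_docker.latest`` and ``data.vllm_benchmark.model_groups``. As a quick sanity check of that structure -- a minimal sketch, assuming mikefarah's yq v4 is installed and the file is saved locally -- the same keys can be queried from the shell:

.. code-block:: shell

   # Print the Docker pull tag the docs will render
   yq '.vllm_benchmark.unified_docker.latest.pull_tag' vllm_0.10.0_20250812-benchmark-models.yaml

   # List every MAD tag across all model groups
   yq '.vllm_benchmark.model_groups[].models[].mad_tag' vllm_0.10.0_20250812-benchmark-models.yaml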
@@ -1,17 +1,16 @@
-sglang_benchmark:
-  unified_docker:
-    latest:
-      pull_tag: lmsysorg/sglang:v0.4.5-rocm630
-      docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
-      rocm_version: 6.3.0
-      sglang_version: 0.4.5 (0.4.5-rocm)
-      pytorch_version: 2.6.0a0+git8d4926e
-  model_groups:
-    - group: DeepSeek
-      tag: deepseek
-      models:
-        - model: DeepSeek-R1-Distill-Qwen-32B
-          mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
-          model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-          url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-          precision: bfloat16
+dockers:
+  - pull_tag: lmsysorg/sglang:v0.4.5-rocm630
+    docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
+    components:
+      ROCm: 6.3.0
+      SGLang: 0.4.5 (0.4.5-rocm)
+      PyTorch: 2.6.0a0+git8d4926e
+model_groups:
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-R1-Distill-Qwen-32B
+        mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
+        model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+        precision: bfloat16
@@ -1,92 +1,188 @@
-vllm_benchmark:
-  unified_docker:
-    latest:
-      # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
-      rocm_version: 6.4.1
-      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
-      hipblaslt_version: 0.15
-  model_groups:
-    - group: Meta Llama
-      tag: llama
-      models:
-        - model: Llama 3.1 8B
-          mad_tag: pyt_vllm_llama-3.1-8b
-          model_repo: meta-llama/Llama-3.1-8B-Instruct
-          url: https://huggingface.co/meta-llama/Llama-3.1-8B
-          precision: float16
-        - model: Llama 3.1 70B
-          mad_tag: pyt_vllm_llama-3.1-70b
-          model_repo: meta-llama/Llama-3.1-70B-Instruct
-          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-          precision: float16
-        - model: Llama 3.1 405B
-          mad_tag: pyt_vllm_llama-3.1-405b
-          model_repo: meta-llama/Llama-3.1-405B-Instruct
-          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-          precision: float16
-        - model: Llama 2 70B
-          mad_tag: pyt_vllm_llama-2-70b
-          model_repo: meta-llama/Llama-2-70b-chat-hf
-          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-          precision: float16
-        - model: Llama 3.1 8B FP8
-          mad_tag: pyt_vllm_llama-3.1-8b_fp8
-          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-          precision: float8
-        - model: Llama 3.1 70B FP8
-          mad_tag: pyt_vllm_llama-3.1-70b_fp8
-          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
-          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
-          precision: float8
-        - model: Llama 3.1 405B FP8
-          mad_tag: pyt_vllm_llama-3.1-405b_fp8
-          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-          precision: float8
-    - group: Mistral AI
-      tag: mistral
-      models:
-        - model: Mixtral MoE 8x7B
-          mad_tag: pyt_vllm_mixtral-8x7b
-          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-          precision: float16
-        - model: Mixtral MoE 8x22B
-          mad_tag: pyt_vllm_mixtral-8x22b
-          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-          precision: float16
-        - model: Mixtral MoE 8x7B FP8
-          mad_tag: pyt_vllm_mixtral-8x7b_fp8
-          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-          precision: float8
-        - model: Mixtral MoE 8x22B FP8
-          mad_tag: pyt_vllm_mixtral-8x22b_fp8
-          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-          precision: float8
-    - group: Qwen
-      tag: qwen
-      models:
-        - model: QwQ-32B
-          mad_tag: pyt_vllm_qwq-32b
-          model_repo: Qwen/QwQ-32B
-          url: https://huggingface.co/Qwen/QwQ-32B
-          precision: float16
-        - model: Qwen3 30B A3B
-          mad_tag: pyt_vllm_qwen3-30b-a3b
-          model_repo: Qwen/Qwen3-30B-A3B
-          url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-          precision: float16
-    - group: Microsoft Phi
-      tag: phi
-      models:
-        - model: Phi-4
-          mad_tag: pyt_vllm_phi-4
-          model_repo: microsoft/phi-4
-          url: https://huggingface.co/microsoft/phi-4
+dockers:
+  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
+    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
+    components:
+      ROCm: 6.4.1
+      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
+      PyTorch: 2.7.0+gitf717b2a
+      hipBLASLt: 0.15
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B
+        precision: float16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 131072
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 131072
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 131072
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 4096
+          max_num_batched_tokens: 4096
+          max_model_len: 4096
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_seq_len_to_capture: 131072
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_seq_len_to_capture: 131072
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_seq_len_to_capture: 131072
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 32768
+          max_num_batched_tokens: 32768
+          max_model_len: 8192
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 65536
+          max_num_batched_tokens: 65536
+          max_model_len: 8192
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_seq_len_to_capture: 32768
+          max_num_batched_tokens: 32768
+          max_model_len: 8192
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_seq_len_to_capture: 65536
+          max_num_batched_tokens: 65536
+          max_model_len: 8192
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 131072
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Qwen3 30B A3B
+        mad_tag: pyt_vllm_qwen3-30b-a3b
+        model_repo: Qwen/Qwen3-30B-A3B
+        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+        precision: float16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 32768
+          max_num_batched_tokens: 32768
+          max_model_len: 8192
+  - group: Microsoft Phi
+    tag: phi
+    models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_seq_len_to_capture: 16384
+          max_num_batched_tokens: 16384
+          max_model_len: 8192
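
The per-model ``config`` block introduced here carries the vLLM engine settings (``tp``, ``dtype``, ``kv_cache_dtype``, and the maximum-length knobs) that the 0909 documentation templates splice into the benchmark commands. A minimal sketch for pulling one model's settings out of the new layout -- assuming mikefarah's yq v4 and the updated file saved locally as ``vllm-benchmark-models.yaml``:

.. code-block:: shell

   # Show the engine configuration the docs will render for Llama 3.1 8B
   yq '.model_groups[].models[] | select(.mad_tag == "pyt_vllm_llama-3.1-8b") | .config' vllm-benchmark-models.yaml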
@@ -0,0 +1,445 @@
:orphan:

.. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

**********************************
vLLM inference performance testing
**********************************

.. caution::

   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker-812:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
   accelerators and includes the following components:

   .. list-table::
      :header-rows: 1

      * - Software component
        - Version

      * - `ROCm <https://github.com/ROCm/ROCm>`__
        - {{ unified_docker.rocm_version }}

      * - `vLLM <https://docs.vllm.ai/en/latest>`__
        - {{ unified_docker.vllm_version }}

      * - `PyTorch <https://github.com/ROCm/pytorch>`__
        - {{ unified_docker.pytorch_version }}

      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
        - {{ unified_docker.hipblaslt_version }}

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements-812>` for
   MI300X series accelerators.

What's new
==========

The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.

* Upgraded to vLLM v0.10.

* FP8 KV cache support via AITER.

* Full graph capture support via AITER.

Supported models
================

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   .. _vllm-benchmark-available-models-812:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
   documentation might vary by model -- select one to get started.

   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            {% endfor %}
          </div>
        </div>

        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            {% set models = model_group.models %}
            {% for model in models %}
            {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
            {% endfor %}
            {% endfor %}
          </div>
        </div>
      </div>

   .. _vllm-benchmark-vllm-812:

   {% for model_group in model_groups %}
   {% for model in model_group.models %}

   .. container:: model-doc {{model.mad_tag}}

      .. note::

         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.

   {% endfor %}
   {% endfor %}

.. note::

   vLLM is a toolkit and library for LLM inference and serving. AMD implements
   high-performance custom kernels and modules in vLLM to enhance performance.
   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.

.. _vllm-benchmark-performance-measurements-812:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and serving measurements for inferencing popular AI models.

.. important::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the latest version of this inference benchmarking environment.
   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
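
As a quick first check before the full validation suite -- a minimal sketch, assuming the host has ROCm installed so that ``rocm-smi`` is on the path -- you can confirm that the accelerators are visible and idle:

.. code-block:: shell

   # List GPUs with temperature, power, and utilization at a glance
   rocm-smi

   # Confirm NUMA auto-balancing is disabled (0), per the validation guide
   cat /proc/sys/kernel/numa_balancing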

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   Pull the Docker image
   =====================

   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull {{ unified_docker.pull_tag }}

   Benchmarking
   ============

   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

   .. _vllm-benchmark-mad-812:

   {% for model_group in model_groups %}
   {% for model in model_group.models %}

   .. container:: model-doc {{model.mad_tag}}

      .. tab-set::

         .. tab-item:: MAD-integrated benchmarking

            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.

               .. code-block:: shell

                  git clone https://github.com/ROCm/MAD
                  cd MAD
                  pip install -r requirements.txt

            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

               .. code-block:: shell

                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
                  madengine run \
                     --tags {{model.mad_tag}} \
                     --keep-model-dir \
                     --live-output \
                     --timeout 28800

            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
            and ``{{ model.mad_tag }}_serving.csv``.

            Although the :ref:`available models
            <vllm-benchmark-available-models-812>` are preconfigured to collect
            offline throughput and online serving performance data, you can
            also change the benchmarking parameters. See the standalone
            benchmarking tab for more information.
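
            As a quick way to eyeball the collected results -- a minimal sketch, assuming the run finished and the CSV reports exist in the current directory -- you can pretty-print them from the shell:

            .. code-block:: shell

               # Render the throughput report as an aligned table
               column -s, -t < {{ model.mad_tag }}_throughput.csv

               # Peek at the first few rows of the serving report
               column -s, -t < {{ model.mad_tag }}_serving.csv | head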

            {% if model.tunableop %}

            .. note::

               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
               TunableOp automatically explores different implementations and configurations of certain PyTorch
               operators to find the fastest one for your hardware.

               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
               the ``--tunableop on`` argument in your run.

               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
               performance-collection run.
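
               For example -- the same ``madengine`` invocation as above, with only the TunableOp switch added:

               .. code-block:: shell

                  madengine run \
                     --tags {{model.mad_tag}} \
                     --keep-model-dir \
                     --live-output \
                     --timeout 28800 \
                     --tunableop on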

            {% endif %}

         .. tab-item:: Standalone benchmarking

            .. rubric:: Download the Docker image and required scripts

            1. Run the vLLM benchmark tool independently by starting the
               `Docker container <{{ unified_docker.docker_hub_url }}>`_
               as shown in the following snippet.

               .. code-block:: shell

                  docker pull {{ unified_docker.pull_tag }}
                  docker run -it \
                     --device=/dev/kfd \
                     --device=/dev/dri \
                     --group-add video \
                     --shm-size 16G \
                     --security-opt seccomp=unconfined \
                     --security-opt apparmor=unconfined \
                     --cap-add=SYS_PTRACE \
                     -v $(pwd):/workspace \
                     --env HUGGINGFACE_HUB_CACHE=/workspace \
                     --name test \
                     {{ unified_docker.pull_tag }}

            2. In the Docker container, clone the ROCm MAD repository and navigate to the
               benchmark scripts directory at ``~/MAD/scripts/vllm``.

               .. code-block:: shell

                  git clone https://github.com/ROCm/MAD
                  cd MAD/scripts/vllm

            3. To start the benchmark, use the following command with the appropriate options.

               .. code-block:: shell

                  ./run.sh \
                     --config $CONFIG_CSV \
                     --model_repo {{ model.model_repo }} \
                     <overrides>

               .. dropdown:: Benchmark options
                  :open:

                  .. list-table::
                     :header-rows: 1
                     :align: center

                     * - Name
                       - Options
                       - Description

                     * - ``--config``
                       - ``configs/default.csv``
                       - Run configs from the CSV for the chosen model repo and benchmark.

                     * -
                       - ``configs/extended.csv``
                       -

                     * -
                       - ``configs/performance.csv``
                       -

                     * - ``--benchmark``
                       - ``throughput``
                       - Measure offline end-to-end throughput.

                     * -
                       - ``serving``
                       - Measure online serving performance.

                     * -
                       - ``all``
                       - Measure both throughput and serving.

                     * - ``<overrides>``
                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
                       - Additional overrides to the config CSV.

               The input sequence length, output sequence length, and tensor parallel (TP) are
               already configured. You don't need to specify them with this script.
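
               For instance -- a sketch of a complete invocation, assuming you are inside the container in ``~/MAD/scripts/vllm`` -- the default config can be combined with the recommended attention setting (see the note below) and both benchmark modes:

               .. code-block:: shell

                  # Recommended for best performance (see the note below)
                  export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1

                  ./run.sh \
                     --config configs/default.csv \
                     --model_repo {{ model.model_repo }} \
                     --benchmark all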

               .. note::

                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.

                  If you encounter the following error, pass your access-authorized Hugging
                  Face token to the gated models.

                  .. code-block::

                     OSError: You are trying to access a gated repo.

                     # pass your HF_TOKEN
                     export HF_TOKEN=$your_personal_hf_token

            .. rubric:: Benchmarking examples

            Here are some examples of running the benchmark with various options:

            * Throughput benchmark

              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

              .. code-block:: shell

                 export MAD_MODEL_NAME={{ model.mad_tag }}
                 ./run.sh \
                    --config configs/default.csv \
                    --model_repo {{model.model_repo}} \
                    --benchmark throughput

              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.

            * Serving benchmark

              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

              .. code-block:: shell

                 export MAD_MODEL_NAME={{ model.mad_tag }}
                 ./run.sh \
                    --config configs/default.csv \
                    --model_repo {{model.model_repo}} \
                    --benchmark serving

              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.

      .. raw:: html

         <style>
           mjx-container[jax="CHTML"][display="true"] {
             text-align: left;
             margin: 0;
           }
         </style>

      .. note::

         Throughput is calculated as:

         - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time

         - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
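
         For a concrete sense of the arithmetic -- a minimal sketch with made-up sample numbers, not measured results -- the same two formulas can be evaluated from the shell:

         .. code-block:: shell

            # 1024 requests, 128 input tokens, 128 output tokens, 300 s elapsed (hypothetical values)
            awk 'BEGIN {
               requests = 1024; in_len = 128; out_len = 128; elapsed = 300
               printf "throughput_tot = %.1f tokens/s\n", requests * (in_len + out_len) / elapsed
               printf "throughput_gen = %.1f tokens/s\n", requests * out_len / elapsed
            }'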

   {% endfor %}
   {% endfor %}

Advanced usage
==============

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.

Reproducing the Docker image
----------------------------

To reproduce this ROCm/vLLM Docker image release, follow these steps:

1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.

   .. code-block:: shell

      git clone https://github.com/ROCm/vllm.git

2. Check out the specific release commit.

   .. code-block:: shell

      cd vllm
      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978

3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

   .. code-block:: shell

      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
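
   Once the build finishes, you can launch the freshly built image the same way as the published one -- a sketch, reusing the ``docker run`` flags shown earlier with the local ``vllm-rocm`` tag:

   .. code-block:: shell

      docker run -it \
         --device=/dev/kfd \
         --device=/dev/dri \
         --group-add video \
         --shm-size 16G \
         --security-opt seccomp=unconfined \
         --security-opt apparmor=unconfined \
         --cap-add=SYS_PTRACE \
         -v $(pwd):/workspace \
         --env HUGGINGFACE_HUB_CACHE=/workspace \
         vllm-rocm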

Further reading
===============

- To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

- To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

- To learn how to fine-tune LLMs and optimize inference, see
  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

- For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
@@ -16,7 +16,7 @@ vLLM inference performance testing
 
 .. _vllm-benchmark-unified-docker-715:
 
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
 
    {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
    {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -69,7 +69,7 @@ The following is summary of notable changes since the :doc:`previous ROCm/vLLM D
 Supported models
 ================
 
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
 
    {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
    {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -162,7 +162,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.
 
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
 
    {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
    {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -7,7 +7,7 @@ vLLM inference performance testing version history
 This table lists previous versions of the ROCm vLLM inference Docker image for
 inference performance testing. For detailed information about available models
 for benchmarking, see the version-specific documentation. You can find tagged
-previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
+previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__.
 
 .. list-table::
    :header-rows: 1
@@ -31,26 +31,30 @@ PyTorch inference performance testing
 .. raw:: html
 
    <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-     <div class="row">
-       <div class="col-2 me-2 model-param-head">Model</div>
-       <div class="row col-10">
+     <div class="row gx-0">
+       <div class="col-2 me-1 px-2 model-param-head">Model</div>
+       <div class="row col-10 pe-0">
         {% for model_group in model_groups %}
-        <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+        <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
         {% endfor %}
       </div>
     </div>
 
-     <div class="row mt-1" style="display: none;">
-       <div class="col-2 me-2 model-param-head">Model</div>
-       <div class="row col-10">
+     <div class="row gx-0 pt-1" style="display: none;">
+       <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+       <div class="row col-10 pe-0">
        {% for model_group in model_groups %}
        {% set models = model_group.models %}
        {% for model in models %}
-        <div class="col-12 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+        {% if models|length % 3 == 0 %}
+        <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+        {% else %}
+        <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+        {% endif %}
        {% endfor %}
        {% endfor %}
       </div>
     </div>
   </div>
 
 {% for model_group in model_groups %}
@@ -2,19 +2,19 @@
    :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
    :keywords: model, MAD, automation, dashboarding, validate
 
-************************************
-SGLang inference performance testing
-************************************
+*****************************************************************
+SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
+*****************************************************************
 
 .. _sglang-benchmark-unified-docker:
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
 
-   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
+   {% set docker = data.dockers[0] %}
 
    `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
    serving engine for large language models (LLMs) and vision models. The
-   ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
+   ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
    bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
    accelerators. It includes the following software components:
@@ -24,14 +24,10 @@ SGLang inference performance testing
    * - Software component
      - Version
 
-   * - `ROCm <https://github.com/ROCm/ROCm>`__
-     - {{ unified_docker.rocm_version }}
-
-   * - `SGLang <https://docs.sglang.ai/index.html>`__
-     - {{ unified_docker.sglang_version }}
-
-   * - `PyTorch <https://github.com/pytorch/pytorch>`__
-     - {{ unified_docker.pytorch_version }}
+   {% for component_name, component_version in docker.components.items() %}
+   * - {{ component_name }}
+     - {{ component_version }}
+   {% endfor %}
 
 System validation
 =================
@@ -50,8 +46,8 @@ system's configuration.
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
 
-   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
-   {% set model_groups = data.sglang_benchmark.model_groups %}
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
 
 Pull the Docker image
 =====================
@@ -7,14 +7,13 @@
|
|||||||
vLLM inference performance testing
|
vLLM inference performance testing
|
||||||
**********************************
|
**********************************
|
||||||
|
|
||||||
.. _vllm-benchmark-unified-docker-812:
|
.. _vllm-benchmark-unified-docker-909:
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set docker = data.dockers[0] %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
|
||||||
|
|
||||||
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
|
||||||
a prebuilt, optimized environment for validating large language model (LLM)
|
a prebuilt, optimized environment for validating large language model (LLM)
|
||||||
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||||
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||||
@@ -26,20 +25,13 @@ vLLM inference performance testing
|
|||||||
* - Software component
|
* - Software component
|
||||||
- Version
|
- Version
|
||||||
|
|
||||||
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
- {{ unified_docker.rocm_version }}
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
{% endfor %}
|
||||||
- {{ unified_docker.vllm_version }}
|
|
||||||
|
|
||||||
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
|
||||||
- {{ unified_docker.pytorch_version }}
|
|
||||||
|
|
||||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
|
||||||
- {{ unified_docker.hipblaslt_version }}
|
|
||||||
|
|
||||||
With this Docker image, you can quickly test the :ref:`expected
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
|
inference performance numbers <vllm-benchmark-performance-measurements-909>` for
|
||||||
MI300X series accelerators.
|
MI300X series accelerators.
|
||||||
|
|
||||||
What's new
|
What's new
|
||||||
@@ -47,21 +39,23 @@ What's new
|
|||||||
|
|
||||||
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||||
|
|
||||||
* Upgraded to vLLM v0.10.
|
* Upgraded to vLLM v0.10.1.
|
||||||
|
|
||||||
* FP8 KV cache support via AITER.
|
* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.
|
||||||
|
|
||||||
* Full graph capture support via AITER.
|
* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-supported-models-909:
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set docker = data.dockers[0] %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
.. _vllm-benchmark-available-models-812:
|
.. _vllm-benchmark-available-models-909:
|
||||||
|
|
||||||
The following models are supported for inference performance benchmarking
|
The following models are supported for inference performance benchmarking
|
||||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
@@ -70,55 +64,51 @@ Supported models
|
|||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
<div class="row">
|
<div class="row gx-0">
|
||||||
<div class="col-2 me-2 model-param-head">Model group</div>
|
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10 pe-0">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="row mt-1">
|
|
||||||
<div class="col-2 me-2 model-param-head">Model</div>
|
|
||||||
<div class="row col-10">
|
|
||||||
{% for model_group in model_groups %}
|
|
||||||
{% set models = model_group.models %}
|
|
||||||
{% for model in models %}
|
|
||||||
{% if models|length % 3 == 0 %}
|
|
||||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% else %}
|
|
||||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row gx-0 pt-1">
|
||||||
|
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||||
|
<div class="row col-10 pe-0">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
|
|
||||||
.. _vllm-benchmark-vllm-812:
|
.. _vllm-benchmark-vllm-909:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
.. container:: model-doc {{model.mad_tag}}
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||||
|
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
|
||||||
|
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
.. note::
|
.. _vllm-benchmark-performance-measurements-909:
|
||||||
|
|
||||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
|
||||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
|
||||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
|
||||||
more information.
|
|
||||||
|
|
||||||
.. _vllm-benchmark-performance-measurements-812:
|
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
@@ -151,18 +141,18 @@ system's configuration.
|
|||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set docker = data.dockers[0] %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
Pull the Docker image
|
Pull the Docker image
|
||||||
=====================
|
=====================
|
||||||
|
|
||||||
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
|
||||||
Use the following command to pull the Docker image from Docker Hub.
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
docker pull {{ unified_docker.pull_tag }}
|
docker pull {{ docker.pull_tag }}
|
||||||
|
|
||||||
Benchmarking
|
Benchmarking
|
||||||
============
|
============
|
||||||
@@ -170,7 +160,7 @@ system's configuration.
|
|||||||
Once the setup is complete, choose between two options to reproduce the
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
benchmark results:
|
benchmark results:
|
||||||
|
|
||||||
.. _vllm-benchmark-mad-812:
|
.. _vllm-benchmark-mad-909:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -181,6 +171,9 @@ system's configuration.
|
|||||||
|
|
||||||
.. tab-item:: MAD-integrated benchmarking
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
The following run command is tailored to {{ model.model }}.
|
||||||
|
See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
|
||||||
|
|
||||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
directory and install the required packages on the host machine.
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
@@ -208,7 +201,7 @@ system's configuration.
 and ``{{ model.mad_tag }}_serving.csv``.

 Although the :ref:`available models
-<vllm-benchmark-available-models-812>` are preconfigured to collect
+<vllm-benchmark-available-models-909>` are preconfigured to collect
 offline throughput and online serving performance data, you can
 also change the benchmarking parameters. See the standalone
 benchmarking tab for more information.
@@ -232,132 +225,143 @@ system's configuration.

 .. tab-item:: Standalone benchmarking

-.. rubric:: Download the Docker image and required scripts
+The following commands are optimized for {{ model.model }}.
+See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
+
-1. Run the vLLM benchmark tool independently by starting the
-`Docker container <{{ unified_docker.docker_hub_url }}>`_
-as shown in the following snippet.
+.. seealso::
+For more information on configuration, see the `config files
+<https://github.com/ROCm/MAD-private/tree/develop/scripts/vllm/configs>`__
+in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
+for descriptions of available configuration options
+and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
+additional benchmarking information.
+
+.. rubric:: Launch the container
+
+You can run the vLLM benchmark tool independently by starting the
+`Docker container <{{ docker.docker_hub_url }}>`_ as shown
+in the following snippet.
+
+.. code-block:: shell
+
+docker pull {{ docker.pull_tag }}
+docker run -it \
+--device=/dev/kfd \
+--device=/dev/dri \
+--group-add video \
+--shm-size 16G \
+--security-opt seccomp=unconfined \
+--security-opt apparmor=unconfined \
+--cap-add=SYS_PTRACE \
+-v $(pwd):/workspace \
+--env HUGGINGFACE_HUB_CACHE=/workspace \
+--name test \
+{{ docker.pull_tag }}
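With the release tag substituted for `{{ docker.pull_tag }}`, the launch sequence above renders as follows; the workspace mount and container name are taken directly from the snippet:

.. code-block:: shell

   docker pull rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
   docker run -it \
       --device=/dev/kfd \
       --device=/dev/dri \
       --group-add video \
       --shm-size 16G \
       --security-opt seccomp=unconfined \
       --security-opt apparmor=unconfined \
       --cap-add=SYS_PTRACE \
       -v $(pwd):/workspace \
       --env HUGGINGFACE_HUB_CACHE=/workspace \
       --name test \
       rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812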
+
+.. rubric:: Throughput command
+
+Use the following command to start the throughput benchmark.
+
+.. code-block:: shell
+
+model={{ model.model_repo }}
+tp={{ model.config.tp }}
+num_prompts=1024
+in=128
+out=128
+dtype={{ model.config.dtype }}
+kv_cache_dtype={{ model.config.kv_cache_dtype }}
+max_num_seqs=1024
+max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
+max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+max_model_len={{ model.config.max_model_len }}
+
+vllm bench throughput --model $model \
+-tp $tp \
+--num-prompts $num_prompts \
+--input-len $in \
+--output-len $out \
+--dtype $dtype \
+--kv-cache-dtype $kv_cache_dtype \
+--max-num-seqs $max_num_seqs \
+--max-seq-len-to-capture $max_seq_len_to_capture \
+--max-num-batched-tokens $max_num_batched_tokens \
+--max-model-len $max_model_len \
+--trust-remote-code \
+--output-json ${model}_throughput.json \
+--gpu-memory-utilization 0.9
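Rendered for Llama 3.1 8B as an illustration. The `model.config.*` values come from the MAD config CSVs and are not visible in this diff, so the tensor-parallel size, KV-cache dtype, and length limit below are placeholders rather than the shipped configuration:

.. code-block:: shell

   model=meta-llama/Llama-3.1-8B-Instruct
   tp=1                 # placeholder; the real value comes from the config CSV
   dtype=float16        # matches the model's listed precision
   kv_cache_dtype=auto  # placeholder
   max_model_len=8192   # placeholder

   vllm bench throughput --model $model \
       -tp $tp \
       --num-prompts 1024 \
       --input-len 128 \
       --output-len 128 \
       --dtype $dtype \
       --kv-cache-dtype $kv_cache_dtype \
       --max-model-len $max_model_len \
       --output-json ${model}_throughput.json \
       --gpu-memory-utilization 0.9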
+
+.. rubric:: Serving command
+
+1. Start the server using the following command:

 .. code-block:: shell

-docker pull {{ unified_docker.pull_tag }}
-docker run -it \
---device=/dev/kfd \
---device=/dev/dri \
---group-add video \
---shm-size 16G \
---security-opt seccomp=unconfined \
---security-opt apparmor=unconfined \
---cap-add=SYS_PTRACE \
--v $(pwd):/workspace \
---env HUGGINGFACE_HUB_CACHE=/workspace \
---name test \
-{{ unified_docker.pull_tag }}
+model={{ model.model_repo }}
+tp={{ model.config.tp }}
+dtype={{ model.config.dtype }}
+kv_cache_dtype={{ model.config.kv_cache_dtype }}
+max_num_seqs=256
+max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
+max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+max_model_len={{ model.config.max_model_len }}

-2. In the Docker container, clone the ROCm MAD repository and navigate to the
-benchmark scripts directory at ``~/MAD/scripts/vllm``.
+vllm serve $model \
+-tp $tp \
+--dtype $dtype \
+--kv-cache-dtype $kv_cache_dtype \
+--max-num-seqs $max_num_seqs \
+--max-seq-len-to-capture $max_seq_len_to_capture \
+--max-num-batched-tokens $max_num_batched_tokens \
+--max-model-len $max_model_len \
+--no-enable-prefix-caching \
+--swap-space 16 \
+--disable-log-requests \
+--trust-remote-code \
+--gpu-memory-utilization 0.9
+
+Wait until the model has loaded and the server is ready to accept requests.
+
+2. On another terminal on the same machine, run the benchmark:

 .. code-block:: shell

-git clone https://github.com/ROCm/MAD
-cd MAD/scripts/vllm
+# Connect to the container
+docker exec -it test bash

-3. To start the benchmark, use the following command with the appropriate options.
+# Wait for the server to start
+until curl -s http://localhost:8000/v1/models; do sleep 30; done
+
+# Run the benchmark
+model={{ model.model_repo }}
+max_concurrency=1
+num_prompts=10
+in=128
+out=128
+vllm bench serve --model $model \
+--percentile-metrics "ttft,tpot,itl,e2el" \
+--dataset-name random \
+--ignore-eos \
+--max-concurrency $max_concurrency \
+--num-prompts $num_prompts \
+--random-input-len $in \
+--random-output-len $out \
+--trust-remote-code \
+--save-result \
+--result-filename ${model}_serving.json
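Before starting a long benchmark run, a single completion request is a cheap smoke test; the vLLM server speaks the OpenAI-compatible API, and the model name below assumes the Llama 3.1 8B example:

.. code-block:: shell

   curl http://localhost:8000/v1/completions \
       -H "Content-Type: application/json" \
       -d '{"model": "meta-llama/Llama-3.1-8B-Instruct", "prompt": "Hello", "max_tokens": 8}'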
+
+.. note::
+
+If you encounter the following error, pass your access-authorized Hugging
+Face token to the gated models.
+
 .. code-block::

-./run.sh \
---config $CONFIG_CSV \
---model_repo {{ model.model_repo }} \
-<overrides>
+OSError: You are trying to access a gated repo.

-.. dropdown:: Benchmark options
-:open:
+# pass your HF_TOKEN
+export HF_TOKEN=$your_personal_hf_token
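If the server runs inside the container started earlier, the token must be set in the container's environment as well; one way is to forward it through `docker run` (a sketch):

.. code-block:: shell

   export HF_TOKEN=$your_personal_hf_token
   # -e HF_TOKEN forwards the host variable into the container
   docker run -it -e HF_TOKEN --name test rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812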
 
-.. list-table::
-:header-rows: 1
-:align: center
-
-* - Name
-- Options
-- Description
-
-* - ``--config``
-- ``configs/default.csv``
-- Run configs from the CSV for the chosen model repo and benchmark.
-
-* -
-- ``configs/extended.csv``
--
-
-* -
-- ``configs/performance.csv``
--
-
-* - ``--benchmark``
-- ``throughput``
-- Measure offline end-to-end throughput.
-
-* -
-- ``serving``
-- Measure online serving performance.
-
-* -
-- ``all``
-- Measure both throughput and serving.
-
-* - `<overrides>`
-- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
-- Additional overrides to the config CSV.
-
-The input sequence length, output sequence length, and tensor parallel (TP) are
-already configured. You don't need to specify them with this script.
-
-.. note::
-
-For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
-
-If you encounter the following error, pass your access-authorized Hugging
-Face token to the gated models.
-
-.. code-block::
-
-OSError: You are trying to access a gated repo.
-
-# pass your HF_TOKEN
-export HF_TOKEN=$your_personal_hf_token
-
-.. rubric:: Benchmarking examples
-
-Here are some examples of running the benchmark with various options:
-
-* Throughput benchmark
-
-Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-.. code-block:: shell
-
-export MAD_MODEL_NAME={{ model.mad_tag }}
-./run.sh \
---config configs/default.csv \
---model_repo {{model.model_repo}} \
---benchmark throughput
-
-Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
-
-* Serving benchmark
-
-Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-.. code-block::
-
-export MAD_MODEL_NAME={{ model.mad_tag }}
-./run.sh \
---config configs/default.csv \
---model_repo {{model.model_repo}} \
---benchmark serving
-
-Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.

 .. raw:: html

@@ -382,7 +386,7 @@ Advanced usage
 ==============

 For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.

 Reproducing the Docker image
 ----------------------------
@@ -400,7 +404,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
 .. code-block:: shell

 cd vllm
-git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
+git checkout 6663000a391911eba96d7864a26ac42b07f6ef29

 3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

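The build command itself sits outside this hunk. A typical invocation against the checked-out tree looks like the following; the Dockerfile path is an assumption and may differ by branch:

.. code-block:: shell

   # Assumed Dockerfile location -- verify against the checked-out branch
   docker build -f docker/Dockerfile.rocm -t vllm-rocm .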
@@ -419,15 +423,12 @@ Further reading
 - To learn more about system settings and management practices to configure your system for
 AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

+- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+a brief introduction to vLLM and optimization strategies.
+
 - For application performance optimization strategies for HPC and AI workloads,
 including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

-- To learn how to run community models from Hugging Face on AMD GPUs, see
-:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
-
-- To learn how to fine-tune LLMs and optimize inference, see
-:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
-
 - For a list of other ready-made Docker images for AI with ROCm, see
 `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

@@ -82,32 +82,32 @@ started.
 {% set model_groups = data.model_groups %}
 .. raw:: html

 <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-<div class="row">
-<div class="col-2 me-2 model-param-head">Model</div>
-<div class="row col-10">
+<div class="row gx-0">
+<div class="col-2 me-1 px-2 model-param-head">Model</div>
+<div class="row col-10 pe-0">
 {% for model_group in model_groups %}
-<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
 {% endfor %}
 </div>
 </div>

-<div class="row mt-1">
-<div class="col-2 me-2 model-param-head">Model variant</div>
-<div class="row col-10">
+<div class="row gx-0 pt-1">
+<div class="col-2 me-1 px-2 model-param-head">Variant</div>
+<div class="row col-10 pe-0">
 {% for model_group in model_groups %}
 {% set models = model_group.models %}
 {% for model in models %}
 {% if models|length % 3 == 0 %}
-<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
 {% else %}
-<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
 {% endif %}
 {% endfor %}
 {% endfor %}
 </div>
-</div>
 </div>
+</div>

 .. note::

@@ -208,7 +208,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.

 e. RDMA interface

-Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
+Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
 Then, set the RDMA interfaces to use for communication.

 .. code-block:: bash
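The body of that bash block falls outside the hunk. For NCCL-based multi-node runs, interface selection is normally done through NCCL environment variables; the device and interface names below are placeholders for whatever `ibv_devices` and `ip link` report on your nodes:

.. code-block:: bash

   # Placeholders -- substitute the devices reported by ibv_devices / ip link
   export NCCL_IB_HCA=mlx5_0,mlx5_1
   export NCCL_SOCKET_IFNAME=ens51f0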
@@ -70,32 +70,32 @@ workloads:
 {% set model_groups = data.model_groups %}
 .. raw:: html

 <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-<div class="row">
-<div class="col-2 me-2 model-param-head">Model</div>
-<div class="row col-10">
+<div class="row gx-0">
+<div class="col-2 me-1 px-2 model-param-head">Model</div>
+<div class="row col-10 pe-0">
 {% for model_group in model_groups %}
-<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
 {% endfor %}
 </div>
 </div>

-<div class="row mt-1">
-<div class="col-2 me-2 model-param-head">Model variant</div>
-<div class="row col-10">
+<div class="row gx-0 pt-1">
+<div class="col-2 me-1 px-2 model-param-head">Variant</div>
+<div class="row col-10 pe-0">
 {% for model_group in model_groups %}
 {% set models = model_group.models %}
 {% for model in models %}
 {% if models|length % 3 == 0 %}
-<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
 {% else %}
-<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
 {% endif %}
 {% endfor %}
 {% endfor %}
 </div>
-</div>
 </div>
+</div>

 .. note::

@@ -112,7 +112,7 @@ Multi-node setup

 For multi-node environments, ensure you have all the necessary packages for
 your network device, such as, RDMA. If you're not using a multi-node setup
-with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
+with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.

 1. Install the following packages to build and install the RDMA driver.

@@ -177,7 +177,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.

 e. RDMA interface

-Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
+Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
 Then, set the RDMA interfaces to use for communication.

 .. code-block:: bash
@@ -199,7 +199,7 @@ Pull the Docker image
 docker pull rocm/jax-training:maxtext-v25.5

 2. Use the following command to launch the Docker container. Note that the benchmarking scripts
-used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
+used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
 and execute the benchmark.

 .. code-block:: shell
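The launch command is truncated by this hunk. A representative launch, mirroring the device and security flags used for the vLLM image earlier in this commit, might look like:

.. code-block:: shell

   docker run -it \
       --device=/dev/kfd \
       --device=/dev/dri \
       --group-add video \
       --shm-size 16G \
       --security-opt seccomp=unconfined \
       rocm/jax-training:maxtext-v25.5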
@@ -55,32 +55,32 @@ vary by model -- select one to get started.
 {% set model_groups = data.model_groups %}
 .. raw:: html

 <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-<div class="row">
-<div class="col-2 me-2 model-param-head">Model</div>
-<div class="row col-10">
+<div class="row gx-0">
+<div class="col-2 me-1 px-2 model-param-head">Model</div>
+<div class="row col-10 pe-0">
 {% for model_group in model_groups %}
-<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
 {% endfor %}
 </div>
 </div>

-<div class="row mt-1">
-<div class="col-2 me-2 model-param-head">Model variant</div>
-<div class="row col-10">
+<div class="row gx-0 pt-1">
+<div class="col-2 me-1 px-2 model-param-head">Variant</div>
+<div class="row col-10 pe-0">
 {% for model_group in model_groups %}
 {% set models = model_group.models %}
 {% for model in models %}
 {% if models|length % 3 == 0 %}
-<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
 {% else %}
-<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
 {% endif %}
 {% endfor %}
 {% endfor %}
 </div>
-</div>
 </div>
+</div>

 .. note::

@@ -45,30 +45,30 @@ vary by model -- select one to get started.
 .. raw:: html

 <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-<div class="row">
-<div class="col-2 me-2 model-param-head">Model group</div>
-<div class="row col-10">
+<div class="row gx-0">
+<div class="col-2 me-1 px-2 model-param-head">Model</div>
+<div class="row col-10 pe-0">
 {% for model_group in model_groups %}
-<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-{% endfor %}
-</div>
-</div>

-<div class="row mt-1">
-<div class="col-2 me-2 model-param-head">Model variant</div>
-<div class="row col-10">
-{% for model_group in model_groups %}
-{% set models = model_group.models %}
-{% for model in models %}
-{% if models|length % 3 == 0 %}
-<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-{% else %}
-<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-{% endif %}
 {% endfor %}
-{% endfor %}
+</div>
 </div>
-</div>
+<div class="row gx-0 pt-1">
+<div class="col-2 me-1 px-2 model-param-head">Variant</div>
+<div class="row col-10 pe-0">
+{% for model_group in model_groups %}
+{% set models = model_group.models %}
+{% for model in models %}
+{% if models|length % 3 == 0 %}
+<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+{% else %}
+<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+{% endif %}
+{% endfor %}
+{% endfor %}
+</div>
+</div>
 </div>


@@ -7,15 +7,14 @@ html {
 --compat-head-color: var(--pst-color-surface);
 --compat-param-hover-color: var(--pst-color-link-hover);
 --compat-param-selected-color: var(--pst-color-primary);
+--compat-border-color: var(--pst-color-border);
 }

 html[data-theme="light"] {
---compat-border-color: var(--pst-gray-500);
 --compat-param-disabled-color: var(--pst-gray-300);
 }

 html[data-theme="dark"] {
---compat-border-color: var(--pst-gray-600);
 --compat-param-disabled-color: var(--pst-gray-600);
 }

@@ -23,6 +22,7 @@ div#vllm-benchmark-ud-params-picker.container-fluid {
 padding: 0 0 1rem 0;
 }

+div[data-param-k="model-group"],
 div[data-param-k="model"] {
 background-color: var(--compat-bg-color);
 padding: 2px;
@@ -31,40 +31,19 @@ div[data-param-k="model"] {
 cursor: pointer;
 }

+div[data-param-k="model-group"][data-param-state="selected"],
 div[data-param-k="model"][data-param-state="selected"] {
 background-color: var(--compat-param-selected-color);
 color: var(--compat-fg-color);
 }

-div[data-param-k="model"][data-param-state="latest-version"] {
-background-color: var(--compat-param-selected-color);
+div[data-param-k="model-group"]:hover,
+div[data-param-k="model"]:hover {
-color: var(--compat-fg-color);
-}
-
-div[data-param-k="model"][data-param-state="disabled"] {
-background-color: var(--compat-param-disabled-color);
-text-decoration: line-through;
-/* text-decoration-color: var(--pst-color-danger); */
-cursor: auto;
-}
-
-div[data-param-k="model"]:not([data-param-state]):hover {
 background-color: var(--compat-param-hover-color);
-}
-
-div[data-param-k="model-group"] {
-background-color: var(--compat-bg-color);
-padding: 2px;
-border: solid 1px var(--compat-border-color);
-font-weight: 500;
-cursor: pointer;
-}
-
-div[data-param-k="model-group"][data-param-state="selected"] {
-background-color: var(--compat-param-selected-color);
 color: var(--compat-fg-color);
 }

+/*
 div[data-param-k="model-group"][data-param-state="latest-version"] {
 background-color: var(--compat-param-selected-color);
 color: var(--compat-fg-color);
@@ -73,26 +52,19 @@ div[data-param-k="model-group"][data-param-state="latest-version"] {
 div[data-param-k="model-group"][data-param-state="disabled"] {
 background-color: var(--compat-param-disabled-color);
 text-decoration: line-through;
-/* text-decoration-color: var(--pst-color-danger); */
+text-decoration-color: var(--pst-color-danger);
 cursor: auto;
 }
+*/
-div[data-param-k="model-group"]:not([data-param-state]):hover {
-background-color: var(--compat-param-hover-color);
-}

 .model-param-head {
 background-color: var(--compat-head-color);
 padding: 0.15rem 0.15rem 0.15rem 0.67rem;
-/* margin: 2px; */
-border-right: solid 2px var(--compat-accent-color);
+border-right: solid 4px var(--compat-accent-color);
 font-weight: 600;
 }

 .model-param {
-/* padding: 2px; */
-/* margin: 0 2px 0 2px; */
-/* margin: 2px; */
 border: solid 1px var(--compat-border-color);
 font-weight: 500;
 }