mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-09 06:38:00 -05:00
Update vLLM inference benchmark doc for 0909 release (and Sphinx fixes) (#5289)
This commit is contained in:
@@ -673,6 +673,7 @@ github
|
||||
globals
|
||||
gnupg
|
||||
grayscale
|
||||
gx
|
||||
gzip
|
||||
heterogenous
|
||||
hipBLAS
|
||||
@@ -783,6 +784,7 @@ parallelizing
|
||||
param
|
||||
parameterization
|
||||
passthrough
|
||||
pe
|
||||
perfcounter
|
||||
performant
|
||||
perl
|
||||
@@ -812,6 +814,7 @@ profiler
|
||||
profilers
|
||||
protobuf
|
||||
pseudorandom
|
||||
px
|
||||
py
|
||||
pytorch
|
||||
recommender
|
||||
|
||||
@@ -0,0 +1,91 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
|
||||
rocm_version: 6.4.1
|
||||
vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
|
||||
pytorch_version: 2.7.0+gitf717b2a
|
||||
hipblaslt_version: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
@@ -1,17 +1,16 @@
|
||||
sglang_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: lmsysorg/sglang:v0.4.5-rocm630
|
||||
docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
|
||||
rocm_version: 6.3.0
|
||||
sglang_version: 0.4.5 (0.4.5-rocm)
|
||||
pytorch_version: 2.6.0a0+git8d4926e
|
||||
model_groups:
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-R1-Distill-Qwen-32B
|
||||
mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
|
||||
model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
|
||||
precision: bfloat16
|
||||
dockers:
|
||||
- pull_tag: lmsysorg/sglang:v0.4.5-rocm630
|
||||
docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
|
||||
components:
|
||||
ROCm: 6.3.0
|
||||
SGLang: 0.4.5 (0.4.5-rocm)
|
||||
PyTorch: 2.6.0a0+git8d4926e
|
||||
model_groups:
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-R1-Distill-Qwen-32B
|
||||
mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
|
||||
model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
|
||||
precision: bfloat16
|
||||
|
||||
@@ -1,92 +1,188 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
# TODO: update me
|
||||
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
|
||||
rocm_version: 6.4.1
|
||||
vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
|
||||
pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
|
||||
hipblaslt_version: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
|
||||
PyTorch: 2.7.0+gitf717b2a
|
||||
hipBLASLt: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 4096
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 65536
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 65536
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 16384
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
|
||||
@@ -0,0 +1,445 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
|
||||
ROCm vLLM Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker-812:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||
a prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||
accelerators and includes the following components:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||
- {{ unified_docker.rocm_version }}
|
||||
|
||||
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
||||
- {{ unified_docker.vllm_version }}
|
||||
|
||||
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
||||
- {{ unified_docker.pytorch_version }}
|
||||
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- {{ unified_docker.hipblaslt_version }}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
|
||||
MI300X series accelerators.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||
|
||||
* Upgraded to vLLM v0.10.
|
||||
|
||||
* FP8 KV cache support via AITER.
|
||||
|
||||
* Full graph capture support via AITER.
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-812:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
documentation might vary by model -- select one to get started.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-812:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
more information.
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-812:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-812:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-812>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
|
||||
{% if model.tunableop %}
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||
operators to find the fastest one for your hardware.
|
||||
|
||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||
the ``--tunableop on`` argument in your run.
|
||||
|
||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||
performance-collection run.
|
||||
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
.. rubric:: Download the Docker image and required scripts
|
||||
|
||||
1. Run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||
as shown in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ unified_docker.pull_tag }}
|
||||
|
||||
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
|
||||
3. To start the benchmark, use the following command with the appropriate options.
|
||||
|
||||
.. code-block::
|
||||
|
||||
./run.sh \
|
||||
--config $CONFIG_CSV \
|
||||
--model_repo {{ model.model_repo }} \
|
||||
<overrides>
|
||||
|
||||
.. dropdown:: Benchmark options
|
||||
:open:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:align: center
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - ``--config``
|
||||
- ``configs/default.csv``
|
||||
- Run configs from the CSV for the chosen model repo and benchmark.
|
||||
|
||||
* -
|
||||
- ``configs/extended.csv``
|
||||
-
|
||||
|
||||
* -
|
||||
- ``configs/performance.csv``
|
||||
-
|
||||
|
||||
* - ``--benchmark``
|
||||
- ``throughput``
|
||||
- Measure offline end-to-end throughput.
|
||||
|
||||
* -
|
||||
- ``serving``
|
||||
- Measure online serving performance.
|
||||
|
||||
* -
|
||||
- ``all``
|
||||
- Measure both throughput and serving.
|
||||
|
||||
* - `<overrides>`
|
||||
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
|
||||
- Additional overrides to the config CSV.
|
||||
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
|
||||
.. note::
|
||||
|
||||
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block::
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
Here are some examples of running the benchmark with various options:
|
||||
|
||||
* Throughput benchmark
|
||||
|
||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||
./run.sh \
|
||||
--config configs/default.csv \
|
||||
--model_repo {{model.model_repo}} \
|
||||
--benchmark throughput
|
||||
|
||||
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
|
||||
|
||||
* Serving benchmark
|
||||
|
||||
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||
|
||||
.. code-block::
|
||||
|
||||
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||
./run.sh \
|
||||
--config configs/default.csv \
|
||||
--model_repo {{model.model_repo}} \
|
||||
--benchmark serving
|
||||
|
||||
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Advanced usage
|
||||
==============
|
||||
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
|
||||
To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||
|
||||
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/vllm.git
|
||||
|
||||
2. Checkout the specific release commit.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd vllm
|
||||
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
|
||||
|
||||
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||
|
||||
- To learn how to fine-tune LLMs and optimize inference, see
|
||||
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||
@@ -16,7 +16,7 @@ vLLM inference performance testing
|
||||
|
||||
.. _vllm-benchmark-unified-docker-715:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
@@ -69,7 +69,7 @@ The following is summary of notable changes since the :doc:`previous ROCm/vLLM D
|
||||
Supported models
|
||||
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
@@ -162,7 +162,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
|
||||
@@ -7,7 +7,7 @@ vLLM inference performance testing version history
|
||||
This table lists previous versions of the ROCm vLLM inference Docker image for
|
||||
inference performance testing. For detailed information about available models
|
||||
for benchmarking, see the version-specific documentation. You can find tagged
|
||||
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
|
||||
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
@@ -31,26 +31,30 @@ PyTorch inference performance testing
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1" style="display: none;">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
<div class="col-12 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1" style="display: none;">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
|
||||
@@ -2,19 +2,19 @@
|
||||
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
************************************
|
||||
SGLang inference performance testing
|
||||
************************************
|
||||
*****************************************************************
|
||||
SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
|
||||
*****************************************************************
|
||||
|
||||
.. _sglang-benchmark-unified-docker:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
`SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
|
||||
serving engine for large language models (LLMs) and vision models. The
|
||||
ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
|
||||
ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
|
||||
bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
|
||||
accelerators. It includes the following software components:
|
||||
|
||||
@@ -24,14 +24,10 @@ SGLang inference performance testing
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||
- {{ unified_docker.rocm_version }}
|
||||
|
||||
* - `SGLang <https://docs.sglang.ai/index.html>`__
|
||||
- {{ unified_docker.sglang_version }}
|
||||
|
||||
* - `PyTorch <https://github.com/pytorch/pytorch>`__
|
||||
- {{ unified_docker.pytorch_version }}
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
System validation
|
||||
=================
|
||||
@@ -50,8 +46,8 @@ system's configuration.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.sglang_benchmark.model_groups %}
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
@@ -7,14 +7,13 @@
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. _vllm-benchmark-unified-docker-812:
|
||||
.. _vllm-benchmark-unified-docker-909:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
|
||||
a prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||
@@ -26,20 +25,13 @@ vLLM inference performance testing
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||
- {{ unified_docker.rocm_version }}
|
||||
|
||||
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
||||
- {{ unified_docker.vllm_version }}
|
||||
|
||||
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
||||
- {{ unified_docker.pytorch_version }}
|
||||
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- {{ unified_docker.hipblaslt_version }}
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-909>` for
|
||||
MI300X series accelerators.
|
||||
|
||||
What's new
|
||||
@@ -47,21 +39,23 @@ What's new
|
||||
|
||||
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||
|
||||
* Upgraded to vLLM v0.10.
|
||||
* Upgraded to vLLM v0.10.1.
|
||||
|
||||
* FP8 KV cache support via AITER.
|
||||
* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.
|
||||
|
||||
* Full graph capture support via AITER.
|
||||
* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.
|
||||
|
||||
.. _vllm-benchmark-supported-models-909:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-812:
|
||||
.. _vllm-benchmark-available-models-909:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
@@ -70,55 +64,51 @@ Supported models
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-812:
|
||||
.. _vllm-benchmark-vllm-909:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
more information.
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-812:
|
||||
.. _vllm-benchmark-performance-measurements-909:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -151,18 +141,18 @@ system's configuration.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
@@ -170,7 +160,7 @@ system's configuration.
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-812:
|
||||
.. _vllm-benchmark-mad-909:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -181,6 +171,9 @@ system's configuration.
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
@@ -208,7 +201,7 @@ system's configuration.
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-812>` are preconfigured to collect
|
||||
<vllm-benchmark-available-models-909>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
@@ -232,132 +225,143 @@ system's configuration.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
.. rubric:: Download the Docker image and required scripts
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
|
||||
|
||||
1. Run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||
as shown in the following snippet.
|
||||
.. seealso::
|
||||
|
||||
For more information on configuration, see the `config files
|
||||
<https://github.com/ROCm/MAD-private/tree/develop/scripts/vllm/configs>`__
|
||||
in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
|
||||
for descriptions of available configuration options
|
||||
and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
|
||||
additional benchmarking information.
|
||||
|
||||
.. rubric:: Launch the container
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ docker.docker_hub_url }}>`_ as shown
|
||||
in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. rubric:: Throughput command
|
||||
|
||||
Use the following command to start the throughput benchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
num_prompts=1024
|
||||
in=128
|
||||
out=128
|
||||
dtype={{ model.config.dtype }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs=1024
|
||||
max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm bench throughput --model $model \
|
||||
-tp $tp \
|
||||
--num-prompts $num_prompts \
|
||||
--input-len $in \
|
||||
--output-len $out \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-seq-len-to-capture $max_seq_len_to_capture \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--trust-remote-code \
|
||||
--output-json ${model}_throughput.json \
|
||||
--gpu-memory-utilization 0.9
|
||||
|
||||
.. rubric:: Serving command
|
||||
|
||||
1. Start the server using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ unified_docker.pull_tag }}
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
dtype={{ model.config.dtype }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs=256
|
||||
max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||
vllm serve $model \
|
||||
-tp $tp \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-seq-len-to-capture $max_seq_len_to_capture \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--no-enable-prefix-caching \
|
||||
--swap-space 16 \
|
||||
--disable-log-requests \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9
|
||||
|
||||
Wait until the model has loaded and the server is ready to accept requests.
|
||||
|
||||
2. On another terminal on the same machine, run the benchmark:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/vllm
|
||||
# Connect to the container
|
||||
docker exec -it test bash
|
||||
|
||||
3. To start the benchmark, use the following command with the appropriate options.
|
||||
# Wait for the server to start
|
||||
until curl -s http://localhost:8000/v1/models; do sleep 30; done
|
||||
|
||||
# Run the benchmark
|
||||
model={{ model.model_repo }}
|
||||
max_concurrency=1
|
||||
num_prompts=10
|
||||
in=128
|
||||
out=128
|
||||
vllm bench serve --model $model \
|
||||
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||
--dataset-name random \
|
||||
--ignore-eos \
|
||||
--max-concurrency $max_concurrency \
|
||||
--num-prompts $num_prompts \
|
||||
--random-input-len $in \
|
||||
--random-output-len $out \
|
||||
--trust-remote-code \
|
||||
--save-result \
|
||||
--result-filename ${model}_serving.json
|
||||
|
||||
.. note::
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block::
|
||||
|
||||
./run.sh \
|
||||
--config $CONFIG_CSV \
|
||||
--model_repo {{ model.model_repo }} \
|
||||
<overrides>
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
.. dropdown:: Benchmark options
|
||||
:open:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:align: center
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
* - ``--config``
|
||||
- ``configs/default.csv``
|
||||
- Run configs from the CSV for the chosen model repo and benchmark.
|
||||
|
||||
* -
|
||||
- ``configs/extended.csv``
|
||||
-
|
||||
|
||||
* -
|
||||
- ``configs/performance.csv``
|
||||
-
|
||||
|
||||
* - ``--benchmark``
|
||||
- ``throughput``
|
||||
- Measure offline end-to-end throughput.
|
||||
|
||||
* -
|
||||
- ``serving``
|
||||
- Measure online serving performance.
|
||||
|
||||
* -
|
||||
- ``all``
|
||||
- Measure both throughput and serving.
|
||||
|
||||
* - `<overrides>`
|
||||
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
|
||||
- Additional overrides to the config CSV.
|
||||
|
||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||
already configured. You don't need to specify them with this script.
|
||||
|
||||
.. note::
|
||||
|
||||
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block::
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
Here are some examples of running the benchmark with various options:
|
||||
|
||||
* Throughput benchmark
|
||||
|
||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||
./run.sh \
|
||||
--config configs/default.csv \
|
||||
--model_repo {{model.model_repo}} \
|
||||
--benchmark throughput
|
||||
|
||||
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
|
||||
|
||||
* Serving benchmark
|
||||
|
||||
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||
|
||||
.. code-block::
|
||||
|
||||
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||
./run.sh \
|
||||
--config configs/default.csv \
|
||||
--model_repo {{model.model_repo}} \
|
||||
--benchmark serving
|
||||
|
||||
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. raw:: html
|
||||
|
||||
@@ -382,7 +386,7 @@ Advanced usage
|
||||
==============
|
||||
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
@@ -400,7 +404,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||
.. code-block:: shell
|
||||
|
||||
cd vllm
|
||||
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
|
||||
git checkout 6663000a391911eba96d7864a26ac42b07f6ef29
|
||||
|
||||
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
@@ -419,15 +423,12 @@ Further reading
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
a brief introduction to vLLM and optimization strategies.
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||
|
||||
- To learn how to fine-tune LLMs and optimize inference, see
|
||||
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
|
||||
@@ -82,32 +82,32 @@ started.
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||
<div class="row col-10">
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -208,7 +208,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.
|
||||
|
||||
e. RDMA interface
|
||||
|
||||
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
|
||||
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
|
||||
Then, set the RDMA interfaces to use for communication.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -70,32 +70,32 @@ workloads:
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||
<div class="row col-10">
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ Multi-node setup
|
||||
|
||||
For multi-node environments, ensure you have all the necessary packages for
|
||||
your network device, such as, RDMA. If you're not using a multi-node setup
|
||||
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
||||
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
|
||||
|
||||
1. Install the following packages to build and install the RDMA driver.
|
||||
|
||||
@@ -177,7 +177,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
||||
|
||||
e. RDMA interface
|
||||
|
||||
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
|
||||
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
|
||||
Then, set the RDMA interfaces to use for communication.
|
||||
|
||||
.. code-block:: bash
|
||||
@@ -199,7 +199,7 @@ Pull the Docker image
|
||||
docker pull rocm/jax-training:maxtext-v25.5
|
||||
|
||||
2. Use the following command to launch the Docker container. Note that the benchmarking scripts
|
||||
used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
|
||||
used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
|
||||
and execute the benchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
@@ -55,32 +55,32 @@ vary by model -- select one to get started.
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||
<div class="row col-10">
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
|
||||
|
||||
@@ -45,30 +45,30 @@ vary by model -- select one to get started.
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@@ -7,15 +7,14 @@ html {
|
||||
--compat-head-color: var(--pst-color-surface);
|
||||
--compat-param-hover-color: var(--pst-color-link-hover);
|
||||
--compat-param-selected-color: var(--pst-color-primary);
|
||||
--compat-border-color: var(--pst-color-border);
|
||||
}
|
||||
|
||||
html[data-theme="light"] {
|
||||
--compat-border-color: var(--pst-gray-500);
|
||||
--compat-param-disabled-color: var(--pst-gray-300);
|
||||
}
|
||||
|
||||
html[data-theme="dark"] {
|
||||
--compat-border-color: var(--pst-gray-600);
|
||||
--compat-param-disabled-color: var(--pst-gray-600);
|
||||
}
|
||||
|
||||
@@ -23,6 +22,7 @@ div#vllm-benchmark-ud-params-picker.container-fluid {
|
||||
padding: 0 0 1rem 0;
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"],
|
||||
div[data-param-k="model"] {
|
||||
background-color: var(--compat-bg-color);
|
||||
padding: 2px;
|
||||
@@ -31,40 +31,19 @@ div[data-param-k="model"] {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"][data-param-state="selected"],
|
||||
div[data-param-k="model"][data-param-state="selected"] {
|
||||
background-color: var(--compat-param-selected-color);
|
||||
color: var(--compat-fg-color);
|
||||
}
|
||||
|
||||
div[data-param-k="model"][data-param-state="latest-version"] {
|
||||
background-color: var(--compat-param-selected-color);
|
||||
color: var(--compat-fg-color);
|
||||
}
|
||||
|
||||
div[data-param-k="model"][data-param-state="disabled"] {
|
||||
background-color: var(--compat-param-disabled-color);
|
||||
text-decoration: line-through;
|
||||
/* text-decoration-color: var(--pst-color-danger); */
|
||||
cursor: auto;
|
||||
}
|
||||
|
||||
div[data-param-k="model"]:not([data-param-state]):hover {
|
||||
div[data-param-k="model-group"]:hover,
|
||||
div[data-param-k="model"]:hover {
|
||||
background-color: var(--compat-param-hover-color);
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"] {
|
||||
background-color: var(--compat-bg-color);
|
||||
padding: 2px;
|
||||
border: solid 1px var(--compat-border-color);
|
||||
font-weight: 500;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"][data-param-state="selected"] {
|
||||
background-color: var(--compat-param-selected-color);
|
||||
color: var(--compat-fg-color);
|
||||
}
|
||||
|
||||
/*
|
||||
div[data-param-k="model-group"][data-param-state="latest-version"] {
|
||||
background-color: var(--compat-param-selected-color);
|
||||
color: var(--compat-fg-color);
|
||||
@@ -73,26 +52,19 @@ div[data-param-k="model-group"][data-param-state="latest-version"] {
|
||||
div[data-param-k="model-group"][data-param-state="disabled"] {
|
||||
background-color: var(--compat-param-disabled-color);
|
||||
text-decoration: line-through;
|
||||
/* text-decoration-color: var(--pst-color-danger); */
|
||||
text-decoration-color: var(--pst-color-danger);
|
||||
cursor: auto;
|
||||
}
|
||||
|
||||
div[data-param-k="model-group"]:not([data-param-state]):hover {
|
||||
background-color: var(--compat-param-hover-color);
|
||||
}
|
||||
*/
|
||||
|
||||
.model-param-head {
|
||||
background-color: var(--compat-head-color);
|
||||
padding: 0.15rem 0.15rem 0.15rem 0.67rem;
|
||||
/* margin: 2px; */
|
||||
border-right: solid 2px var(--compat-accent-color);
|
||||
border-right: solid 4px var(--compat-accent-color);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.model-param {
|
||||
/* padding: 2px; */
|
||||
/* margin: 0 2px 0 2px; */
|
||||
/* margin: 2px; */
|
||||
border: solid 1px var(--compat-border-color);
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user