diff --git a/.azuredevops/components/rocm-examples.yml b/.azuredevops/components/rocm-examples.yml index e87ec204b..8c413b801 100644 --- a/.azuredevops/components/rocm-examples.yml +++ b/.azuredevops/components/rocm-examples.yml @@ -37,8 +37,10 @@ parameters: - llvm-project - rocBLAS - rocFFT + - rocJPEG - rocPRIM - rocprofiler-register + - rocprofiler-sdk - ROCR-Runtime - rocRAND - rocSOLVER @@ -65,7 +67,9 @@ parameters: - rocFFT - rocminfo - rocPRIM + - rocJPEG - rocprofiler-register + - rocprofiler-sdk - ROCR-Runtime - rocRAND - rocSOLVER diff --git a/.wordlist.txt b/.wordlist.txt index b4fb3c655..8e7c9ba62 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -313,6 +313,7 @@ Mooncake Mpops Multicore Multithreaded +MXFP MyEnvironment MyST NANOO @@ -714,6 +715,7 @@ githooks github globals gnupg +gpu grayscale gx gzip @@ -768,6 +770,7 @@ invariants invocating ipo jax +json kdb kfd kv @@ -981,6 +984,7 @@ toolset toolsets torchtitan torchvision +tp tqdm tracebacks txt diff --git a/default.xml b/default.xml index 2f8a78273..19d40cd4f 100644 --- a/default.xml +++ b/default.xml @@ -1,7 +1,7 @@ - @@ -41,7 +41,6 @@ - @@ -57,7 +56,6 @@ - diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml new file mode 100644 index 000000000..99d9b773b --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml @@ -0,0 +1,188 @@ +dockers: + - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c + components: + ROCm: 6.4.1 + vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641) + PyTorch: 2.7.0+gitf717b2a + hipBLASLt: 0.15 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.1 8B + mad_tag: pyt_vllm_llama-3.1-8b + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 70B + mad_tag: pyt_vllm_llama-3.1-70b + model_repo: meta-llama/Llama-3.1-70B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B + mad_tag: pyt_vllm_llama-3.1-405b + model_repo: meta-llama/Llama-3.1-405B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 2 70B + mad_tag: pyt_vllm_llama-2-70b + model_repo: meta-llama/Llama-2-70b-chat-hf + url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 4096 + max_num_batched_tokens: 4096 + max_model_len: 4096 + - model: Llama 3.1 8B FP8 + mad_tag: pyt_vllm_llama-3.1-8b_fp8 + model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV + precision: float8 + config: + tp: 1 + dtype: auto + kv_cache_dtype: fp8 + 
max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 70B FP8 + mad_tag: pyt_vllm_llama-3.1-70b_fp8 + model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B FP8 + mad_tag: pyt_vllm_llama-3.1-405b_fp8 + model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - group: Mistral AI + tag: mistral + models: + - model: Mixtral MoE 8x7B + mad_tag: pyt_vllm_mixtral-8x7b + model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 32768 + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Mixtral MoE 8x22B + mad_tag: pyt_vllm_mixtral-8x22b + model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 65536 + max_num_batched_tokens: 65536 + max_model_len: 8192 + - model: Mixtral MoE 8x7B FP8 + mad_tag: pyt_vllm_mixtral-8x7b_fp8 + model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 32768 + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Mixtral MoE 8x22B FP8 + mad_tag: pyt_vllm_mixtral-8x22b_fp8 + model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_seq_len_to_capture: 65536 + max_num_batched_tokens: 65536 + max_model_len: 8192 + - group: Qwen + tag: qwen + models: + - model: QwQ-32B + mad_tag: pyt_vllm_qwq-32b + model_repo: Qwen/QwQ-32B + url: https://huggingface.co/Qwen/QwQ-32B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 131072 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Qwen3 30B A3B + mad_tag: pyt_vllm_qwen3-30b-a3b + model_repo: Qwen/Qwen3-30B-A3B + url: https://huggingface.co/Qwen/Qwen3-30B-A3B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 32768 + max_num_batched_tokens: 32768 + max_model_len: 8192 + - group: Microsoft Phi + tag: phi + models: + - model: Phi-4 + mad_tag: pyt_vllm_phi-4 + model_repo: microsoft/phi-4 + url: https://huggingface.co/microsoft/phi-4 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_seq_len_to_capture: 16384 + max_num_batched_tokens: 16384 + max_model_len: 8192 diff --git a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml index 99d9b773b..b669022de 100644 --- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml @@ -1,188 +1,316 @@ dockers: - - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909 - 
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c + - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006 + docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5 components: - ROCm: 6.4.1 - vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641) - PyTorch: 2.7.0+gitf717b2a - hipBLASLt: 0.15 + ROCm: 7.0.0 + vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700) + PyTorch: 2.9.0a0+git1c57644 + hipBLASLt: 1.0.0 + dockerfile: + commit: 790d22168820507f3105fef29596549378cfe399 model_groups: - group: Meta Llama tag: llama models: - - model: Llama 3.1 8B - mad_tag: pyt_vllm_llama-3.1-8b - model_repo: meta-llama/Llama-3.1-8B-Instruct - url: https://huggingface.co/meta-llama/Llama-3.1-8B - precision: float16 - config: - tp: 1 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 131072 - max_num_batched_tokens: 131072 - max_model_len: 8192 - - model: Llama 3.1 70B - mad_tag: pyt_vllm_llama-3.1-70b - model_repo: meta-llama/Llama-3.1-70B-Instruct - url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct - precision: float16 - config: - tp: 8 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 131072 - max_num_batched_tokens: 131072 - max_model_len: 8192 - - model: Llama 3.1 405B - mad_tag: pyt_vllm_llama-3.1-405b - model_repo: meta-llama/Llama-3.1-405B-Instruct - url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct - precision: float16 - config: - tp: 8 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 131072 - max_num_batched_tokens: 131072 - max_model_len: 8192 - - model: Llama 2 70B - mad_tag: pyt_vllm_llama-2-70b - model_repo: meta-llama/Llama-2-70b-chat-hf - url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf - precision: float16 - config: - tp: 8 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 4096 - max_num_batched_tokens: 4096 - max_model_len: 4096 - - model: Llama 3.1 8B FP8 - mad_tag: pyt_vllm_llama-3.1-8b_fp8 - model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV - url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV - precision: float8 - config: - tp: 1 - dtype: auto - kv_cache_dtype: fp8 - max_seq_len_to_capture: 131072 - max_num_batched_tokens: 131072 - max_model_len: 8192 - - model: Llama 3.1 70B FP8 - mad_tag: pyt_vllm_llama-3.1-70b_fp8 - model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV - url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV - precision: float8 - config: - tp: 8 - dtype: auto - kv_cache_dtype: fp8 - max_seq_len_to_capture: 131072 - max_num_batched_tokens: 131072 - max_model_len: 8192 - - model: Llama 3.1 405B FP8 - mad_tag: pyt_vllm_llama-3.1-405b_fp8 - model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV - url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV - precision: float8 - config: - tp: 8 - dtype: auto - kv_cache_dtype: fp8 - max_seq_len_to_capture: 131072 - max_num_batched_tokens: 131072 - max_model_len: 8192 + - model: Llama 2 70B + mad_tag: pyt_vllm_llama-2-70b + model_repo: meta-llama/Llama-2-70b-chat-hf + url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 4096 + max_model_len: 4096 + - model: Llama 3.1 8B + mad_tag: pyt_vllm_llama-3.1-8b + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: 
float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 8B FP8 + mad_tag: pyt_vllm_llama-3.1-8b_fp8 + model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV + precision: float8 + config: + tp: 1 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B + mad_tag: pyt_vllm_llama-3.1-405b + model_repo: meta-llama/Llama-3.1-405B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B FP8 + mad_tag: pyt_vllm_llama-3.1-405b_fp8 + model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.1 405B MXFP4 + mad_tag: pyt_vllm_llama-3.1-405b_fp4 + model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview + url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview + precision: float4 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.3 70B + mad_tag: pyt_vllm_llama-3.3-70b + model_repo: meta-llama/Llama-3.3-70B-Instruct + url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.3 70B FP8 + mad_tag: pyt_vllm_llama-3.3-70b_fp8 + model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV + url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 3.3 70B MXFP4 + mad_tag: pyt_vllm_llama-3.3-70b_fp4 + model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview + url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview + precision: float4 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - model: Llama 4 Scout 17Bx16E + mad_tag: pyt_vllm_llama-4-scout-17b-16e + model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct + url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Llama 4 Maverick 17Bx128E + mad_tag: pyt_vllm_llama-4-maverick-17b-128e + model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct + url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Llama 4 Maverick 17Bx128E FP8 + mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8 + model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek R1 0528 FP8 + mad_tag: pyt_vllm_deepseek-r1 + model_repo: deepseek-ai/DeepSeek-R1-0528 + url: 
https://huggingface.co/deepseek-ai/DeepSeek-R1-0528 + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_seqs: 1024 + max_num_batched_tokens: 131072 + max_model_len: 8192 + - group: OpenAI GPT OSS + tag: gpt-oss + models: + - model: GPT OSS 20B + mad_tag: pyt_vllm_gpt-oss-20b + model_repo: openai/gpt-oss-20b + url: https://huggingface.co/openai/gpt-oss-20b + precision: bfloat16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 8192 + max_model_len: 8192 + - model: GPT OSS 120B + mad_tag: pyt_vllm_gpt-oss-120b + model_repo: openai/gpt-oss-120b + url: https://huggingface.co/openai/gpt-oss-120b + precision: bfloat16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 8192 + max_model_len: 8192 - group: Mistral AI tag: mistral models: - - model: Mixtral MoE 8x7B - mad_tag: pyt_vllm_mixtral-8x7b - model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 - url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 - precision: float16 - config: - tp: 8 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 32768 - max_num_batched_tokens: 32768 - max_model_len: 8192 - - model: Mixtral MoE 8x22B - mad_tag: pyt_vllm_mixtral-8x22b - model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 - url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 - precision: float16 - config: - tp: 8 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 65536 - max_num_batched_tokens: 65536 - max_model_len: 8192 - - model: Mixtral MoE 8x7B FP8 - mad_tag: pyt_vllm_mixtral-8x7b_fp8 - model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV - url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV - precision: float8 - config: - tp: 8 - dtype: auto - kv_cache_dtype: fp8 - max_seq_len_to_capture: 32768 - max_num_batched_tokens: 32768 - max_model_len: 8192 - - model: Mixtral MoE 8x22B FP8 - mad_tag: pyt_vllm_mixtral-8x22b_fp8 - model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV - url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV - precision: float8 - config: - tp: 8 - dtype: auto - kv_cache_dtype: fp8 - max_seq_len_to_capture: 65536 - max_num_batched_tokens: 65536 - max_model_len: 8192 + - model: Mixtral MoE 8x7B + mad_tag: pyt_vllm_mixtral-8x7b + model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Mixtral MoE 8x7B FP8 + mad_tag: pyt_vllm_mixtral-8x7b_fp8 + model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 32768 + max_model_len: 8192 + - model: Mixtral MoE 8x22B + mad_tag: pyt_vllm_mixtral-8x22b + model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1 + url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1 + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 65536 + max_model_len: 8192 + - model: Mixtral MoE 8x22B FP8 + mad_tag: pyt_vllm_mixtral-8x22b_fp8 + model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 65536 + max_model_len: 8192 - group: Qwen tag: qwen models: - - model: QwQ-32B - mad_tag: 
pyt_vllm_qwq-32b - model_repo: Qwen/QwQ-32B - url: https://huggingface.co/Qwen/QwQ-32B - precision: float16 - config: - tp: 1 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 131072 - max_num_batched_tokens: 131072 - max_model_len: 8192 - - model: Qwen3 30B A3B - mad_tag: pyt_vllm_qwen3-30b-a3b - model_repo: Qwen/Qwen3-30B-A3B - url: https://huggingface.co/Qwen/Qwen3-30B-A3B - precision: float16 - config: - tp: 1 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 32768 - max_num_batched_tokens: 32768 - max_model_len: 8192 + - model: Qwen3 8B + mad_tag: pyt_vllm_qwen3-8b + model_repo: Qwen/Qwen3-8B + url: https://huggingface.co/Qwen/Qwen3-8B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 32B + mad_tag: pyt_vllm_qwen3-32b + model_repo: Qwen/Qwen3-32b + url: https://huggingface.co/Qwen/Qwen3-32B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 30B A3B + mad_tag: pyt_vllm_qwen3-30b-a3b + model_repo: Qwen/Qwen3-30B-A3B + url: https://huggingface.co/Qwen/Qwen3-30B-A3B + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 30B A3B FP8 + mad_tag: pyt_vllm_qwen3-30b-a3b_fp8 + model_repo: Qwen/Qwen3-30B-A3B-FP8 + url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8 + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 235B A22B + mad_tag: pyt_vllm_qwen3-235b-a22b + model_repo: Qwen/Qwen3-235B-A22B + url: https://huggingface.co/Qwen/Qwen3-235B-A22B + precision: float16 + config: + tp: 8 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 40960 + max_model_len: 8192 + - model: Qwen3 235B A22B FP8 + mad_tag: pyt_vllm_qwen3-235b-a22b_fp8 + model_repo: Qwen/Qwen3-235B-A22B-FP8 + url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8 + precision: float8 + config: + tp: 8 + dtype: auto + kv_cache_dtype: fp8 + max_num_batched_tokens: 40960 + max_model_len: 8192 - group: Microsoft Phi tag: phi models: - - model: Phi-4 - mad_tag: pyt_vllm_phi-4 - model_repo: microsoft/phi-4 - url: https://huggingface.co/microsoft/phi-4 - config: - tp: 1 - dtype: auto - kv_cache_dtype: auto - max_seq_len_to_capture: 16384 - max_num_batched_tokens: 16384 - max_model_len: 8192 + - model: Phi-4 + mad_tag: pyt_vllm_phi-4 + model_repo: microsoft/phi-4 + url: https://huggingface.co/microsoft/phi-4 + precision: float16 + config: + tp: 1 + dtype: auto + kv_cache_dtype: auto + max_num_batched_tokens: 16384 + max_model_len: 8192 diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst new file mode 100644 index 000000000..a68618338 --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst @@ -0,0 +1,448 @@ +:orphan: + +.. meta:: + :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the ROCm vLLM Docker image. + :keywords: model, MAD, automation, dashboarding, validate + +********************************** +vLLM inference performance testing +********************************** + +.. 
caution:: + + This documentation does not reflect the latest version of ROCm vLLM + inference performance documentation. See :doc:`../vllm` for the latest version. + +.. _vllm-benchmark-unified-docker-909: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + + The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers + a prebuilt, optimized environment for validating large language model (LLM) + inference performance on AMD Instinctâ„¢ MI300X series accelerators. This ROCm vLLM + Docker image integrates vLLM and PyTorch tailored specifically for MI300X series + accelerators and includes the following components: + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + +With this Docker image, you can quickly test the :ref:`expected +inference performance numbers ` for +MI300X series accelerators. + +What's new +========== + +The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release `. + +* Upgraded to vLLM v0.10.1. + +* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance. + +* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile. + +.. _vllm-benchmark-supported-models-909: + +Supported models +================ + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} + + .. _vllm-benchmark-available-models-909: + + The following models are supported for inference performance benchmarking + with vLLM and ROCm. Some instructions, commands, and recommendations in this + documentation might vary by model -- select one to get started. + + .. raw:: html + +
+      <!-- Raw HTML "Model" / "Variant" selector (tag markup not recoverable here):
+           a Model dropdown lists each {{ model_group.group }}, and a Variant
+           dropdown lists each {{ model.model }}, with option styling chosen by
+           whether models|length % 3 == 0. -->
+ + .. _vllm-benchmark-vllm-909: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + .. note:: + + See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. + Some models require access authorization prior to use via an external license agreement through a third party. + {% if model.precision == "float8" and model.model_repo.startswith("amd") %} + This model uses FP8 quantization via `AMD Quark `__ for efficient inference on AMD accelerators. + {% endif %} + + {% endfor %} + {% endfor %} + +.. _vllm-benchmark-performance-measurements-909: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `_ +page provides reference throughput and serving measurements for inferencing popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `_ + only reflects the latest version of this inference benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} + + Pull the Docker image + ===================== + + Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_. + Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + Benchmarking + ============ + + Once the setup is complete, choose between two options to reproduce the + benchmark results: + + .. _vllm-benchmark-mad-909: + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + The following run command is tailored to {{ model.model }}. + See :ref:`vllm-benchmark-supported-models-909` to switch to another available model. + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model + using one GPU with the :literal:`{{model.precision}}` data type on the host machine. + + .. 
code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{model.mad_tag}} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the + model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` + and ``{{ model.mad_tag }}_serving.csv``. + + Although the :ref:`available models + ` are preconfigured to collect + offline throughput and online serving performance data, you can + also change the benchmarking parameters. See the standalone + benchmarking tab for more information. + + {% if model.tunableop %} + + .. note:: + + For improved performance, consider enabling :ref:`PyTorch TunableOp `. + TunableOp automatically explores different implementations and configurations of certain PyTorch + operators to find the fastest one for your hardware. + + By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see + ``__). To enable it, include + the ``--tunableop on`` argument in your run. + + Enabling TunableOp triggers a two-pass run -- a warm-up followed by the + performance-collection run. + + {% endif %} + + .. tab-item:: Standalone benchmarking + + The following commands are optimized for {{ model.model }}. + See :ref:`vllm-benchmark-supported-models-909` to switch to another available model. + + .. seealso:: + + For more information on configuration, see the `config files + `__ + in the MAD repository. Refer to the `vLLM engine `__ + for descriptions of available configuration options + and `Benchmarking vLLM `__ for + additional benchmarking information. + + .. rubric:: Launch the container + + You can run the vLLM benchmark tool independently by starting the + `Docker container <{{ docker.docker_hub_url }}>`_ as shown + in the following snippet. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + docker run -it \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --shm-size 16G \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --cap-add=SYS_PTRACE \ + -v $(pwd):/workspace \ + --env HUGGINGFACE_HUB_CACHE=/workspace \ + --name test \ + {{ docker.pull_tag }} + + .. rubric:: Throughput command + + Use the following command to start the throughput benchmark. + + .. code-block:: shell + + model={{ model.model_repo }} + tp={{ model.config.tp }} + num_prompts=1024 + in=128 + out=128 + dtype={{ model.config.dtype }} + kv_cache_dtype={{ model.config.kv_cache_dtype }} + max_num_seqs=1024 + max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }} + max_num_batched_tokens={{ model.config.max_num_batched_tokens }} + max_model_len={{ model.config.max_model_len }} + + vllm bench throughput --model $model \ + -tp $tp \ + --num-prompts $num_prompts \ + --input-len $in \ + --output-len $out \ + --dtype $dtype \ + --kv-cache-dtype $kv_cache_dtype \ + --max-num-seqs $max_num_seqs \ + --max-seq-len-to-capture $max_seq_len_to_capture \ + --max-num-batched-tokens $max_num_batched_tokens \ + --max-model-len $max_model_len \ + --trust-remote-code \ + --output-json ${model}_throughput.json \ + --gpu-memory-utilization 0.9 + + .. rubric:: Serving command + + 1. Start the server using the following command: + + .. 
code-block:: shell + + model={{ model.model_repo }} + tp={{ model.config.tp }} + dtype={{ model.config.dtype }} + kv_cache_dtype={{ model.config.kv_cache_dtype }} + max_num_seqs=256 + max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }} + max_num_batched_tokens={{ model.config.max_num_batched_tokens }} + max_model_len={{ model.config.max_model_len }} + + vllm serve $model \ + -tp $tp \ + --dtype $dtype \ + --kv-cache-dtype $kv_cache_dtype \ + --max-num-seqs $max_num_seqs \ + --max-seq-len-to-capture $max_seq_len_to_capture \ + --max-num-batched-tokens $max_num_batched_tokens \ + --max-model-len $max_model_len \ + --no-enable-prefix-caching \ + --swap-space 16 \ + --disable-log-requests \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 + + Wait until the model has loaded and the server is ready to accept requests. + + 2. On another terminal on the same machine, run the benchmark: + + .. code-block:: shell + + # Connect to the container + docker exec -it test bash + + # Wait for the server to start + until curl -s http://localhost:8000/v1/models; do sleep 30; done + + # Run the benchmark + model={{ model.model_repo }} + max_concurrency=1 + num_prompts=10 + in=128 + out=128 + vllm bench serve --model $model \ + --percentile-metrics "ttft,tpot,itl,e2el" \ + --dataset-name random \ + --ignore-eos \ + --max-concurrency $max_concurrency \ + --num-prompts $num_prompts \ + --random-input-len $in \ + --random-output-len $out \ + --trust-remote-code \ + --save-result \ + --result-filename ${model}_serving.json + + .. note:: + + For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B, + try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands. + + If you encounter the following error, pass your access-authorized Hugging + Face token to the gated models. + + .. code-block:: + + OSError: You are trying to access a gated repo. + + # pass your HF_TOKEN + export HF_TOKEN=$your_personal_hf_token + + .. raw:: html + + + + .. note:: + + Throughput is calculated as: + + - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time + + - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time + {% endfor %} + {% endfor %} + +Advanced usage +============== + +For information on experimental features and known issues related to ROCm optimization efforts on vLLM, +see the developer's guide at ``__. + +Reproducing the Docker image +---------------------------- + +To reproduce this ROCm/vLLM Docker image release, follow these steps: + +1. Clone the `vLLM repository `__. + + .. code-block:: shell + + git clone https://github.com/ROCm/vllm.git + +2. Checkout the specific release commit. + + .. code-block:: shell + + cd vllm + git checkout 6663000a391911eba96d7864a26ac42b07f6ef29 + +3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag. + + .. code-block:: shell + + docker build -f docker/Dockerfile.rocm -t vllm-rocm . + +Further reading +=============== + +- To learn more about the options for latency and throughput benchmark scripts, + see ``_. + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. 
+ +- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for + a brief introduction to vLLM and optimization strategies. + +- For application performance optimization strategies for HPC and AI workloads, + including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`vllm-history` to find documentation for previous releases +of the ``ROCm/vllm`` Docker image. diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst index 2fbd21002..274492147 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst @@ -7,7 +7,7 @@ vLLM inference performance testing version history This table lists previous versions of the ROCm vLLM inference Docker image for inference performance testing. For detailed information about available models for benchmarking, see the version-specific documentation. You can find tagged -previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub `__. +previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub `__. .. list-table:: :header-rows: 1 @@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub ` + * `Docker Hub `__ + + * - ``rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909`` - * ROCm 6.4.1 * vLLM 0.10.1 * PyTorch 2.7.0 - - * :doc:`Documentation <../vllm>` + * :doc:`Documentation ` * `Docker Hub `__ * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812`` diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst index 38a5f8200..66e7f6621 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst @@ -6,45 +6,63 @@ vLLM inference performance testing ********************************** -.. _vllm-benchmark-unified-docker-909: +.. _vllm-benchmark-unified-docker-930: .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml {% set docker = data.dockers[0] %} - The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers - a prebuilt, optimized environment for validating large language model (LLM) - inference performance on AMD Instinctâ„¢ MI300X series GPUs. This ROCm vLLM - Docker image integrates vLLM and PyTorch tailored specifically for MI300X series - GPUs and includes the following components: + The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a + prebuilt, optimized environment for validating large language model (LLM) + inference performance on AMD Instinctâ„¢ MI355X, MI350X, MI325X and MI300X + GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored + specifically for AMD data center GPUs and includes the following components: - .. list-table:: - :header-rows: 1 + .. tab-set:: - * - Software component - - Version + .. tab-item:: {{ docker.pull_tag }} - {% for component_name, component_version in docker.components.items() %} - * - {{ component_name }} - - {{ component_version }} - {% endfor %} + .. 
list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} With this Docker image, you can quickly test the :ref:`expected -inference performance numbers ` for -MI300X series GPUs. +inference performance numbers ` for +AMD Instinct GPUs. What's new ========== The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release `. -* Upgraded to vLLM v0.10.1. +* Added support for AMD Instinct MI355X and MI350X GPUs. -* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance. +* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`. -* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile. + * Llama 4 Scout and Maverick -.. _vllm-benchmark-supported-models-909: + * DeepSeek R1 0528 FP8 + + * MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4 + + * GPT OSS 20B and 120B + + * Qwen 3 32B, 30B-A3B, and 235B-A22B + +* Removed the deprecated ``--max-seq-len-to-capture`` flag. + +* ``--gpu-memory-utilization`` is now configurable via the `configuration files + `__ in the MAD + repository. + +.. _vllm-benchmark-supported-models-930: Supported models ================ @@ -54,11 +72,12 @@ Supported models {% set docker = data.dockers[0] %} {% set model_groups = data.model_groups %} - .. _vllm-benchmark-available-models-909: + .. _vllm-benchmark-available-models-930: The following models are supported for inference performance benchmarking with vLLM and ROCm. Some instructions, commands, and recommendations in this - documentation might vary by model -- select one to get started. + documentation might vary by model -- select one to get started. MXFP4 models + are only supported on MI355X and MI350X GPUs. .. raw:: html @@ -67,7 +86,7 @@ Supported models
      <!-- Raw HTML model selector hunk: the <option> entry rendering {{ model_group.group }}
      in the "Model" dropdown is updated; the surrounding markup is not recoverable here. -->
@@ -89,13 +108,20 @@ Supported models - .. _vllm-benchmark-vllm-909: + .. _vllm-benchmark-vllm-930: {% for model_group in model_groups %} {% for model in model_group.models %} .. container:: model-doc {{ model.mad_tag }} + + {% if model.precision == "float4" %} + .. important:: + + MXFP4 is supported only on MI355X and MI350X GPUs. + {% endif %} + .. note:: See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. @@ -103,11 +129,14 @@ Supported models {% if model.precision == "float8" and model.model_repo.startswith("amd") %} This model uses FP8 quantization via `AMD Quark `__ for efficient inference on AMD GPUs. {% endif %} + {% if model.precision == "float4" and model.model_repo.startswith("amd") %} + This model uses FP4 quantization via `AMD Quark `__ for efficient inference on AMD GPUs. + {% endif %} {% endfor %} {% endfor %} -.. _vllm-benchmark-performance-measurements-909: +.. _vllm-benchmark-performance-measurements-930: Performance measurements ======================== @@ -121,7 +150,7 @@ page provides reference throughput and serving measurements for inferencing popu The performance data presented in `Performance results with AMD ROCm software `_ only reflects the latest version of this inference benchmarking environment. - The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software. System validation ================= @@ -163,7 +192,7 @@ Benchmarking Once the setup is complete, choose between two options to reproduce the benchmark results: - .. _vllm-benchmark-mad-909: + .. _vllm-benchmark-mad-930: {% for model_group in model_groups %} {% for model in model_group.models %} @@ -175,7 +204,7 @@ Benchmarking .. tab-item:: MAD-integrated benchmarking The following run command is tailored to {{ model.model }}. - See :ref:`vllm-benchmark-supported-models-909` to switch to another available model. + See :ref:`vllm-benchmark-supported-models-930` to switch to another available model. 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local directory and install the required packages on the host machine. @@ -186,8 +215,9 @@ Benchmarking cd MAD pip install -r requirements.txt - 2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model - using one GPU with the :literal:`{{model.precision}}` data type on the host machine. + 2. On the host machine, use this command to run the performance benchmark test on + the `{{model.model}} <{{ model.url }}>`_ model using one node with the + :literal:`{{model.precision}}` data type. .. code-block:: shell @@ -195,8 +225,7 @@ Benchmarking madengine run \ --tags {{model.mad_tag}} \ --keep-model-dir \ - --live-output \ - --timeout 28800 + --live-output MAD launches a Docker container with the name ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the @@ -204,7 +233,7 @@ Benchmarking and ``{{ model.mad_tag }}_serving.csv``. Although the :ref:`available models - ` are preconfigured to collect + ` are preconfigured to collect offline throughput and online serving performance data, you can also change the benchmarking parameters. See the standalone benchmarking tab for more information. @@ -229,7 +258,7 @@ Benchmarking .. 
tab-item:: Standalone benchmarking The following commands are optimized for {{ model.model }}. - See :ref:`vllm-benchmark-supported-models-909` to switch to another available model. + See :ref:`vllm-benchmark-supported-models-930` to switch to another available model. .. seealso:: @@ -270,13 +299,12 @@ Benchmarking model={{ model.model_repo }} tp={{ model.config.tp }} - num_prompts=1024 - in=128 - out=128 - dtype={{ model.config.dtype }} + num_prompts={{ model.config.num_prompts | default(1024) }} + in={{ model.config.in | default(128) }} + out={{ model.config.in | default(128) }} + dtype={{ model.config.dtype | default("auto") }} kv_cache_dtype={{ model.config.kv_cache_dtype }} - max_num_seqs=1024 - max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }} + max_num_seqs={{ model.config.max_num_seqs | default(1024) }} max_num_batched_tokens={{ model.config.max_num_batched_tokens }} max_model_len={{ model.config.max_model_len }} @@ -288,12 +316,11 @@ Benchmarking --dtype $dtype \ --kv-cache-dtype $kv_cache_dtype \ --max-num-seqs $max_num_seqs \ - --max-seq-len-to-capture $max_seq_len_to_capture \ --max-num-batched-tokens $max_num_batched_tokens \ --max-model-len $max_model_len \ --trust-remote-code \ --output-json ${model}_throughput.json \ - --gpu-memory-utilization 0.9 + --gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }} .. rubric:: Serving command @@ -306,7 +333,6 @@ Benchmarking dtype={{ model.config.dtype }} kv_cache_dtype={{ model.config.kv_cache_dtype }} max_num_seqs=256 - max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }} max_num_batched_tokens={{ model.config.max_num_batched_tokens }} max_model_len={{ model.config.max_model_len }} @@ -315,7 +341,6 @@ Benchmarking --dtype $dtype \ --kv-cache-dtype $kv_cache_dtype \ --max-num-seqs $max_num_seqs \ - --max-seq-len-to-capture $max_seq_len_to_capture \ --max-num-batched-tokens $max_num_batched_tokens \ --max-model-len $max_model_len \ --no-enable-prefix-caching \ @@ -397,26 +422,31 @@ see the developer's guide at ``__. - - .. code-block:: shell - - git clone https://github.com/ROCm/vllm.git - -2. Checkout the specific release commit. +1. Clone the `vLLM repository `__. .. code-block:: shell + git clone https://github.com/vllm-project/vllm.git cd vllm - git checkout 6663000a391911eba96d7864a26ac42b07f6ef29 -3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag. +2. Use the following command to build the image directly from the specified commit. - .. code-block:: shell + .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml - docker build -f docker/Dockerfile.rocm -t vllm-rocm . + {% set docker = data.dockers[0] %} + .. code-block:: shell + + docker build -f docker/Dockerfile.rocm \ + --build-arg REMOTE_VLLM=1 \ + --build-arg VLLM_REPO=https://github.com/ROCm/vllm \ + --build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \ + -t vllm-rocm . + + .. tip:: + + Replace ``vllm-rocm`` with your desired image tag. 
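+   After the build completes, a quick way to confirm the image is usable is a short
+   smoke test. This is a minimal sketch rather than part of the documented workflow:
+   it assumes the image was tagged ``vllm-rocm`` as in the build command above and
+   that the ROCm devices are exposed the same way as in the earlier ``docker run``
+   examples.
+
+   .. code-block:: shell
+
+      # Smoke test (assumes the local tag vllm-rocm from the build step above):
+      # print the vLLM and PyTorch versions baked into the freshly built image.
+      docker run --rm \
+        --device=/dev/kfd \
+        --device=/dev/dri \
+        --group-add video \
+        --security-opt seccomp=unconfined \
+        vllm-rocm \
+        python3 -c "import vllm, torch; print(vllm.__version__, torch.__version__)"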
Further reading =============== diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 96855ee72..713f8e931 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,4 +1,4 @@ -rocm-docs-core==1.20.1 +rocm-docs-core==1.26.0 sphinx-reredirects sphinx-sitemap sphinxcontrib.datatemplates==0.11.0 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index bd30d7406..3b8d22771 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile requirements.in +# pip-compile docs/sphinx/requirements.in # accessible-pygments==0.0.5 # via pydata-sphinx-theme @@ -10,7 +10,7 @@ alabaster==1.0.0 # via sphinx asttokens==3.0.0 # via stack-data -attrs==25.3.0 +attrs==25.4.0 # via # jsonschema # jupyter-cache @@ -19,34 +19,32 @@ babel==2.17.0 # via # pydata-sphinx-theme # sphinx -beautifulsoup4==4.13.4 +beautifulsoup4==4.14.2 # via pydata-sphinx-theme breathe==4.36.0 # via rocm-docs-core -certifi==2025.4.26 +certifi==2025.10.5 # via requests -cffi==1.17.1 +cffi==2.0.0 # via # cryptography # pynacl -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 # via requests -click==8.2.1 +click==8.3.0 # via # jupyter-cache # sphinx-external-toc -comm==0.2.2 +comm==0.2.3 # via ipykernel -cryptography==45.0.3 +cryptography==46.0.2 # via pyjwt -debugpy==1.8.14 +debugpy==1.8.17 # via ipykernel decorator==5.2.1 # via ipython defusedxml==0.7.1 # via sphinxcontrib-datatemplates -deprecated==1.2.18 - # via pygithub docutils==0.21.2 # via # myst-parser @@ -54,17 +52,17 @@ docutils==0.21.2 # sphinx exceptiongroup==1.3.0 # via ipython -executing==2.2.0 +executing==2.2.1 # via stack-data -fastjsonschema==2.21.1 +fastjsonschema==2.21.2 # via # nbformat # rocm-docs-core gitdb==4.0.12 # via gitpython -gitpython==3.1.44 +gitpython==3.1.45 # via rocm-docs-core -greenlet==3.2.3 +greenlet==3.2.4 # via sqlalchemy idna==3.10 # via requests @@ -74,7 +72,7 @@ importlib-metadata==8.7.0 # via # jupyter-cache # myst-nb -ipykernel==6.29.5 +ipykernel==6.30.1 # via myst-nb ipython==8.37.0 # via @@ -86,9 +84,9 @@ jinja2==3.1.6 # via # myst-parser # sphinx -jsonschema==4.24.0 +jsonschema==4.25.1 # via nbformat -jsonschema-specifications==2025.4.1 +jsonschema-specifications==2025.9.1 # via jsonschema jupyter-cache==1.0.1 # via myst-nb @@ -106,17 +104,17 @@ markdown-it-py==3.0.0 # via # mdit-py-plugins # myst-parser -markupsafe==3.0.2 +markupsafe==3.0.3 # via jinja2 matplotlib-inline==0.1.7 # via # ipykernel # ipython -mdit-py-plugins==0.4.2 +mdit-py-plugins==0.5.0 # via myst-parser mdurl==0.1.2 # via markdown-it-py -myst-nb==1.2.0 +myst-nb==1.3.0 # via rocm-docs-core myst-parser==4.0.1 # via myst-nb @@ -134,31 +132,30 @@ nest-asyncio==1.6.0 packaging==25.0 # via # ipykernel - # pydata-sphinx-theme # sphinx -parso==0.8.4 +parso==0.8.5 # via jedi pexpect==4.9.0 # via ipython -platformdirs==4.3.8 +platformdirs==4.4.0 # via jupyter-core -prompt-toolkit==3.0.51 +prompt-toolkit==3.0.52 # via ipython -psutil==7.0.0 +psutil==7.1.0 # via ipykernel ptyprocess==0.7.0 # via pexpect pure-eval==0.2.3 # via stack-data -pycparser==2.22 +pycparser==2.23 # via cffi -pydata-sphinx-theme==0.15.4 +pydata-sphinx-theme==0.16.1 # via # rocm-docs-core # sphinx-book-theme -pygithub==2.6.1 +pygithub==2.8.1 # via rocm-docs-core -pygments==2.19.1 +pygments==2.19.2 # via # accessible-pygments # ipython @@ -166,11 +163,11 @@ pygments==2.19.1 # sphinx pyjwt[crypto]==2.10.1 # via 
pygithub -pynacl==1.5.0 +pynacl==1.6.0 # via pygithub python-dateutil==2.9.0.post0 # via jupyter-client -pyyaml==6.0.2 +pyyaml==6.0.3 # via # jupyter-cache # myst-nb @@ -178,7 +175,7 @@ pyyaml==6.0.2 # rocm-docs-core # sphinx-external-toc # sphinxcontrib-datatemplates -pyzmq==26.4.0 +pyzmq==27.1.0 # via # ipykernel # jupyter-client @@ -186,13 +183,13 @@ referencing==0.36.2 # via # jsonschema # jsonschema-specifications -requests==2.32.4 +requests==2.32.5 # via # pygithub # sphinx -rocm-docs-core==1.20.1 - # via -r requirements.in -rpds-py==0.25.1 +rocm-docs-core==1.26.0 + # via -r docs/sphinx/requirements.in +rpds-py==0.27.1 # via # jsonschema # referencing @@ -202,7 +199,7 @@ smmap==5.0.2 # via gitdb snowballstemmer==3.0.1 # via sphinx -soupsieve==2.7 +soupsieve==2.8 # via beautifulsoup4 sphinx==8.1.3 # via @@ -220,7 +217,7 @@ sphinx==8.1.3 # sphinx-reredirects # sphinxcontrib-datatemplates # sphinxcontrib-runcmd -sphinx-book-theme==1.1.4 +sphinx-book-theme==1.1.3 # via rocm-docs-core sphinx-copybutton==0.5.2 # via rocm-docs-core @@ -233,13 +230,13 @@ sphinx-last-updated-by-git==0.3.8 sphinx-notfound-page==1.1.0 # via rocm-docs-core sphinx-reredirects==0.1.6 - # via -r requirements.in -sphinx-sitemap==2.8.0 - # via -r requirements.in + # via -r docs/sphinx/requirements.in +sphinx-sitemap==2.9.0 + # via -r docs/sphinx/requirements.in sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-datatemplates==0.11.0 - # via -r requirements.in + # via -r docs/sphinx/requirements.in sphinxcontrib-devhelp==2.0.0 # via sphinx sphinxcontrib-htmlhelp==2.1.0 @@ -252,7 +249,7 @@ sphinxcontrib-runcmd==0.2.0 # via sphinxcontrib-datatemplates sphinxcontrib-serializinghtml==2.0.0 # via sphinx -sqlalchemy==2.0.41 +sqlalchemy==2.0.43 # via jupyter-cache stack-data==0.6.3 # via ipython @@ -260,13 +257,12 @@ tabulate==0.9.0 # via jupyter-cache tomli==2.2.1 # via sphinx -tornado==6.5.1 +tornado==6.5.2 # via # ipykernel # jupyter-client traitlets==5.14.3 # via - # comm # ipykernel # ipython # jupyter-client @@ -274,9 +270,10 @@ traitlets==5.14.3 # matplotlib-inline # nbclient # nbformat -typing-extensions==4.14.0 +typing-extensions==4.15.0 # via # beautifulsoup4 + # cryptography # exceptiongroup # ipython # myst-nb @@ -288,9 +285,7 @@ urllib3==2.5.0 # via # pygithub # requests -wcwidth==0.2.13 +wcwidth==0.2.14 # via prompt-toolkit -wrapt==1.17.2 - # via deprecated zipp==3.23.0 # via importlib-metadata diff --git a/tools/rocm-build/rocm-7.0.2.xml b/tools/rocm-build/rocm-7.0.2.xml new file mode 100644 index 000000000..8ad54e0f0 --- /dev/null +++ b/tools/rocm-build/rocm-7.0.2.xml @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +