mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-11 07:38:17 -05:00)

Compare commits: add-spdlog ... develop (5 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 17d04124c1 |  |
|  | 4b8fdf1ae3 |  |
|  | c88f3996dc |  |
|  | 5b53802c54 |  |
|  | bdeef73263 |  |
@@ -189,9 +189,9 @@ jobs:
       -DMIGRAPHX_ENABLE_C_API_TEST=ON
       ..
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-  - task: Bash@3
-    displayName: Build and run MIGraphX tests
-    inputs:
-      targetType: inline
-      workingDirectory: build
-      script: make -j$(nproc) check
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+    parameters:
+      componentName: AMDMIGraphX
+      testExecutable: make
+      testParameters: -j$(nproc) check
+      testPublishResults: false
@@ -107,6 +107,7 @@ jobs:
       runRocminfo: false
   - task: Bash@3
     displayName: Build kfdtest
+    continueOnError: true
     inputs:
       targetType: 'inline'
       workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
@@ -122,6 +123,7 @@ jobs:
       testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
   - task: Bash@3
     displayName: Build rdmatest app
+    continueOnError: true
     inputs:
       targetType: 'inline'
       workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/rdma/simple/app
@@ -130,6 +132,7 @@ jobs:
       cmake --build .
   - task: Bash@3
     displayName: Build rdmatest driver
+    continueOnError: true
     inputs:
       targetType: 'inline'
       workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/rdma/simple/drv
@@ -139,6 +142,7 @@ jobs:
       RDMA_HEADER_DIR=/usr/src/amdgpu-*/include make all
   - task: Bash@3
     displayName: Install rdmatest driver
+    continueOnError: true
    inputs:
      targetType: 'inline'
      workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/rdma/simple/drv
@@ -154,6 +158,7 @@ jobs:
       testPublishResults: false
   - task: Bash@3
     displayName: Build rocrtst
+    continueOnError: true
     inputs:
       targetType: 'inline'
       workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
@@ -13,6 +13,7 @@ parameters:
     - libyaml-cpp-dev
     - libpci-dev
     - libpci3
+    - libgst-dev
     - libgtest-dev
     - git
 - name: rocmDependencies
@@ -105,6 +105,12 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
     parameters:
       checkoutRepo: ${{ parameters.checkoutRepo }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    parameters:
+      ${{ if eq(parameters.checkoutRef, '') }}:
+        dependencySource: staging
+      ${{ elseif ne(parameters.checkoutRef, '') }}:
+        dependencySource: tag-builds
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
     parameters:
       dependencyList: ${{ parameters.rocmDependencies }}
@@ -40,6 +40,7 @@ parameters:
 - name: rocmDependencies
   type: object
   default:
+    - aomp
     - clr
     - llvm-project
     - rccl
@@ -36,6 +36,7 @@ Bluefield
 Bootloader
 CCD
 CDNA
+CHTML
 CIFAR
 CLI
 CLion
@@ -70,6 +71,7 @@ Concretized
 Conda
 ConnectX
 CuPy
+Dashboarding
 DDR
 DF
 DGEMM
@@ -227,6 +229,7 @@ Mellanox's
 Meta's
 Miniconda
 MirroredStrategy
+Mixtral
 Multicore
 Multithreaded
 MyEnvironment
@@ -294,6 +297,7 @@ PowerShell
 PyPi
 PyTorch
 Qcycles
+Qwen
 RAII
 RAS
 RCCL
@@ -563,6 +567,7 @@ hipfort
 hipify
 hipsolver
 hipsparse
+hlist
 hotspotting
 hpc
 hpp
@@ -586,6 +591,7 @@ intra
 invariants
 invocating
 ipo
+jax
 kdb
 kfd
 latencies
@@ -606,6 +612,7 @@ migraphx
 miopen
 miopengemm
 mivisionx
+mjx
 mkdir
 mlirmiopen
 mtypes
@@ -81,6 +81,7 @@ article_pages = [
         "file": "how-to/llm-fine-tuning-optimization/profiling-and-debugging",
         "os": ["linux"],
     },
+    {"file": "how-to/performance-validation/mi300x/vllm-benchmark", "os": ["linux"]},
     {"file": "how-to/system-optimization/index", "os": ["linux"]},
     {"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
     {"file": "how-to/system-optimization/mi200", "os": ["linux"]},
@@ -16,7 +16,7 @@ This section discusses how to implement `vLLM <https://docs.vllm.ai/en/latest>`_
 vLLM inference
 ==============
 
-vLLM is renowned for its paged attention algorithm that can reduce memory consumption and increase throughput thanks to
+vLLM is renowned for its PagedAttention algorithm that can reduce memory consumption and increase throughput thanks to
 its paging scheme. Instead of allocating GPU high-bandwidth memory (HBM) for the maximum output token lengths of the
 models, the paged attention of vLLM allocates GPU HBM dynamically for its actual decoding lengths. This paged attention
 is also effective when multiple requests share the same key and value contents for a large value of beam search or
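The paging scheme this hunk describes can be illustrated with a minimal sketch
(hypothetical block size and bookkeeping, not vLLM's actual data structures):
physical KV-cache blocks are claimed only when decoding crosses a block
boundary, so memory grows with the actual output length instead of the
configured maximum.

    BLOCK_SIZE = 16  # tokens per KV-cache block (hypothetical)

    class PagedKVCache:
        """Toy block-table bookkeeping for PagedAttention-style paging."""

        def __init__(self, num_blocks):
            self.free_blocks = list(range(num_blocks))
            self.block_table = {}  # seq_id -> list of physical block ids
            self.seq_len = {}      # seq_id -> tokens cached so far

        def append_token(self, seq_id):
            n = self.seq_len.get(seq_id, 0)
            if n % BLOCK_SIZE == 0:  # block boundary: allocate one more block
                self.block_table.setdefault(seq_id, []).append(self.free_blocks.pop())
            self.seq_len[seq_id] = n + 1
            block = self.block_table[seq_id][n // BLOCK_SIZE]
            return block * BLOCK_SIZE + (n % BLOCK_SIZE)  # physical slot index

    cache = PagedKVCache(num_blocks=1024)
    for _ in range(40):
        cache.append_token(seq_id=0)
    print(len(cache.block_table[0]))  # 3 blocks for 40 tokens, not a max-length reservation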
@@ -139,9 +139,7 @@ Refer to :ref:`mi300x-vllm-optimization` for performance optimization tips.
 
 ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
 on the MI300X accelerator. The Docker image includes ROCm, vLLM, PyTorch, and tuning files in the CSV
-format. For more information, see the guide to
-`LLM inference performance validation with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_
-on the ROCm GitHub repository.
+format. For more information, see :doc:`/how-to/performance-validation/mi300x/vllm-benchmark`.
 
 .. _fine-tuning-llms-tgi:
 
docs/how-to/performance-validation/mi300x/vllm-benchmark.rst (new file, 407 lines)
@@ -0,0 +1,407 @@
.. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

***********************************************************
LLM inference performance validation on AMD Instinct MI300X
***********************************************************

.. _vllm-benchmark-unified-docker:

The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment designed for validating large language model
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
MI300X accelerator and includes the following components:

* `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_
* `vLLM 0.6.4 <https://docs.vllm.ai/en/latest>`_
* `PyTorch 2.5.0 <https://github.com/pytorch/pytorch>`_
* Tuning files (in CSV format)

With this Docker image, you can quickly validate the expected inference
performance numbers on the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models.

.. hlist::
   :columns: 6

   * Llama 3.1 8B
   * Llama 3.1 70B
   * Llama 3.1 405B
   * Llama 2 7B
   * Llama 2 70B
   * Mixtral 8x7B
   * Mixtral 8x22B
   * Mistral 7B
   * Qwen2 7B
   * Qwen2 72B
   * JAIS 13B
   * JAIS 30B

.. _vllm-benchmark-vllm:

.. note::

   vLLM is a toolkit and library for LLM inference and serving. AMD implements
   high-performance custom kernels and modules in vLLM to enhance performance.
   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.

Getting started
===============

Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.

.. _vllm-benchmark-get-started:

1. Disable NUMA auto-balancing.

   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

   .. code-block:: shell

      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0

2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.

   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4

Once setup is complete, you can choose between two options to reproduce the
benchmark results:

- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`

.. _vllm-benchmark-mad:

MAD-integrated benchmarking
===========================

Clone the ROCm Model Automation and Dashboarding (MAD) repository from
`<https://github.com/ROCm/MAD>`__ to a local directory and install the
required packages on the host machine.

.. code-block:: shell

   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt

Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with the ``float16`` data type on the host machine.

.. code-block:: shell

   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800

ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.

Although the following models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.

Available models
----------------

.. hlist::
   :columns: 3

   * ``pyt_vllm_llama-3.1-8b``
   * ``pyt_vllm_llama-3.1-70b``
   * ``pyt_vllm_llama-3.1-405b``
   * ``pyt_vllm_llama-2-7b``
   * ``pyt_vllm_llama-2-70b``
   * ``pyt_vllm_mixtral-8x7b``
   * ``pyt_vllm_mixtral-8x22b``
   * ``pyt_vllm_mistral-7b``
   * ``pyt_vllm_qwen2-7b``
   * ``pyt_vllm_qwen2-72b``
   * ``pyt_vllm_jais-13b``
   * ``pyt_vllm_jais-30b``
   * ``pyt_vllm_llama-3.1-8b_fp8``
   * ``pyt_vllm_llama-3.1-70b_fp8``
   * ``pyt_vllm_llama-3.1-405b_fp8``
   * ``pyt_vllm_mixtral-8x7b_fp8``
   * ``pyt_vllm_mixtral-8x22b_fp8``
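To sweep several of these preconfigured tags in one run, a small wrapper can
loop over them. The following is a convenience sketch, not part of MAD itself:
the tags, flags, and ``MAD_SECRETS_HFTOKEN`` variable come from this guide,
while the wrapper itself is illustrative.

.. code-block:: python

   import os
   import subprocess

   # Tags to benchmark, taken from the "Available models" list above.
   TAGS = [
       "pyt_vllm_llama-3.1-8b",
       "pyt_vllm_qwen2-7b",
       "pyt_vllm_mistral-7b",
   ]

   # run_models.py expects the Hugging Face token for gated models.
   assert os.environ.get("MAD_SECRETS_HFTOKEN"), "set MAD_SECRETS_HFTOKEN first"

   for tag in TAGS:
       # Same invocation as the single-model example above, one tag at a time.
       subprocess.run(
           ["python3", "tools/run_models.py", "--tags", tag,
            "--keep-model-dir", "--live-output", "--timeout", "28800"],
           check=True,
       )
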
.. _vllm-benchmark-standalone:

Standalone benchmarking
=======================

You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.

.. code-block:: shell

   docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.4 rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4

In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.

.. code-block:: shell

   git clone https://github.com/ROCm/MAD
   cd MAD/scripts/vllm

Command
-------

To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype

See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.

.. note::

   The input sequence length, output sequence length, and tensor parallel (TP)
   size are already configured. You don't need to specify them with this script.

.. note::

   If you encounter the following error, provide a Hugging Face token that has
   access to the gated models.

   .. code-block:: shell

      OSError: You are trying to access a gated repo.

      # pass your HF_TOKEN
      export HF_TOKEN=$your_personal_hf_token

.. _vllm-benchmark-standalone-options:

Options
-------

.. list-table::
   :header-rows: 1
   :align: center

   * - Name
     - Options
     - Description

   * - ``$test_option``
     - latency
     - Measure decoding token latency

   * -
     - throughput
     - Measure token generation throughput

   * -
     - all
     - Measure both throughput and latency

   * - ``$model_repo``
     - ``meta-llama/Meta-Llama-3.1-8B-Instruct``
     - Llama 3.1 8B

   * - (``float16``)
     - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
     - Llama 3.1 70B

   * -
     - ``meta-llama/Meta-Llama-3.1-405B-Instruct``
     - Llama 3.1 405B

   * -
     - ``meta-llama/Llama-2-7b-chat-hf``
     - Llama 2 7B

   * -
     - ``meta-llama/Llama-2-70b-chat-hf``
     - Llama 2 70B

   * -
     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
     - Mixtral 8x7B

   * -
     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
     - Mixtral 8x22B

   * -
     - ``mistralai/Mistral-7B-Instruct-v0.3``
     - Mistral 7B

   * -
     - ``Qwen/Qwen2-7B-Instruct``
     - Qwen2 7B

   * -
     - ``Qwen/Qwen2-72B-Instruct``
     - Qwen2 72B

   * -
     - ``core42/jais-13b-chat``
     - JAIS 13B

   * -
     - ``core42/jais-30b-chat-v3``
     - JAIS 30B

   * - ``$model_repo``
     - ``amd/Meta-Llama-3.1-8B-Instruct-FP8-KV``
     - Llama 3.1 8B

   * - (``float8``)
     - ``amd/Meta-Llama-3.1-70B-Instruct-FP8-KV``
     - Llama 3.1 70B

   * -
     - ``amd/Meta-Llama-3.1-405B-Instruct-FP8-KV``
     - Llama 3.1 405B

   * -
     - ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
     - Mixtral 8x7B

   * -
     - ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
     - Mixtral 8x22B

   * - ``$num_gpu``
     - 1 or 8
     - Number of GPUs

   * - ``$datatype``
     - ``float16`` or ``float8``
     - Data type

.. _vllm-benchmark-run-benchmark:

Running the benchmark on the MI300X accelerator
-----------------------------------------------

Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.

Example 1: latency benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
   ./vllm_benchmark_report.sh -s latency -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8

Find the latency reports at:

- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_latency_report.csv``

Example 2: throughput benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
   ./vllm_benchmark_report.sh -s throughput -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8

Find the throughput reports at:

- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_throughput_report.csv``

.. raw:: html

   <style>
   mjx-container[jax="CHTML"][display="true"] {
     text-align: left;
     margin: 0;
   }
   </style>

.. note::

   Throughput is calculated as:

   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time

   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time

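As a concrete reading of these formulas, here is a worked instance (the
numbers are illustrative only, not measured results):

.. code-block:: python

   requests = 64
   input_len = 1024      # input tokens per request
   output_len = 128      # generated tokens per request
   elapsed_time = 40.0   # seconds

   throughput_tot = requests * (input_len + output_len) / elapsed_time
   throughput_gen = requests * output_len / elapsed_time

   print(f"total tokens/s:     {throughput_tot:.1f}")   # 1843.2
   print(f"generated tokens/s: {throughput_gen:.1f}")   # 204.8
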
Further reading
===============

- For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.

- To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.

- To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.

- To learn how to optimize inference on LLMs, see
  :doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.

- For a list of other ready-made Docker images for ROCm, see the
  :doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.

- To compare with the previous version of the ROCm vLLM Docker image for performance validation, refer to
  `LLM inference performance validation on AMD Instinct MI300X (ROCm 6.2.0) <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_.
@@ -8,6 +8,8 @@ accelerators. They include detailed instructions on system settings and
 application tuning suggestions to help you fully leverage the capabilities of
 these accelerators, thereby achieving optimal performance.
 
+* :doc:`/how-to/performance-validation/mi300x/vllm-benchmark`
+
 * :doc:`/how-to/tuning-guides/mi300x/system`
 
 * :doc:`/how-to/tuning-guides/mi300x/workload`
@@ -152,9 +152,7 @@ address any new bottlenecks that may emerge.
 
 ROCm provides a prebuilt optimized Docker image that has everything required to implement
 the tips in this section. It includes ROCm, vLLM, PyTorch, and tuning files in the CSV
-format. For more information, see the guide to
-`LLM inference performance validation with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_
-on the ROCm GitHub repository.
+format. For more information, see :doc:`/how-to/performance-validation/mi300x/vllm-benchmark`.
 
 .. _mi300x-profiling-tools:
 
@@ -378,11 +376,10 @@ Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.
 for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
 usage with ROCm.
 
-ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
-on the MI300X accelerator. The Docker image includes ROCm, vLLM, PyTorch, and tuning files in the CSV
-format. For more information, see the guide to
-`LLM inference performance validation with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_
-on the ROCm GitHub repository.
+ROCm provides a prebuilt optimized Docker image for validating the performance
+of LLM inference with vLLM on the MI300X accelerator. The Docker image includes
+ROCm, vLLM, PyTorch, and tuning files in the CSV format. For more information,
+see :doc:`/how-to/performance-validation/mi300x/vllm-benchmark`.
 
 Maximize throughput
 -------------------
@@ -45,7 +45,7 @@ ROCm documentation is organized into the following categories:
 * [Using ROCm for HPC](./how-to/rocm-for-hpc/index.rst)
 * [Fine-tuning LLMs and inference optimization](./how-to/llm-fine-tuning-optimization/index.rst)
 * [System optimization](./how-to/system-optimization/index.rst)
-* [AMD Instinct MI300X tuning guides](./how-to/tuning-guides/mi300x/index.rst)
+* [AMD Instinct MI300X performance validation and tuning](./how-to/tuning-guides/mi300x/index.rst)
 * [GPU cluster networking](https://rocm.docs.amd.com/projects/gpu-cluster-networking/en/latest/index.html)
 * [System debugging](./how-to/system-debugging.md)
 * [Using MPI](./how-to/gpu-enabled-mpi.rst)
@@ -70,9 +70,11 @@ subtrees:
         - file: how-to/system-optimization/w6000-v620.md
           title: AMD RDNA 2
         - file: how-to/tuning-guides/mi300x/index.rst
-          title: AMD MI300X tuning guides
+          title: AMD MI300X performance validation and tuning
           subtrees:
           - entries:
+            - file: how-to/performance-validation/mi300x/vllm-benchmark.rst
+              title: Performance validation
             - file: how-to/tuning-guides/mi300x/system.rst
               title: System tuning
             - file: how-to/tuning-guides/mi300x/workload.rst