From c3faa9670b52d06705a21be07a4dd55d93e7745b Mon Sep 17 00:00:00 2001 From: Peter Park Date: Wed, 23 Apr 2025 17:35:52 -0400 Subject: [PATCH] Add PyTorch inference benchmark Docker guide (+ CLIP and Chai-1) (#4654) * update vLLM links in deploy-your-model.rst * add pytorch inference benchmark doc * update toc and vLLM title * remove previous versions * update * wording * fix link and "applies to" * add pytorch to wordlist * add tunableop note to clip * make tunableop note appear to all models * Update docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * fix incorrect links * wording * fix wrong docker pull tag --------- Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> --- .wordlist.txt | 1 + docs/conf.py | 1 + .../pytorch-inference-benchmark-models.yaml | 25 +++ .../inference/deploy-your-model.rst | 7 +- .../inference/pytorch-inference-benchmark.rst | 163 ++++++++++++++++++ .../rocm-for-ai/inference/vllm-benchmark.rst | 8 +- docs/sphinx/_toc.yml.in | 4 +- 7 files changed, 200 insertions(+), 9 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst diff --git a/.wordlist.txt b/.wordlist.txt index ae120d2e9..71ffb2870 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -752,6 +752,7 @@ profilers protobuf pseudorandom py +pytorch recommender recommenders quantile diff --git a/docs/conf.py b/docs/conf.py index df15f7a26..d04a9796b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,6 +70,7 @@ article_pages = [ {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/pytorch-inference-benchmark", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml new file mode 100644 index 000000000..65a8914c3 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml @@ -0,0 +1,25 @@ +pytorch_inference_benchmark: + unified_docker: + latest: &rocm-pytorch-docker-latest + pull_tag: rocm/pytorch:latest + docker_hub_url: + rocm_version: + pytorch_version: + hipblaslt_version: + model_groups: + - group: CLIP + tag: clip + models: + - model: CLIP + mad_tag: pyt_clip_inference + model_repo: laion/CLIP-ViT-B-32-laion2B-s34B-b79K + url: https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K + precision: float16 + - group: Chai-1 + tag: chai + models: + - model: Chai-1 + mad_tag: pyt_chai1_inference + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: 
https://huggingface.co/chaidiscovery/chai-1
+          precision: float16
diff --git a/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst b/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
index a820739b3..fc5bc7732 100644
--- a/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
+++ b/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
@@ -16,8 +16,7 @@ ROCm supports vLLM and Hugging Face TGI as major LLM-serving frameworks.
 
 Serving using vLLM
 ==================
 
-vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM officially supports ROCm versions 5.7 and
-6.0. AMD is actively working with the vLLM team to improve performance and support later ROCm versions.
+vLLM is a fast and easy-to-use library for LLM inference and serving. AMD is actively working with the vLLM team to improve performance and support the latest ROCm versions.
 
 See the `GitHub repository `_ and `official vLLM documentation `_ for more information.
 
@@ -31,9 +30,9 @@ vLLM installation
 vLLM supports two ROCm-capable installation methods. Refer to the official documentation using the following links.
 
 - `Build from source with Docker
-  `_ (recommended)
+  `_ (recommended)
 
-- `Build from source `_
+- `Build from source `_
 
 vLLM walkthrough
 ----------------
diff --git a/docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst b/docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
new file mode 100644
index 000000000..3cf8bca03
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
@@ -0,0 +1,163 @@
+.. meta::
+  :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                ROCm PyTorch Docker image.
+  :keywords: model, MAD, automation, dashboarding, validate, pytorch
+
+*************************************
+PyTorch inference performance testing
+*************************************
+
+.. _pytorch-inference-benchmark-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
+
+   {% set unified_docker = data.pytorch_inference_benchmark.unified_docker.latest %}
+   {% set model_groups = data.pytorch_inference_benchmark.model_groups %}
+
+   The `ROCm PyTorch Docker `_ image offers a prebuilt,
+   optimized environment for testing model inference performance on AMD Instinct™ MI300X series
+   accelerators. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD)
+   tool with the ROCm PyTorch container to test inference performance on various models efficiently.
+
+   .. _pytorch-inference-benchmark-available-models:
+
+   Supported models
+   ================
+
+   .. raw:: html
+
+      <div class="model-select-container">
+         <label for="model-group-select">Model</label>
+         <select id="model-group-select">
+            {% for model_group in model_groups %}
+            <option value="{{ model_group.tag }}">{{ model_group.group }}</option>
+            {% endfor %}
+         </select>
+      </div>
+
+   {% for model_group in model_groups %}
+   {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization before use via an external license agreement through a third party.
+
+   {% endfor %}
+   {% endfor %}
+
+   Getting started
+   ===============
+
+   Use the following procedures to reproduce the benchmark results on an
+   MI300X series accelerator with the prebuilt PyTorch Docker image.
+
+   .. _pytorch-benchmark-get-started:
+
+   1. Disable NUMA auto-balancing.
+
+      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+      might hang until the periodic balancing is finalized. For more information,
+      see :ref:`AMD Instinct MI300X system optimization `.
+
+      .. code-block:: shell
+
+         # disable automatic NUMA balancing
+         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+         # check if NUMA balancing is disabled (returns 0 if disabled)
+         cat /proc/sys/kernel/numa_balancing
+         0
+
+   .. container:: model-doc pyt_chai1_inference
+
+      2. Use the following command to pull the `ROCm PyTorch Docker image `_ from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
+
+   .. container:: model-doc pyt_clip_inference
+
+      2. Use the following command to pull the `ROCm PyTorch Docker image `_ from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull rocm/pytorch:latest
+
+   Benchmarking
+   ============
+
+   .. _pytorch-inference-benchmark-mad:
+
+   {% for model_group in model_groups %}
+   {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      To simplify performance testing, the ROCm Model Automation and Dashboarding
+      (`MAD <https://github.com/ROCm/MAD>`__) project provides ready-to-use scripts and configuration.
+      To start, clone the MAD repository to a local directory and install the required packages on the
+      host machine.
+
+      .. code-block:: shell
+
+         git clone https://github.com/ROCm/MAD
+         cd MAD
+         pip install -r requirements.txt
+
+      Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+      using one GPU with the ``{{model.precision}}`` data type on the host machine.
+
+      .. code-block:: shell
+
+         export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+         python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+
+      MAD launches a Docker container with the name
+      ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+      model are collected in ``perf.csv``.
+
+      .. note::
+
+         For improved performance, consider enabling TunableOp. By default,
+         ``{{model.mad_tag}}`` runs with TunableOp disabled (see
+         ``__). To enable
+         it, edit the default run behavior in ``tools/run_models.py`` -- update the model's
+         run ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+
+         Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
+         Although this might increase the initial run time, it can result in a performance gain.
+
+   {% endfor %}
+   {% endfor %}
+
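+   .. note::
+
+      If you run a model manually inside the container rather than through MAD's ``--tunableop``
+      flag, TunableOp can also be toggled with PyTorch's standard environment variables. The
+      following is a minimal sketch that assumes a manual run inside the container; the results
+      file name and script name are placeholder examples.
+
+      .. code-block:: shell
+
+         # Enable TunableOp and record tuned GEMM solutions so later runs can reuse them
+         export PYTORCH_TUNABLEOP_ENABLED=1
+         export PYTORCH_TUNABLEOP_TUNING=1
+         export PYTORCH_TUNABLEOP_FILENAME=tunableop_results.csv
+
+         # Launch the inference workload as usual, for example:
+         python3 run_inference.py
+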
+Further reading
+===============
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X accelerators, see `AMD Instinct MI300X system optimization `_.
+
+- To learn how to run LLM models from Hugging Face or your own model, see
+  :doc:`Running models from Hugging Face <hugging-face-models>`.
+
+- To learn how to optimize inference on LLMs, see
+  :doc:`Inference optimization <../inference-optimization/index>`.
+
+- To learn how to fine-tune LLMs, see
+  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.
diff --git a/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst b/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst
index 5b4fa0476..437a50eb8 100644
--- a/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst
+++ b/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst
@@ -3,9 +3,9 @@
       ROCm vLLM Docker image.
    :keywords: model, MAD, automation, dashboarding, validate
 
-********************************************************
-LLM inference performance testing on AMD Instinct MI300X
-********************************************************
+**********************************
+vLLM inference performance testing
+**********************************
 
 .. _vllm-benchmark-unified-docker:
 
@@ -16,7 +16,7 @@
 
    The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers a prebuilt,
    optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X series accelerator. This ROCm vLLM
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
    Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
    accelerators and includes the following components:
 
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index a3c6e2db9..979f510b4 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -75,7 +75,9 @@ subtrees:
         - file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
           title: LLM inference frameworks
         - file: how-to/rocm-for-ai/inference/vllm-benchmark.rst
-          title: Performance testing
+          title: vLLM inference performance testing
+        - file: how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
+          title: PyTorch inference performance testing
         - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
           title: Deploy your model
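For interactive debugging outside of MAD's managed runs, the ``rocm/pytorch`` image pulled in the
guide above can also be started by hand. The following is a minimal sketch that assumes the standard
ROCm container flags; the ``latest`` tag and container name are placeholder examples, so substitute
the tag pulled for your model.

.. code-block:: shell

   # Start the ROCm PyTorch container with GPU device access and a larger shared memory segment
   docker run -it --rm \
     --device=/dev/kfd --device=/dev/dri \
     --group-add video \
     --ipc=host --shm-size 8G \
     --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
     --name pytorch-inference-bench \
     rocm/pytorch:latest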