From c3faa9670b52d06705a21be07a4dd55d93e7745b Mon Sep 17 00:00:00 2001 From: Peter Park Date: Wed, 23 Apr 2025 17:35:52 -0400 Subject: [PATCH] Add PyTorch inference benchmark Docker guide (+ CLIP and Chai-1) (#4654) * update vLLM links in deploy-your-model.rst * add pytorch inference benchmark doc * update toc and vLLM title * remove previous versions * update * wording * fix link and "applies to" * add pytorch to wordlist * add tunableop note to clip * make tunableop note appear to all models * Update docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * fix incorrect links * wording * fix wrong docker pull tag --------- Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> --- .wordlist.txt | 1 + docs/conf.py | 1 + .../pytorch-inference-benchmark-models.yaml | 25 +++ .../inference/deploy-your-model.rst | 7 +- .../inference/pytorch-inference-benchmark.rst | 163 ++++++++++++++++++ .../rocm-for-ai/inference/vllm-benchmark.rst | 8 +- docs/sphinx/_toc.yml.in | 4 +- 7 files changed, 200 insertions(+), 9 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst diff --git a/.wordlist.txt b/.wordlist.txt index ae120d2e9..71ffb2870 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -752,6 +752,7 @@ profilers protobuf pseudorandom py +pytorch recommender recommenders quantile diff --git a/docs/conf.py b/docs/conf.py index df15f7a26..d04a9796b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,6 +70,7 @@ article_pages = [ {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/pytorch-inference-benchmark", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml new file mode 100644 index 000000000..65a8914c3 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml @@ -0,0 +1,25 @@ +pytorch_inference_benchmark: + unified_docker: + latest: &rocm-pytorch-docker-latest + pull_tag: rocm/pytorch:latest + docker_hub_url: + rocm_version: + pytorch_version: + hipblaslt_version: + model_groups: + - group: CLIP + tag: clip + models: + - model: CLIP + mad_tag: pyt_clip_inference + model_repo: laion/CLIP-ViT-B-32-laion2B-s34B-b79K + url: https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K + precision: float16 + - group: Chai-1 + tag: chai + models: + - model: Chai-1 + mad_tag: pyt_chai1_inference + model_repo: meta-llama/Llama-3.1-8B-Instruct + url: 
https://huggingface.co/chaidiscovery/chai-1
+          precision: float16
diff --git a/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst b/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
index a820739b3..fc5bc7732 100644
--- a/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
+++ b/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
@@ -16,8 +16,7 @@ ROCm supports vLLM and Hugging Face TGI as major LLM-serving frameworks.
 
 Serving using vLLM
 ==================
 
-vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM officially supports ROCm versions 5.7 and
-6.0. AMD is actively working with the vLLM team to improve performance and support later ROCm versions.
+vLLM is a fast and easy-to-use library for LLM inference and serving. AMD is actively working with the vLLM team to improve performance and support the latest ROCm versions.
 
 See the `GitHub repository `_ and `official vLLM documentation `_ for more information.
 
@@ -31,9 +30,9 @@ vLLM installation
 vLLM supports two ROCm-capable installation methods. Refer to the official documentation using the following links.
 
 - `Build from source with Docker
-  `_ (recommended)
+  `_ (recommended)
 
-- `Build from source `_
+- `Build from source `_
 
 vLLM walkthrough
 ----------------
diff --git a/docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst b/docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
new file mode 100644
index 000000000..3cf8bca03
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
@@ -0,0 +1,163 @@
+.. meta::
+  :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                ROCm PyTorch Docker image.
+  :keywords: model, MAD, automation, dashboarding, validate, pytorch
+
+*************************************
+PyTorch inference performance testing
+*************************************
+
+.. _pytorch-inference-benchmark-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
+
+   {% set unified_docker = data.pytorch_inference_benchmark.unified_docker.latest %}
+   {% set model_groups = data.pytorch_inference_benchmark.model_groups %}
+
+   The `ROCm PyTorch Docker `_ image offers a prebuilt,
+   optimized environment for testing model inference performance on AMD Instinct™ MI300X series
+   accelerators. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD)
+   tool with the ROCm PyTorch container to test inference performance on various models efficiently.
+
+   .. _pytorch-inference-benchmark-available-models:
+
+   Supported models
+   ================
+
+   .. raw:: html
+
+      <div class="model-select-container">
+         <label for="model-group-select">Model</label>
+         <select id="model-group-select">
+            {% for model_group in model_groups %}
+            <option value="{{ model_group.tag }}">{{ model_group.group }}</option>
+            {% endfor %}
+         </select>
+      </div>
+
+   {% for model_group in model_groups %}
+   {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization before use via an external license agreement through a third party.
+
+   {% endfor %}
+   {% endfor %}
+
+   Getting started
+   ===============
+
+   Use the following procedures to reproduce the benchmark results on an
+   MI300X series accelerator with the prebuilt PyTorch Docker image.
+
+   .. _pytorch-benchmark-get-started:
+
+   1. Disable NUMA auto-balancing.
+
+      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+      might hang until the periodic balancing is finalized. For more information,
+      see :ref:`AMD Instinct MI300X system optimization `.
+
+      .. code-block:: shell
+
+         # disable automatic NUMA balancing
+         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+         # check if NUMA balancing is disabled (returns 0 if disabled)
+         cat /proc/sys/kernel/numa_balancing
+         0
+
+   .. container:: model-doc pyt_chai1_inference
+
+      2. Use the following command to pull the `ROCm PyTorch Docker image `_ from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
+
+   .. container:: model-doc pyt_clip_inference
+
+      2. Use the following command to pull the `ROCm PyTorch Docker image `_ from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull rocm/pytorch:latest
+
+   Benchmarking
+   ============
+
+   .. _pytorch-inference-benchmark-mad:
+
+   {% for model_group in model_groups %}
+   {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      To simplify performance testing, the ROCm Model Automation and Dashboarding
+      (`MAD <https://github.com/ROCm/MAD>`__) project provides ready-to-use scripts and configuration.
+      To start, clone the MAD repository to a local directory and install the required packages on the
+      host machine.
+
+      .. code-block:: shell
+
+         git clone https://github.com/ROCm/MAD
+         cd MAD
+         pip install -r requirements.txt
+
+      Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+      using one GPU with the ``{{model.precision}}`` data type on the host machine.
+
+      .. code-block:: shell
+
+         export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+         python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
+
+      MAD launches a Docker container with the name
+      ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+      model are collected in ``perf.csv``.
+
+      .. note::
+
+         For improved performance, consider enabling TunableOp. By default,
+         ``{{model.mad_tag}}`` runs with TunableOp disabled (see
+         ``__). To enable
+         it, edit the default run behavior in ``tools/run_models.py`` -- update the model's
+         run ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
+
+         Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
+         Although this might increase the initial run time, it can result in a performance gain.
+
+   {% endfor %}
+   {% endfor %}
+
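+   .. note::
+
+      If you run a model manually inside the container rather than through MAD's ``--tunableop``
+      flag, TunableOp can also be toggled with PyTorch's standard environment variables. The
+      following is a minimal sketch that assumes a manual run inside the container; the results
+      file name and script name are placeholder examples.
+
+      .. code-block:: shell
+
+         # Enable TunableOp and record tuned GEMM solutions so later runs can reuse them
+         export PYTORCH_TUNABLEOP_ENABLED=1
+         export PYTORCH_TUNABLEOP_TUNING=1
+         export PYTORCH_TUNABLEOP_FILENAME=tunableop_results.csv
+
+         # Launch the inference workload as usual, for example:
+         python3 run_inference.py
+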
+Further reading
+===============
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X accelerators, see `AMD Instinct MI300X system optimization `_.
+
+- To learn how to run LLM models from Hugging Face or your own model, see
+  :doc:`Running models from Hugging Face <hugging-face-models>`.
+
+- To learn how to optimize inference on LLMs, see
+  :doc:`Inference optimization <../inference-optimization/index>`.
+
+- To learn how to fine-tune LLMs, see
+  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.
diff --git a/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst b/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst
index 5b4fa0476..437a50eb8 100644
--- a/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst
+++ b/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst
@@ -3,9 +3,9 @@
       ROCm vLLM Docker image.
    :keywords: model, MAD, automation, dashboarding, validate
 
-********************************************************
-LLM inference performance testing on AMD Instinct MI300X
-********************************************************
+**********************************
+vLLM inference performance testing
+**********************************
 
 .. _vllm-benchmark-unified-docker:
 
@@ -16,7 +16,7 @@
 
    The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers a prebuilt,
    optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X series accelerator. This ROCm vLLM
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
    Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
    accelerators and includes the following components:
 
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index a3c6e2db9..979f510b4 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -75,7 +75,9 @@ subtrees:
         - file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
           title: LLM inference frameworks
         - file: how-to/rocm-for-ai/inference/vllm-benchmark.rst
-          title: Performance testing
+          title: vLLM inference performance testing
+        - file: how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
+          title: PyTorch inference performance testing
         - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
           title: Deploy your model
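For interactive debugging outside of MAD's managed runs, the ``rocm/pytorch`` image pulled in the
guide above can also be started by hand. The following is a minimal sketch that assumes the standard
ROCm container flags; the ``latest`` tag and container name are placeholder examples, so substitute
the tag pulled for your model.

.. code-block:: shell

   # Start the ROCm PyTorch container with GPU device access and a larger shared memory segment
   docker run -it --rm \
     --device=/dev/kfd --device=/dev/dri \
     --group-add video \
     --ipc=host --shm-size 8G \
     --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
     --name pytorch-inference-bench \
     rocm/pytorch:latest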