From 4132a2609c49ba2ef735539d816dcaa87b1ccae7 Mon Sep 17 00:00:00 2001
From: Kristoffer <kristoffer.torp@amd.com>
Date: Mon, 27 Oct 2025 14:56:55 +0100
Subject: [PATCH] Add xdit diffusion docs (#5576)

* Add xdit video diffusion base page.

* Update supported accelerators.

* Remove dependency on mad-tags.

* Update docker pull section.

* Update container launch instructions.

* Improve launch instruction options and layout.

* Add benchmark result outputs.

* Fix wrong HunyuanVideo path

* Finalize instructions.

* Consistent title.

* Make page and side-bar titles the same.

* Updated wordlist. Removed note container reg HF.

* Remove fp8_gemms in command and add release notes.

* Update accelerators naming.

* Add note regarding OOB performance.

* Fix admonition box.

* Overall fixes.
---
 .wordlist.txt                                 |   6 +
 docs/conf.py                                  |   1 +
 .../inference/xdit-inference-models.yaml      |  38 +++
 docs/how-to/rocm-for-ai/inference/index.rst   |   4 +-
 .../inference/xdit-video-diffusion.rst        | 322 ++++++++++++++++++
 docs/sphinx/_toc.yml.in                       |   2 +
 6 files changed, 372 insertions(+), 1 deletion(-)
 create mode 100644 docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
 create mode 100644 docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst

diff --git a/.wordlist.txt b/.wordlist.txt
index aed9dc1cc..68185fbe9 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -220,6 +220,7 @@ href
 Hyperparameters
 HybridEngine
 Huggingface
+Hunyuan
 IB
 ICD
 ICT
@@ -531,6 +532,7 @@ UAC
 UC
 UCC
 UCX
+ud
 UE
 UIF
 UMC
@@ -842,6 +844,7 @@ pallas
 parallelization
 parallelizing
 param
+params
 parameterization
 passthrough
 pe
@@ -888,6 +891,7 @@ querySelectorAll
 queueing
 qwen
 radeon
+rc
 rccl
 rdc
 rdma
@@ -1052,6 +1056,8 @@ writebacks
 wrreq
 wzo
 xargs
+xdit
+xDiT
 xGMI
 xPacked
 xz
diff --git a/docs/conf.py b/docs/conf.py
index 5a6298e04..85c6863ba 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -175,6 +175,7 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/xdit-video-diffusion", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
 
     {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
diff --git a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
new file mode 100644
index 000000000..60f52aae7
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
@@ -0,0 +1,38 @@
+xdit_video_diffusion:
+  docker:
+    pull_tag: amdsiloai/pytorch-xdit:v25.9
+    docker_hub_url: https://hub.docker.com/r/amdsiloai/pytorch-xdit
+    ROCm: 7.0.0rc
+    components:
+      TheRock: 7afbe45
+      rccl: 9b04b2a
+      composable_kernel: b7a806f
+      rocm-libraries: f104555
+      rocm-systems: 25922d0
+      torch: 2.10.0a0+git3caf6da
+      torchvision: 0.22.0a0+966da7e
+      triton: 3.5.0+gitea06d636
+       
+  model_groups:
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          model_name: hunyuanvideo
+          model_repo: tencent/HunyuanVideo
+          revision: refs/pr/18
+          url: https://huggingface.co/tencent/HunyuanVideo
+          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
+    - group: Wan-AI
+      tag: wan
+      models:
+        - model: Wan2.1
+          model_name: wan2.1_i2v_14b_720p
+          model_repo: Wan-AI/Wan2.1-I2V-14B-720P
+          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
+          github: https://github.com/Wan-Video/Wan2.1
+        - model: Wan2.2
+          model_name: wan2.2-i2v-a14b
+          model_repo: Wan-AI/Wan2.2-I2V-A14B
+          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
+          github: https://github.com/Wan-Video/Wan2.2
\ No newline at end of file
diff --git a/docs/how-to/rocm-for-ai/inference/index.rst b/docs/how-to/rocm-for-ai/inference/index.rst
index 6eb705141..4f66fd82f 100644
--- a/docs/how-to/rocm-for-ai/inference/index.rst
+++ b/docs/how-to/rocm-for-ai/inference/index.rst
@@ -26,4 +26,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
 
 - :doc:`SGLang inference performance testing <benchmark-docker/sglang>`
 
-- :doc:`Deploying your model <deploy-your-model>`
+- :doc:`xDiT video inference <xdit-video-diffusion>`
+
+- :doc:`Deploying your model <deploy-your-model>`
\ No newline at end of file
diff --git a/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst b/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst
new file mode 100644
index 000000000..af98cc187
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst
@@ -0,0 +1,322 @@
+.. meta::
+   :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
+                 prebuilt and optimized docker images.
+   :keywords: xDiT, diffusion, video, video generation, validate, benchmark
+
+********************
+xDiT video inference
+********************
+
+.. _xdit-video-diffusion:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_video_diffusion.docker %}
+   {% set model_groups = data.xdit_video_diffusion.model_groups%}
+
+   The `amdsiloai/pytorch-xdit Docker <{{ docker.docker_hub_url }}>`_ image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
+   benchmarking diffusion model video generation on
+   AMD Instinct™ MI355X, MI350X (gfx950), and MI300X (gfx942) GPUs.
+   The image runs ROCm ``{{ docker.ROCm }}`` based on `TheRock <https://github.com/ROCm/TheRock>`_
+   and includes the following components:
+
+   .. tab-set::
+
+      .. tab-item:: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+
+What's new
+==========
+
+- Initial release
+- ROCm: 7.0.0rc
+- Added support for AMD Instinct™ MI355X, MI350X (gfx950), and MI300X (gfx942) GPUs.
+- Added support for Wan 2.1, Wan 2.2 and Hunyuan Video models with MIOpen optimizations.
+
+.. _xdit-video-diffusion-supported-models:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_video_diffusion.docker %}
+   {% set model_groups = data.xdit_video_diffusion.model_groups%}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+          <div class="row gx-0">
+              <div class="col-2 me-1 px-2 model-param-head">Model</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in model_groups %}
+                  <div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+        {% endfor %}
+              </div>
+          </div>
+
+          <div class="row gx-0 pt-1">
+              <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in model_groups %}
+            {% set models = model_group.models %}
+            {% for model in models %}
+                {% if models|length % 3 == 0 %}
+                <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.model_name }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                {% else %}
+                <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.model_name }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                {% endif %}
+            {% endfor %}
+        {% endfor %}
+              </div>
+          </div>
+      </div>
+
+   {% for model_group in model_groups %}
+       {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.model_name}}
+
+      .. note::
+
+         To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
+         or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
+         external license agreement through a third party.
+
+       {% endfor %}
+   {% endfor %}
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_video_diffusion.docker %}
+
+   For this tutorial, it's recommended to use the ``{{ docker.pull_tag }}`` Docker image.
+   Pull the image using the following command:
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+Validate and benchmark
+======================
+
+Once the image has been downloaded, follow these steps to
+run benchmarks and generate a video.
+
+.. warning::
+
+   If your host/OS ROCm installation is below 6.4.2 (check with ``apt show rocm-libs``), you need to export
+   the ``HSA_NO_SCRATCH_RECLAIM=1`` environment variable inside the container, or the workload will crash.
+   If possible, ask your system administrator to upgrade ROCm.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% for model_group in data.xdit_video_diffusion.model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.model_name}}
+
+      The following commands are written for {{ model.model }}.
+      See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
+
+     {% endfor %}
+   {% endfor %}
+
+Choose your setup method
+------------------------
+
+You can either use an existing Hugging Face cache or download the model fresh inside the container.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_video_diffusion.docker %}
+   {% set model_groups = data.xdit_video_diffusion.model_groups%}
+
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+   .. container:: model-doc {{model.model_name}}
+
+      .. tab-set::
+
+         .. tab-item:: Option 1: Use existing Hugging Face cache
+
+            If you already have models downloaded on your host system, you can mount your existing cache.
+
+            1. Set your Hugging Face cache location.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/your/hf_cache/location
+
+            2. Download the model (if not already cached).
+
+               .. code-block:: shell
+
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+            3. Launch the container with mounted cache.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      -e HF_HOME=/app/huggingface_models \
+                      -v $HF_HOME:/app/huggingface_models \
+                      {{ docker.pull_tag }}
+
+         .. tab-item:: Option 2: Download inside container
+
+            Use this option if you prefer to keep the container self-contained or don't have an existing cache.
+
+            1. Launch the container.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      {{ docker.pull_tag }}
+
+            2. Inside the container, set the Hugging Face cache location and download the model.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/your/hf_cache/location
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+               .. warning::
+
+                  Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
+     {% endfor %}
+   {% endfor %}
+
+Run inference
+=============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set model_groups = data.xdit_video_diffusion.model_groups%}
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.model_name }}
+
+      To run the benchmarks for {{ model.model }}, use the following command:
+
+      .. code-block:: shell
+       {% if model.model == "Hunyuan Video" %}
+         cd /app/Hunyuanvideo
+         mkdir results
+
+         torchrun --nproc_per_node=8 run.py \
+             --model tencent/HunyuanVideo \
+             --prompt "In the large cage, two puppies were wagging their tails at each other." \
+             --height 720 --width 1280 --num_frames 129 \
+             --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
+             --ulysses_degree 8 \
+             --enable_tiling --enable_slicing \
+             --use_torch_compile \
+             --bench_output results
+       {% endif %}
+       {% if model.model == "Wan2.1" %}
+         cd /app/Wan2.1
+         mkdir results
+
+         torchrun --nproc_per_node=8 run.py \
+             --task i2v-14B \
+             --size 720*1280 --frame_num 81 \
+             --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
+             --image "/app/Wan2.1/examples/i2v_input.JPG" \
+             --ulysses_size 8 --ring_size 1 \
+             --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+             --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+             --offload_model 0 \
+             --vae_dtype bfloat16
+       {% endif %}
+       {% if model.model == "Wan2.2" %}
+         cd /app/Wan2.2
+         mkdir results
+
+         torchrun --nproc_per_node=8 run.py \
+             --task i2v-A14B \
+             --size 720*1280 --frame_num 81 \
+             --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
+             --image "/app/Wan2.2/examples/i2v_input.JPG" \
+             --ulysses_size 8 --ring_size 1 \
+             --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+             --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+             --offload_model 0 \
+             --vae_dtype bfloat16
+       {% endif %}
+
+      {% if model.model in ["Wan2.1", "Wan2.2"] %}
+      For additional performance improvements, consider adding the ``--compile`` flag to the above command. Note that this can significantly increase startup time on the first call.
+      {% endif %}
+
+      The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``.{% endif %}
+
+      {% endfor %}
+    {% endfor %}
+
+Known limitations
+=================
+
+- OOB tuning: Currently, only the Instinct MI300X has been tuned within the gfx942
+  series. Other gfx942 GPUs might not perform optimally out-of-the-box.
\ No newline at end of file
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index a0a5084ff..253f4416f 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -117,6 +117,8 @@ subtrees:
             title: SGLang inference performance testing
           - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
             title: SGLang distributed inference with Mooncake
+          - file: how-to/rocm-for-ai/inference/xdit-video-diffusion.rst
+            title: xDiT video inference
           - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
             title: Deploy your model