xdit-diffusion v25.11 docs (#5744)

2026-01-07 22:03:58 -05:00 · 2025-12-05 17:09:48 -05:00
parent effd4174fb
commit f2067767e0
9 changed files with 1019 additions and 0 deletions
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -138,6 +138,7 @@ ESXi
 EP
 EoS
 etcd
+equalto
 fas
 FBGEMM
 FiLM
@@ -226,6 +227,8 @@ href
 Hyperparameters
 HybridEngine
 Huggingface
+Hunyuan
+HunyuanVideo
 IB
 ICD
 ICT
@@ -541,6 +544,7 @@ UAC
 UC
 UCC
 UCX
+ud
 UE
 UIF
 UMC
@@ -852,6 +856,7 @@ pallas
 parallelization
 parallelizing
 param
+params
 parameterization
 passthrough
 pe
@@ -898,6 +903,7 @@ querySelectorAll
 queueing
 qwen
 radeon
+rc
 rccl
 rdc
 rdma
@@ -959,6 +965,7 @@ scalability
 scalable
 scipy
 seealso
+selectattr
 selectedTag
 sendmsg
 seqs
@@ -1062,6 +1069,8 @@ writebacks
 wrreq
 wzo
 xargs
+xdit
+xDiT
 xGMI
 xPacked
 xz
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -145,6 +145,7 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},    
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
@@ -0,0 +1,55 @@
+xdit_diffusion_inference:
+  docker:
+    pull_tag: rocm/pytorch-xdit:v25.10
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
+    ROCm: 7.9.0
+    components:
+      TheRock: 7afbe45
+      rccl: 9b04b2a
+      composable_kernel: b7a806f
+      rocm-libraries: f104555
+      rocm-systems: 25922d0
+      torch: 2.10.0a0+gite9c9017
+      torchvision: 0.22.0a0+966da7e
+      triton: 3.5.0+git52e49c12
+      accelerate: 1.11.0.dev0
+      aiter: 0.1.5.post4.dev20+ga25e55e79
+      diffusers: 0.36.0.dev0
+      xfuser: 0.4.4
+      yunchang: 0.6.3.post1
+
+  model_groups:
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          model_name: hunyuanvideo
+          model_repo: tencent/HunyuanVideo
+          revision: refs/pr/18
+          url: https://huggingface.co/tencent/HunyuanVideo
+          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
+          mad_tag: pyt_xdit_hunyuanvideo
+    - group: Wan-AI
+      tag: wan
+      models:
+        - model: Wan2.1
+          model_name: wan2_1-i2v-14b-720p
+          model_repo: Wan-AI/Wan2.1-I2V-14B-720P
+          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
+          github: https://github.com/Wan-Video/Wan2.1
+          mad_tag: pyt_xdit_wan_2_1
+        - model: Wan2.2
+          model_name: wan2_2-i2v-a14b
+          model_repo: Wan-AI/Wan2.2-I2V-A14B
+          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
+          github: https://github.com/Wan-Video/Wan2.2
+          mad_tag: pyt_xdit_wan_2_2
+    - group: FLUX
+      tag: flux
+      models:
+        - model: FLUX.1
+          model_name: FLUX.1-dev
+          model_repo: black-forest-labs/FLUX.1-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+          github: https://github.com/black-forest-labs/flux
+          mad_tag: pyt_xdit_flux
--- a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
@@ -0,0 +1,109 @@
+xdit_diffusion_inference:
+  docker:
+    - version: v25-11
+      pull_tag: rocm/pytorch-xdit:v25.11
+      docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
+      ROCm: 7.10.0
+      supported_models:
+        - group: Hunyuan Video
+          models:
+            - Hunyuan Video
+        - group: Wan-AI
+          models:
+            - Wan2.1
+            - Wan2.2
+        - group: FLUX
+          models:
+            - FLUX.1
+      whats_new:
+        - "Minor bug fixes and clarifications to READMEs."
+        - "Bumps TheRock, AITER, Diffusers, xDiT versions."
+        - "Changes Aiter rounding mode for faster gfx942 FWD Attention."
+      components:
+        TheRock: 3e3f834
+        rccl: d23d18f
+        composable_kernel: 2570462
+        rocm-libraries: 0588f07
+        rocm-systems: 473025a
+        torch: 73adac
+        torchvision: f5c6c2e
+        triton: 7416ffc
+        accelerate: 34c1779
+        aiter: de14bec
+        diffusers: 40528e9
+        xfuser: 83978b5
+        yunchang: 2c9b712
+
+    - version: v25-10
+      pull_tag: rocm/pytorch-xdit:v25.10
+      docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
+      ROCm: 7.9.0
+      supported_models:
+        - group: Hunyuan Video
+          models:
+            - Hunyuan Video
+        - group: Wan-AI
+          models:
+            - Wan2.1
+            - Wan2.2
+        - group: FLUX
+          models:
+            - FLUX.1
+      whats_new:
+        - "First official xDiT Docker Release for Diffusion Inference."
+        - "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
+        - "Support Wan 2.1, Wan 2.2, HunyuanVideo and Flux workloads."
+      components:
+        TheRock: 7afbe45
+        rccl: 9b04b2a
+        composable_kernel: b7a806f
+        rocm-libraries: f104555
+        rocm-systems: 25922d0
+        torch: 2.10.0a0+gite9c9017
+        torchvision: 0.22.0a0+966da7e
+        triton: 3.5.0+git52e49c12
+        accelerate: 1.11.0.dev0
+        aiter: 0.1.5.post4.dev20+ga25e55e79
+        diffusers: 0.36.0.dev0
+        xfuser: 0.4.4
+        yunchang: 0.6.3.post1
+
+  model_groups:
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          page_tag: hunyuan_tag
+          model_name: hunyuanvideo
+          model_repo: tencent/HunyuanVideo
+          revision: refs/pr/18
+          url: https://huggingface.co/tencent/HunyuanVideo
+          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
+          mad_tag: pyt_xdit_hunyuanvideo
+    - group: Wan-AI
+      tag: wan
+      models:
+        - model: Wan2.1
+          page_tag: wan_21_tag
+          model_name: wan2_1-i2v-14b-720p
+          model_repo: Wan-AI/Wan2.1-I2V-14B-720P
+          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
+          github: https://github.com/Wan-Video/Wan2.1
+          mad_tag: pyt_xdit_wan_2_1
+        - model: Wan2.2
+          page_tag: wan_22_tag
+          model_name: wan2_2-i2v-a14b
+          model_repo: Wan-AI/Wan2.2-I2V-A14B
+          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
+          github: https://github.com/Wan-Video/Wan2.2
+          mad_tag: pyt_xdit_wan_2_2
+    - group: FLUX
+      tag: flux
+      models:
+        - model: FLUX.1
+          page_tag: flux_1_tag
+          model_name: FLUX.1-dev
+          model_repo: black-forest-labs/FLUX.1-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+          github: https://github.com/black-forest-labs/flux
+          mad_tag: pyt_xdit_flux
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
@@ -0,0 +1,396 @@
+.. meta::
+   :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
+                 prebuilt and optimized docker images.
+   :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
+
+************************
+xDiT diffusion inference
+************************
+
+.. _xdit-video-diffusion:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
+   a prebuilt, optimized inference environment based on `xDiT
+   <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion-based
+   video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
+   and MI300X (gfx942) GPUs.
+   This image is based on ROCm {{docker.ROCm}} preview release via `TheRock <https://github.com/ROCm/TheRock>`_
+   and includes the following software components:
+
+   .. tab-set::
+
+      .. tab-item:: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
+
+What's new
+==========
+
+- Initial ROCm-enabled xDiT Docker release for diffusion inference.
+- Supported architectures: gfx942 and gfx950 (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X).
+- Supported workloads: Wan 2.1, Wan 2.2, HunyuanVideo, and Flux models.
+
+.. _xdit-video-diffusion-supported-models:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+          <div class="row gx-0">
+              <div class="col-2 me-1 px-2 model-param-head">Model</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in model_groups %}
+                  <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+        {% endfor %}
+              </div>
+          </div>
+
+          <div class="row gx-0 pt-1">
+              <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in model_groups %}
+            {% set models = model_group.models %}
+            {% for model in models %}
+                {% if models|length == 1 %}
+                <div class="col-12 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                {% else %}
+                <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                {% endif %}
+            {% endfor %}
+        {% endfor %}
+              </div>
+          </div>
+      </div>
+
+   {% for model_group in model_groups %}
+       {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.mad_tag }}
+
+      .. note::
+
+         To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
+         or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
+         external license agreement through a third party.
+
+       {% endfor %}
+   {% endfor %}
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA
+auto-balancing, you can skip this step. Otherwise, complete the procedures in
+the `System validation and optimization
+<https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.html>`__
+guide to properly configure your system settings before starting.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker %}
+
+   For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
+   Pull the image using the following command:
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+Validate and benchmark
+======================
+
+Once the image has been downloaded you can follow these steps to
+run benchmarks and generate outputs.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      The following commands are written for {{ model.model }}.
+      See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
+
+     {% endfor %}
+   {% endfor %}
+
+.. _xdit-video-diffusion-setup:
+
+Prepare the model
+-----------------
+
+.. note::
+
+   If you're using ROCm MAD to :ref:`run your model
+   <xdit-video-diffusion-run>`, you can skip this section. MAD will handle
+   starting the container and downloading required models inside the container.
+
+You can either use an existing Hugging Face cache or download the model fresh inside the container.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: Option 1: Use existing Hugging Face cache
+
+            If you already have models downloaded on your host system, you can mount your existing cache.
+
+            1. Set your Hugging Face cache location.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/your/hf_cache/location
+
+            2. Download the model (if not already cached).
+
+               .. code-block:: shell
+
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+            3. Launch the container with mounted cache.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      -e HF_HOME=/app/huggingface_models \
+                      -v $HF_HOME:/app/huggingface_models \
+                      {{ docker.pull_tag }}
+
+         .. tab-item:: Option 2: Download inside container
+
+            If you prefer to keep the container self-contained or don't have an existing cache.
+
+            1. Launch the container
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      {{ docker.pull_tag }}
+
+            2. Inside the container, set the Hugging Face cache location and download the model.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/app/huggingface_models
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+               .. warning::
+
+                  Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
+     {% endfor %}
+   {% endfor %}
+
+.. _xdit-video-diffusion-run:
+
+Run inference
+=============
+
+You can benchmark models through `MAD <https://github.com/ROCm/MAD>`__-integrated automation or standalone
+torchrun commands.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.mad_tag }}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. On the host machine, use this command to run the performance benchmark test on
+               the `{{model.model}} <{{ model.url }}>`_ model using one node.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output
+                     
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+         .. tab-item:: Standalone benchmarking
+
+            To run the benchmarks for {{ model.model }}, use the following command:
+
+            .. code-block:: shell
+            {% if model.model == "Hunyuan Video" %}
+               cd /app/Hunyuanvideo
+               mkdir results
+
+               torchrun --nproc_per_node=8 run.py \
+                  --model tencent/HunyuanVideo \
+                  --prompt "In the large cage, two puppies were wagging their tails at each other." \
+                  --height 720 --width 1280 --num_frames 129 \
+                  --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
+                  --ulysses_degree 8 \
+                  --enable_tiling --enable_slicing \
+                  --use_torch_compile \
+                  --bench_output results
+            {% endif %}
+            {% if model.model == "Wan2.1" %}
+               cd Wan2.1
+               mkdir results
+
+               torchrun --nproc_per_node=8 run.py \
+                  --task i2v-14B \
+                  --size 720*1280 --frame_num 81 \
+                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
+                  --image "/app/Wan2.1/examples/i2v_input.JPG" \
+                  --ulysses_size 8 --ring_size 1 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+                  --offload_model 0 \
+                  --vae_dtype bfloat16 \
+                  --allow_tf32 \
+                  --compile
+            {% endif %}
+            {% if model.model == "Wan2.2" %}
+               cd Wan2.2
+               mkdir results
+
+               torchrun --nproc_per_node=8 run.py \
+                  --task i2v-A14B \
+                  --size 720*1280 --frame_num 81 \
+                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
+                  --image "/app/Wan2.2/examples/i2v_input.JPG" \
+                  --ulysses_size 8 --ring_size 1 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+                  --offload_model 0 \
+                  --vae_dtype bfloat16 \
+                  --allow_tf32 \
+                  --compile
+            {% endif %}
+
+            {% if model.model == "FLUX.1" %}
+               cd Flux
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Flux/run.py \
+                  --model black-forest-labs/FLUX.1-dev \
+                  --seed 42 \
+                  --prompt "A small cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 25 \
+                  --max_sequence_length 256 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --num_repetitions 1 \
+                  --benchmark_output_directory results
+
+            {% endif %}
+
+            The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
+
+            {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
+
+      {% endfor %}
+    {% endfor %}
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- For a list of other ready-made Docker images for AI with ROCm, see `AMD
+  Infinity Hub
+  <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`__.
+
+Previous versions
+=================
+
+See :doc:`xdit-history` to find documentation for previous releases
+of xDiT diffusion inference performance testing.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
@@ -0,0 +1,56 @@
+:orphan:
+
+************************************************************
+xDiT diffusion inference performance testing version history
+************************************************************
+
+This table lists previous versions of the ROCm xDiT diffusion inference performance
+testing environment. For detailed information about available models for
+benchmarking, see the version-specific documentation.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Docker image tag
+     - Components
+     - Resources
+
+   * - ``rocm/pytorch-xdit:v25.11`` (latest)
+     - 
+       * ROCm 7.10.0 preview
+       * TheRock 3e3f834
+       * rccl d23d18f
+       * composable_kernel 2570462
+       * rocm-libraries 0588f07
+       * rocm-systems 473025a
+       * torch 73adac
+       * torchvision f5c6c2e
+       * triton 7416ffc
+       * accelerate 34c1779
+       * aiter de14bec
+       * diffusers 40528e9
+       * xfuser 83978b5
+       * yunchang 2c9b712
+     - 
+       * :doc:`Documentation <../../xdit-diffusion-inference>`
+       * `Docker Hub <https://hub.docker.com/r/rocm/pytorch-xdit>`__
+
+   * - ``rocm/pytorch-xdit:v25.10``
+     - 
+       * ROCm 7.9.0 preview
+       * TheRock 7afbe45
+       * rccl 9b04b2a
+       * composable_kernel b7a806f
+       * rocm-libraries f104555
+       * rocm-systems 25922d0
+       * torch 2.10.0a0+gite9c9017
+       * torchvision 0.22.0a0+966da7e
+       * triton 3.5.0+git52e49c12
+       * accelerate 1.11.0.dev0
+       * aiter 0.1.5.post4.dev20+ga25e55e79
+       * diffusers 0.36.0.dev0
+       * xfuser 0.4.4
+       * yunchang 0.6.3.post1
+     - 
+       * :doc:`Documentation <xdit-25.10>`
+       * `Docker Hub <https://hub.docker.com/r/rocm/pytorch-xdit>`__
--- a/docs/how-to/rocm-for-ai/inference/index.rst
+++ b/docs/how-to/rocm-for-ai/inference/index.rst
@@ -27,3 +27,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
 - :doc:`SGLang inference performance testing <benchmark-docker/sglang>`

 - :doc:`Deploying your model <deploy-your-model>`
+
+- :doc:`xDiT diffusion inference <xdit-diffusion-inference>`
+
--- a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
@@ -0,0 +1,388 @@
+.. meta::
+   :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
+                 prebuilt and optimized docker images.
+   :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
+
+************************
+xDiT diffusion inference
+************************
+
+.. _xdit-video-diffusion:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
+   benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
+   The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
+   and includes the following components:
+
+   .. dropdown:: Software components
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Software component
+           - Version
+
+         {% for component_name, component_version in docker.components.items() %}
+         * - {{ component_name }}
+           - {{ component_version }}
+         {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
+
+What's new
+==========
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   {% for item in docker.whats_new %}
+   * {{ item }}
+   {% endfor %}
+
+.. _xdit-video-diffusion-supported-models:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+   
+   {# Create a lookup for supported models #}
+   {% set supported_lookup = {} %}
+   {% for supported in docker.supported_models %}
+   {% set _ = supported_lookup.update({supported.group: supported.models}) %}
+   {% endfor %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+          <div class="row gx-0">
+              <div class="col-2 me-1 px-2 model-param-head">Model</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in model_groups %}
+            {% if model_group.group in supported_lookup %}
+                  <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            {% endif %}
+        {% endfor %}
+              </div>
+          </div>
+
+          <div class="row gx-0 pt-1">
+              <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in model_groups %}
+            {% if model_group.group in supported_lookup %}
+            {% set supported_models = supported_lookup[model_group.group] %}
+            {% set models = model_group.models %}
+            {% for model in models %}
+                {% if model.model in supported_models %}
+                {% if models|length % 3 == 0 %}
+                <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                {% else %}
+                <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                {% endif %}
+                {% endif %}
+            {% endfor %}
+            {% endif %}
+        {% endfor %}
+              </div>
+          </div>
+      </div>
+
+   {% for model_group in model_groups %}
+       {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.page_tag }}
+
+      .. note::
+
+         To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
+         or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
+         external license agreement through a third party.
+
+       {% endfor %}
+   {% endfor %}
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+
+   For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
+   Pull the image using the following command:
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+Validate and benchmark
+======================
+
+Once the image has been downloaded you can follow these steps to
+run benchmarks and generate outputs.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.page_tag}}
+
+      The following commands are written for {{ model.model }}.
+      See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
+
+     {% endfor %}
+   {% endfor %}
+
+Choose your setup method
+------------------------
+
+You can either use an existing Hugging Face cache or download the model fresh inside the container.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+   .. container:: model-doc {{model.page_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: Option 1: Use existing Hugging Face cache
+
+            If you already have models downloaded on your host system, you can mount your existing cache.
+
+            1. Set your Hugging Face cache location.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/your/hf_cache/location
+
+            2. Download the model (if not already cached).
+
+               .. code-block:: shell
+
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+            3. Launch the container with mounted cache.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      -e HF_HOME=/app/huggingface_models \
+                      -v $HF_HOME:/app/huggingface_models \
+                      {{ docker.pull_tag }}
+
+         .. tab-item:: Option 2: Download inside container
+
+            If you prefer to keep the container self-contained or don't have an existing cache.
+
+            1. Launch the container
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      {{ docker.pull_tag }}
+
+            2. Inside the container, set the Hugging Face cache location and download the model.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/app/huggingface_models
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+               .. warning::
+
+                  Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
+     {% endfor %}
+   {% endfor %}
+
+Run inference
+=============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.page_tag }}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. On the host machine, use this command to run the performance benchmark test on
+               the `{{model.model}} <{{ model.url }}>`_ model using one node.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output
+                     
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+         .. tab-item:: Standalone benchmarking
+
+            To run the benchmarks for {{ model.model }}, use the following command:
+
+            .. code-block:: shell
+            {% if model.model == "Hunyuan Video" %}
+               cd /app/Hunyuanvideo
+               mkdir results
+
+               torchrun --nproc_per_node=8 run.py \
+                  --model tencent/HunyuanVideo \
+                  --prompt "In the large cage, two puppies were wagging their tails at each other." \
+                  --height 720 --width 1280 --num_frames 129 \
+                  --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
+                  --ulysses_degree 8 \
+                  --enable_tiling --enable_slicing \
+                  --use_torch_compile \
+                  --bench_output results
+            {% endif %}
+            {% if model.model == "Wan2.1" %}
+               cd Wan2.1
+               mkdir results
+
+               torchrun --nproc_per_node=8 run.py \
+                  --task i2v-14B \
+                  --size 720*1280 --frame_num 81 \
+                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
+                  --image "/app/Wan2.1/examples/i2v_input.JPG" \
+                  --ulysses_size 8 --ring_size 1 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+                  --offload_model 0 \
+                  --vae_dtype bfloat16 \
+                  --allow_tf32 \
+                  --compile
+            {% endif %}
+            {% if model.model == "Wan2.2" %}
+               cd Wan2.2
+               mkdir results
+
+               torchrun --nproc_per_node=8 run.py \
+                  --task i2v-A14B \
+                  --size 720*1280 --frame_num 81 \
+                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
+                  --image "/app/Wan2.2/examples/i2v_input.JPG" \
+                  --ulysses_size 8 --ring_size 1 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+                  --offload_model 0 \
+                  --vae_dtype bfloat16 \
+                  --allow_tf32 \
+                  --compile
+            {% endif %}
+
+            {% if model.model == "FLUX.1" %}
+               cd Flux
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Flux/run.py \
+                  --model black-forest-labs/FLUX.1-dev \
+                  --seed 42 \
+                  --prompt "A small cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 25 \
+                  --max_sequence_length 256 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --num_repetitions 1 \
+                  --benchmark_output_directory results
+
+            {% endif %}
+
+            The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
+
+            {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
+
+      {% endfor %}
+    {% endfor %}
+
+Previous versions
+=================
+
+See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases
+of xDiT diffusion inference performance testing.
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -117,6 +117,8 @@ subtrees:
            title: SGLang inference performance testing
          - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
            title: SGLang distributed inference with Mooncake
+          - file: how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
+            title: xDiT diffusion inference
          - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
            title: Deploy your model