Update docs for xDiT diffusion inference 26.1 (#5955)

* archive previous version * xDiT diffusion inference docker 26.1
2026-02-12 07:25:22 -05:00 · 2026-02-11 13:27:36 -05:00
parent 19891f8ef1
commit fe8dff691d
6 changed files with 795 additions and 189 deletions
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
@@ -0,0 +1,105 @@
+docker:
+  pull_tag: rocm/pytorch-xdit:v25.13
+  docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
+  ROCm: 7.11.0
+  whats_new:
+    - "Flux.1 Kontext support"
+    - "Flux.2 Dev support"
+    - "Flux FP8 GEMM support"
+    - "Hybrid FP8 attention support for Wan models"
+  components:
+    TheRock: 
+      version: 1728a81
+      url: https://github.com/ROCm/TheRock
+    rccl:
+      version: d23d18f
+      url: https://github.com/ROCm/rccl
+    composable_kernel:
+      version: ab0101c
+      url: https://github.com/ROCm/composable_kernel
+    rocm-libraries:
+      version: a2f7c35
+      url: https://github.com/ROCm/rocm-libraries
+    rocm-systems:
+      version: 659737c
+      url: https://github.com/ROCm/rocm-systems
+    torch:
+      version: 91be249
+      url: https://github.com/ROCm/pytorch
+    torchvision:
+      version: b919bd0
+      url: https://github.com/pytorch/vision
+    triton:
+      version: a272dfa
+      url: https://github.com/ROCm/triton
+    accelerate:
+      version: b521400f
+      url: https://github.com/huggingface/accelerate
+    aiter:
+      version: de14bec0
+      url: https://github.com/ROCm/aiter
+    diffusers:
+      version: a1f36ee3e
+      url: https://github.com/huggingface/diffusers
+    xfuser:
+      version: adf2681
+      url: https://github.com/xdit-project/xDiT
+    yunchang:
+      version: 2c9b712
+      url: https://github.com/feifeibear/long-context-attention
+  supported_models:
+    - group: Hunyuan Video
+      js_tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          model_repo: tencent/HunyuanVideo
+          revision: refs/pr/18
+          url: https://huggingface.co/tencent/HunyuanVideo
+          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
+          mad_tag: pyt_xdit_hunyuanvideo
+          js_tag: hunyuan_tag
+    - group: Wan-AI
+      js_tag: wan
+      models:
+        - model: Wan2.1
+          model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
+          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
+          github: https://github.com/Wan-Video/Wan2.1
+          mad_tag: pyt_xdit_wan_2_1
+          js_tag: wan_21_tag
+        - model: Wan2.2
+          model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
+          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
+          github: https://github.com/Wan-Video/Wan2.2
+          mad_tag: pyt_xdit_wan_2_2
+          js_tag: wan_22_tag
+    - group: FLUX
+      js_tag: flux
+      models:
+        - model: FLUX.1
+          model_repo: black-forest-labs/FLUX.1-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+          github: https://github.com/black-forest-labs/flux
+          mad_tag: pyt_xdit_flux
+          js_tag: flux_1_tag
+        - model: FLUX.1 Kontext
+          model_repo: black-forest-labs/FLUX.1-Kontext-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
+          github: https://github.com/black-forest-labs/flux
+          mad_tag: pyt_xdit_flux_kontext
+          js_tag: flux_1_kontext_tag
+        - model: FLUX.2
+          model_repo: black-forest-labs/FLUX.2-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.2-dev
+          github: https://github.com/black-forest-labs/flux2
+          mad_tag: pyt_xdit_flux_2
+          js_tag: flux_2_tag
+    - group: StableDiffusion
+      js_tag: stablediffusion
+      models:
+        - model: stable-diffusion-3.5-large
+          model_repo: stabilityai/stable-diffusion-3.5-large
+          url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
+          github: https://github.com/Stability-AI/sd3.5
+          mad_tag: pyt_xdit_sd_3_5
+          js_tag: stable_diffusion_3_5_large_tag
--- a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
@@ -1,14 +1,13 @@
 docker:
-  pull_tag: rocm/pytorch-xdit:v25.13
-  docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
+  pull_tag: rocm/pytorch-xdit:v26.1
+  docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
  ROCm: 7.11.0
  whats_new:
-    - "Flux.1 Kontext support"
-    - "Flux.2 Dev support"
-    - "Flux FP8 GEMM support"
-    - "Hybrid FP8 attention support for Wan models"
+    - "HunyuanVideo 1.5 support"
+    - "Z-Image Turbo support"
+    - "Wan model sharding"
  components:
-    TheRock: 
+    TheRock:
      version: 1728a81
      url: https://github.com/ROCm/TheRock
    rccl:
@@ -39,10 +38,10 @@ docker:
      version: de14bec0
      url: https://github.com/ROCm/aiter
    diffusers:
-      version: a1f36ee3e
+      version: 6708f5
      url: https://github.com/huggingface/diffusers
    xfuser:
-      version: adf2681
+      version: 0a3d7a
      url: https://github.com/xdit-project/xDiT
    yunchang:
      version: 2c9b712
@@ -58,6 +57,49 @@ docker:
          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
          mad_tag: pyt_xdit_hunyuanvideo
          js_tag: hunyuan_tag
+          benchmark_command:
+            - cd /app/Hunyuanvideo
+            - mkdir results
+            - 'torchrun --nproc_per_node=8 run.py \'
+            - '--model {model_repo} \'
+            - '--prompt "In the large cage, two puppies were wagging their tails at each other." \'
+            - '--batch_size 1 \'
+            - '--height 720 --width 1280 \'
+            - '--seed 1168860793 \'
+            - '--num_frames 129 \'
+            - '--num_inference_steps 50 \'
+            - '--warmup_steps 1 \'
+            - '--n_repeats 1 \'
+            - '--sleep_dur 10 \'
+            - '--ulysses_degree 8 \'
+            - '--enable_tiling --enable_slicing \'
+            - '--guidance_scale 6.0 \'
+            - '--use_torch_compile \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
+        - model: Hunyuan Video 1.5
+          model_repo: hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v
+          url: https://huggingface.co/hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v
+          github: https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5
+          mad_tag: pyt_xdit_hunyuanvideo_1_5
+          js_tag: hunyuan_1_5_tag
+          benchmark_command:
+            - cd /app/Hunyuanvideo_1_5
+            - mkdir results
+            - 'torchrun --nproc_per_node=8 /app/Hunyuanvideo_1_5/run.py \'
+            - '--model {model_repo} \'
+            - '--prompt "In the large cage, two puppies were wagging their tails at each other." \'
+            - '--task t2v \'
+            - '--height 720 --width 1280 \'
+            - '--seed 1168860793 \'
+            - '--num_frames 129 \'
+            - '--num_inference_steps 50 \'
+            - '--num_repetitions 1 \'
+            - '--ulysses_degree 8 \'
+            - '--enable_tiling --enable_slicing \'
+            - '--use_torch_compile \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
    - group: Wan-AI
      js_tag: wan
      models:
@@ -67,12 +109,48 @@ docker:
          github: https://github.com/Wan-Video/Wan2.1
          mad_tag: pyt_xdit_wan_2_1
          js_tag: wan_21_tag
+          benchmark_command:
+            - cd /app/Wan
+            - mkdir results
+            - 'torchrun --nproc_per_node=8 /app/Wan/run.py \'
+            - '--model {model_repo} \'
+            - '--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline''s intricate details and the refreshing atmosphere of the seaside." \'
+            - '--task i2v \'
+            - '--height 720 \'
+            - '--width 1280 \'
+            - '--img_file_path /app/Wan/i2v_input.JPG \'
+            - '--num_frames 81 \'
+            - '--ulysses_degree 8 \'
+            - '--seed 42 \'
+            - '--num_repetitions 1 \'
+            - '--num_inference_steps 40 \'
+            - '--use_torch_compile \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
        - model: Wan2.2
          model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
          github: https://github.com/Wan-Video/Wan2.2
          mad_tag: pyt_xdit_wan_2_2
          js_tag: wan_22_tag
+          benchmark_command:
+            - cd /app/Wan
+            - mkdir results
+            - 'torchrun --nproc_per_node=8 /app/Wan/run.py \'
+            - '--model {model_repo} \'
+            - '--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline''s intricate details and the refreshing atmosphere of the seaside." \'
+            - '--task i2v \'
+            - '--height 720 \'
+            - '--width 1280 \'
+            - '--img_file_path /app/Wan/i2v_input.JPG \'
+            - '--num_frames 81 \'
+            - '--ulysses_degree 8 \'
+            - '--seed 42 \'
+            - '--num_repetitions 1 \'
+            - '--num_inference_steps 40 \'
+            - '--use_torch_compile \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
    - group: FLUX
      js_tag: flux
      models:
@@ -82,18 +160,79 @@ docker:
          github: https://github.com/black-forest-labs/flux
          mad_tag: pyt_xdit_flux
          js_tag: flux_1_tag
+          benchmark_command:
+            - cd /app/Flux
+            - mkdir results
+            - 'torchrun --nproc_per_node=8 /app/Flux/run.py \'
+            - '--model {model_repo} \'
+            - '--seed 42 \'
+            - '--prompt "A small cat" \'
+            - '--height 1024 \'
+            - '--width 1024 \'
+            - '--num_inference_steps 25 \'
+            - '--max_sequence_length 256 \'
+            - '--warmup_steps 5 \'
+            - '--no_use_resolution_binning \'
+            - '--ulysses_degree 8 \'
+            - '--use_torch_compile \'
+            - '--guidance_scale 0.0 \'
+            - '--num_repetitions 50 \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
        - model: FLUX.1 Kontext
          model_repo: black-forest-labs/FLUX.1-Kontext-dev
          url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
          github: https://github.com/black-forest-labs/flux
          mad_tag: pyt_xdit_flux_kontext
          js_tag: flux_1_kontext_tag
+          benchmark_command:
+            - cd /app/Flux
+            - mkdir results
+            - 'torchrun --nproc_per_node=8 /app/Flux/run_usp.py \'
+            - '--model {model_repo} \'
+            - '--seed 42 \'
+            - '--prompt "Add a cool hat to the cat" \'
+            - '--height 1024 \'
+            - '--width 1024 \'
+            - '--num_inference_steps 30 \'
+            - '--max_sequence_length 512 \'
+            - '--warmup_steps 5 \'
+            - '--no_use_resolution_binning \'
+            - '--ulysses_degree 8 \'
+            - '--use_torch_compile \'
+            - '--img_file_path /app/Flux/cat.png \'
+            - '--model_type flux_kontext \'
+            - '--guidance_scale 2.5 \'
+            - '--num_repetitions 25 \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
        - model: FLUX.2
          model_repo: black-forest-labs/FLUX.2-dev
          url: https://huggingface.co/black-forest-labs/FLUX.2-dev
          github: https://github.com/black-forest-labs/flux2
          mad_tag: pyt_xdit_flux_2
          js_tag: flux_2_tag
+          benchmark_command:
+            - cd /app/Flux
+            - mkdir results
+            - 'torchrun --nproc_per_node=8 /app/Flux/run_usp.py \'
+            - '--model {model_repo} \'
+            - '--seed 42 \'
+            - '--prompt "Add a cool hat to the cat" \'
+            - '--height 1024 \'
+            - '--width 1024 \'
+            - '--num_inference_steps 50 \'
+            - '--max_sequence_length 512 \'
+            - '--warmup_steps 5 \'
+            - '--no_use_resolution_binning \'
+            - '--ulysses_degree 8 \'
+            - '--use_torch_compile \'
+            - '--img_file_paths /app/Flux/cat.png \'
+            - '--model_type flux2 \'
+            - '--guidance_scale 4.0 \'
+            - '--num_repetitions 25 \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
    - group: StableDiffusion
      js_tag: stablediffusion
      models:
@@ -103,3 +242,42 @@ docker:
          github: https://github.com/Stability-AI/sd3.5
          mad_tag: pyt_xdit_sd_3_5
          js_tag: stable_diffusion_3_5_large_tag
+          benchmark_command:
+            - cd /app/StableDiffusion3.5
+            - mkdir results
+            - 'torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \'
+            - '--model {model_repo} \'
+            - '--prompt "A capybara holding a sign that reads Hello World" \'
+            - '--num_repetitions 50 \'
+            - '--num_inference_steps 28 \'
+            - '--pipefusion_parallel_degree 4 \'
+            - '--use_cfg_parallel \'
+            - '--use_torch_compile \'
+            - '--dtype torch.float16 \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
+    - group: Z-Image
+      js_tag: z_image
+      models:
+        - model: Z-Image Turbo
+          model_repo: Tongyi-MAI/Z-Image-Turbo
+          url: https://huggingface.co/Tongyi-MAI/Z-Image-Turbo
+          github: https://github.com/Tongyi-MAI/Z-Image
+          mad_tag: pyt_xdit_z_image_turbo
+          js_tag: z_image_turbo_tag
+          benchmark_command:
+            - cd /app/Z-Image
+            - mkdir results
+            - 'torchrun --nproc_per_node=2 /app/Z-Image/run.py \'
+            - '--model {model_repo} \'
+            - '--seed 42 \'
+            - '--prompt "A crowded beach" \'
+            - '--height 1088 \'
+            - '--width 1920 \'
+            - '--num_inference_steps 9 \'
+            - '--ulysses_degree 2 \'
+            - '--use_torch_compile \'
+            - '--guidance_scale 0.0 \'
+            - '--num_repetitions 50 \'
+            - '--attention_backend aiter \'
+            - '--benchmark_output_directory results'
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst
@@ -11,7 +11,7 @@ xDiT diffusion inference

 .. caution::

-   This documentation does not reflect the latest version of ROCm vLLM
+   This documentation does not reflect the latest version of the xDiT diffusion
   inference performance documentation. See
   :doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
   version.
@@ -293,7 +293,7 @@ Run inference
                      --tags {{model.mad_tag}} \
                      --keep-model-dir \
                      --live-output
-                     
+
            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
@@ -379,7 +379,7 @@ Run inference
            {% endif %}

            {% if model.model == "stable-diffusion-3.5-large" %}
-               cd StableDiffusion3.5 
+               cd StableDiffusion3.5
               mkdir results

               torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13.rst
@@ -0,0 +1,474 @@
+:orphan:
+
+.. meta::
+   :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
+                 prebuilt and optimized Docker images.
+   :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
+
+************************
+xDiT diffusion inference
+************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of the xDiT diffusion
+   inference performance documentation. See
+   :doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
+   version.
+
+.. _xdit-video-diffusion-2513:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
+   a prebuilt, optimized environment based on `xDiT
+   <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
+   video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
+   and MI300X (gfx942) GPUs.
+
+   The image runs a preview version of ROCm using the new `TheRock
+   <https://github.com/ROCm/TheRock>`__ build system and includes the following
+   components:
+
+   .. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }}
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Software component
+           - Version
+
+         {% for component_name, component_data in docker.components.items() %}
+         * - `{{ component_name }} <{{ component_data.url }}>`_
+           - {{ component_data.version }}
+         {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
+
+What's new
+==========
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   {% for item in docker.whats_new %}
+   * {{ item }}
+   {% endfor %}
+
+.. _xdit-video-diffusion-supported-models-2513:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+          <div class="row gx-0">
+              <div class="col-2 me-1 px-2 model-param-head">Model</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in docker.supported_models %}
+               <div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.js_tag }}" tabindex="0">{{ model_group.group }}</div>
+        {% endfor %}
+              </div>
+          </div>
+
+          <div class="row gx-0 pt-1">
+              <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in docker.supported_models %}
+            {% set models = model_group.models %}
+            {% for model in models %}
+                {% if models|length % 3 == 0 %}
+                <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
+                {% else %}
+                <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
+                {% endif %}
+            {% endfor %}
+        {% endfor %}
+              </div>
+          </div>
+      </div>
+
+   {% for model_group in docker.supported_models %}
+       {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.js_tag }}
+
+      .. note::
+
+         To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
+         or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
+         external license agreement through a third party.
+
+       {% endfor %}
+   {% endfor %}
+
+Performance measurements
+========================
+
+To evaluate performance, the `Performance results with AMD ROCm software
+<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
+page provides reference throughput and serving measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in `Performance results with AMD ROCm
+   software
+   <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance
+   achievable by AMD Instinct GPUs or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   For this tutorial, it's recommended to use the ``{{ docker.pull_tag }}`` Docker image.
+   Pull the image using the following command:
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+Validate and benchmark
+======================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   Once the image has been downloaded, you can follow these steps to
+   run benchmarks and generate outputs.
+
+   {% for model_group in docker.supported_models %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.js_tag}}
+
+      The following commands are written for {{ model.model }}.
+      See :ref:`xdit-video-diffusion-supported-models-2513` to switch to another available model.
+
+     {% endfor %}
+   {% endfor %}
+
+Choose your setup method
+------------------------
+
+You can either use an existing Hugging Face cache or download the model fresh inside the container.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   {% for model_group in docker.supported_models %}
+     {% for model in model_group.models %}
+   .. container:: model-doc {{model.js_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: Option 1: Use existing Hugging Face cache
+
+            If you already have models downloaded on your host system, you can mount your existing cache.
+
+            1. Set your Hugging Face cache location.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/your/hf_cache/location
+
+            2. Download the model (if not already cached).
+
+               .. code-block:: shell
+
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+            3. Launch the container with the mounted cache.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      -e HF_HOME=/app/huggingface_models \
+                      -v $HF_HOME:/app/huggingface_models \
+                      {{ docker.pull_tag }}
+
+         .. tab-item:: Option 2: Download inside container
+
+            Use this option if you prefer to keep the container self-contained or don't have an existing cache.
+
+            1. Launch the container.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      {{ docker.pull_tag }}
+
+            2. Inside the container, set the Hugging Face cache location and download the model.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/app/huggingface_models
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+               .. warning::
+
+                  Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
+     {% endfor %}
+   {% endfor %}
+
+Run inference
+=============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   {% for model_group in docker.supported_models %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.js_tag }}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. On the host machine, use this command to run the performance benchmark test on
+               the `{{model.model}} <{{ model.url }}>`_ model using one node.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+         .. tab-item:: Standalone benchmarking
+
+            To run the benchmarks for {{ model.model }}, use the following command:
+
+            .. code-block:: shell
+            {% if model.model == "Hunyuan Video" %}
+               cd /app/Hunyuanvideo
+               mkdir results
+
+               torchrun --nproc_per_node=8 run.py \
+                  --model {{ model.model_repo }} \
+                  --prompt "In the large cage, two puppies were wagging their tails at each other." \
+                  --height 720 --width 1280 --num_frames 129 \
+                  --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
+                  --ulysses_degree 8 \
+                  --enable_tiling --enable_slicing \
+                  --use_torch_compile \
+                  --bench_output results
+
+            {% endif %}
+            {% if model.model == "Wan2.1" %}
+               cd /app/Wan
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Wan/run.py \
+                  --task i2v \
+                  --height 720 \
+                  --width 1280 \
+                  --model {{ model.model_repo }} \
+                  --img_file_path /app/Wan/i2v_input.JPG \
+                  --ulysses_degree 8 \
+                  --seed 42 \
+                  --num_frames 81 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --num_repetitions 1 \
+                  --num_inference_steps 40 \
+                  --use_torch_compile
+
+            {% endif %}
+            {% if model.model == "Wan2.2" %}
+               cd /app/Wan
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Wan/run.py \
+                  --task i2v \
+                  --height 720 \
+                  --width 1280 \
+                  --model {{ model.model_repo }} \
+                  --img_file_path /app/Wan/i2v_input.JPG \
+                  --ulysses_degree 8 \
+                  --seed 42 \
+                  --num_frames 81 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --num_repetitions 1 \
+                  --num_inference_steps 40 \
+                  --use_torch_compile
+
+            {% endif %}
+
+            {% if model.model == "FLUX.1" %}
+               cd /app/Flux
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Flux/run.py \
+                  --model {{ model.model_repo }} \
+                  --seed 42 \
+                  --prompt "A small cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 25 \
+                  --max_sequence_length 256 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --num_repetitions 50
+
+            {% endif %}
+
+            {% if model.model == "FLUX.1 Kontext" %}
+               cd /app/Flux
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
+                  --model {{ model.model_repo }} \
+                  --seed 42 \
+                  --prompt "Add a cool hat to the cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 30 \
+                  --max_sequence_length 512 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --img_file_path /app/Flux/cat.png \
+                  --model_type flux_kontext \
+                  --guidance_scale 2.5 \
+                  --num_repetitions 25
+
+            {% endif %}
+
+            {% if model.model == "FLUX.2" %}
+               cd /app/Flux
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
+                  --model {{ model.model_repo }} \
+                  --seed 42 \
+                  --prompt "Add a cool hat to the cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 50 \
+                  --max_sequence_length 512 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --img_file_paths /app/Flux/cat.png \
+                  --model_type flux2 \
+                  --guidance_scale 4.0 \
+                  --num_repetitions 25
+
+            {% endif %}
+
+            {% if model.model == "stable-diffusion-3.5-large" %}
+               cd /app/StableDiffusion3.5
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
+                  --model {{ model.model_repo }} \
+                  --num_inference_steps 28 \
+                  --prompt "A capybara holding a sign that reads Hello World" \
+                  --use_torch_compile \
+                  --pipefusion_parallel_degree 4 \
+                  --use_cfg_parallel \
+                  --num_repetitions 50 \
+                  --dtype torch.float16 \
+                  --output_path results
+
+            {% endif %}
+
+            The generated video or image will be stored under the ``results`` directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}``results/timing.json``{% elif model.model == "stable-diffusion-3.5-large" %}``benchmark_results.csv``{% endif %}.
+
+            {% if model.model == "FLUX.1" %}You may also use ``run_usp.py``, which implements USP without modifying the default diffusers pipeline.{% endif %}
+
+      {% endfor %}
+    {% endfor %}
+
+Previous versions
+=================
+
+See
+:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
+to find documentation for previous releases of xDiT diffusion inference
+performance testing.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
@@ -15,33 +15,40 @@ benchmarking, see the version-specific documentation.
     - Components
     - Resources

-   * - ``rocm/pytorch-xdit:v25.13`` (latest)
-     - 
+   * - ``rocm/pytorch-xdit:v26.1`` (latest)
+     -
       * TheRock 1728a81
-     - 
+     -
       * :doc:`Documentation <../../xdit-diffusion-inference>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v26.1/images/sha256-4e35ebcad47042a41389b992ecb3489b3b0a922e4c34c7a0dd1098733a3db513>`__
+
+   * - ``rocm/pytorch-xdit:v25.13``
+     -
+       * TheRock 1728a81
+     -
+       * :doc:`Documentation <xdit-25.13>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef>`__

   * - ``rocm/pytorch-xdit:v25.12``
-     - 
+     -
       * `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
       * TheRock 3e3f834
-     - 
+     -
       * :doc:`Documentation <xdit-25.12>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256>`__

   * - ``rocm/pytorch-xdit:v25.11``
-     - 
+     -
       * `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
       * TheRock 3e3f834
-     - 
+     -
       * :doc:`Documentation <xdit-25.11>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216>`__

   * - ``rocm/pytorch-xdit:v25.10``
-     - 
+     -
       * `ROCm 7.9.0 preview <https://rocm.docs.amd.com/en/7.9.0-preview/about/release-notes.html>`__
       * TheRock 7afbe45
-     - 
+     -
       * :doc:`Documentation <xdit-25.10>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e>`__
--- a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
@@ -13,15 +13,10 @@ xDiT diffusion inference

   {% set docker = data.docker %}

-   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
-   a prebuilt, optimized environment based on `xDiT
-   <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
-   video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
-   and MI300X (gfx942) GPUs.
-
-   The image runs a preview version of ROCm using the new `TheRock
-   <https://github.com/ROCm/TheRock>`__ build system and includes the following
-   components:
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
+   benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
+   The image runs ROCm **{{ docker.ROCm }}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`__
+   and includes the following components:

   .. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }}

@@ -105,22 +100,6 @@ vary by model -- select one to get started.
       {% endfor %}
   {% endfor %}

-Performance measurements
-========================
-
-To evaluate performance, the `Performance results with AMD ROCm software
-<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
-page provides reference throughput and serving measurements for inferencing popular AI models.
-
-.. important::
-
-   The performance data presented in `Performance results with AMD ROCm
-   software
-   <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
-   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance
-   achievable by AMD Instinct GPUs or ROCm software.
-
 System validation
 =================

@@ -300,7 +279,7 @@ Run inference
                      --tags {{model.mad_tag}} \
                      --keep-model-dir \
                      --live-output
-                     
+
            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
@@ -311,152 +290,15 @@ Run inference
            To run the benchmarks for {{ model.model }}, use the following command:

            .. code-block:: shell
-            {% if model.model == "Hunyuan Video" %}
-               cd /app/Hunyuanvideo
-               mkdir results

-               torchrun --nproc_per_node=8 run.py \
-                  --model {{ model.model_repo }} \
-                  --prompt "In the large cage, two puppies were wagging their tails at each other." \
-                  --height 720 --width 1280 --num_frames 129 \
-                  --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
-                  --ulysses_degree 8 \
-                  --enable_tiling --enable_slicing \
-                  --use_torch_compile \
-                  --bench_output results
+               {{ model.benchmark_command
+                  | map('replace', '{model_repo}', model.model_repo)
+                  | map('trim')
+                  | join('\n               ') }}

-            {% endif %}
-            {% if model.model == "Wan2.1" %}
-               cd /app/Wan
-               mkdir results
-
-               torchrun --nproc_per_node=8 /app/Wan/run.py \
-                  --task i2v \
-                  --height 720 \
-                  --width 1280 \
-                  --model {{ model.model_repo }} \
-                  --img_file_path /app/Wan/i2v_input.JPG \
-                  --ulysses_degree 8 \
-                  --seed 42 \
-                  --num_frames 81 \
-                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
-                  --num_repetitions 1 \
-                  --num_inference_steps 40 \
-                  --use_torch_compile
-
-            {% endif %}
-            {% if model.model == "Wan2.2" %}
-               cd /app/Wan
-               mkdir results
-
-               torchrun --nproc_per_node=8 /app/Wan/run.py \
-                  --task i2v \
-                  --height 720 \
-                  --width 1280 \
-                  --model {{ model.model_repo }} \
-                  --img_file_path /app/Wan/i2v_input.JPG \
-                  --ulysses_degree 8 \
-                  --seed 42 \
-                  --num_frames 81 \
-                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
-                  --num_repetitions 1 \
-                  --num_inference_steps 40 \
-                  --use_torch_compile
-
-            {% endif %}
-
-            {% if model.model == "FLUX.1" %}
-               cd /app/Flux
-               mkdir results
-
-               torchrun --nproc_per_node=8 /app/Flux/run.py \
-                  --model {{ model.model_repo }} \
-                  --seed 42 \
-                  --prompt "A small cat" \
-                  --height 1024 \
-                  --width 1024 \
-                  --num_inference_steps 25 \
-                  --max_sequence_length 256 \
-                  --warmup_steps 5 \
-                  --no_use_resolution_binning \
-                  --ulysses_degree 8 \
-                  --use_torch_compile \
-                  --num_repetitions 50
-
-            {% endif %}
-
-            {% if model.model == "FLUX.1 Kontext" %}
-               cd /app/Flux
-               mkdir results
-
-               torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
-                  --model {{ model.model_repo }} \
-                  --seed 42 \
-                  --prompt "Add a cool hat to the cat" \
-                  --height 1024 \
-                  --width 1024 \
-                  --num_inference_steps 30 \
-                  --max_sequence_length 512 \
-                  --warmup_steps 5 \
-                  --no_use_resolution_binning \
-                  --ulysses_degree 8 \
-                  --use_torch_compile \
-                  --img_file_path /app/Flux/cat.png \
-                  --model_type flux_kontext \
-                  --guidance_scale 2.5 \
-                  --num_repetitions 25
-
-            {% endif %}
-
-            {% if model.model == "FLUX.2" %}
-               cd /app/Flux
-               mkdir results
-
-               torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
-                  --model {{ model.model_repo }} \
-                  --seed 42 \
-                  --prompt "Add a cool hat to the cat" \
-                  --height 1024 \
-                  --width 1024 \
-                  --num_inference_steps 50 \
-                  --max_sequence_length 512 \
-                  --warmup_steps 5 \
-                  --no_use_resolution_binning \
-                  --ulysses_degree 8 \
-                  --use_torch_compile \
-                  --img_file_paths /app/Flux/cat.png \
-                  --model_type flux2 \
-                  --guidance_scale 4.0 \
-                  --num_repetitions 25
-
-            {% endif %}
-
-            {% if model.model == "stable-diffusion-3.5-large" %}
-               cd /app/StableDiffusion3.5 
-               mkdir results
-
-               torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
-                  --model {{ model.model_repo }} \
-                  --num_inference_steps 28 \
-                  --prompt "A capybara holding a sign that reads Hello World" \
-                  --use_torch_compile \
-                  --pipefusion_parallel_degree 4 \
-                  --use_cfg_parallel \
-                  --num_repetitions 50 \
-                  --dtype torch.float16 \
-                  --output_path results
-
-            {% endif %}
-
-            The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %}
+            The generated video or image is stored under the ``results`` directory.

            {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}

      {% endfor %}
    {% endfor %}
-
-Previous versions
-=================
-
-See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases
-of xDiT diffusion inference performance testing.