[docs/7.9.0] Add xDiT diffusion inference doc (#5676)

This commit is contained in:
peterjunpark
2025-11-18 16:56:00 -05:00
committed by GitHub
parent cdbcad6930
commit e76255db98
6 changed files with 667 additions and 1 deletions

View File

@@ -216,6 +216,7 @@ Higgs
href
Hyperparameters
Huggingface
HunyuanVideo
IB
ICD
ICT
@@ -1046,6 +1047,8 @@ writebacks
wrreq
wzo
xargs
xDiT
xdit
xGMI
xPacked
xz

View File

@@ -115,6 +115,7 @@ all_article_info_author = ""
# pages with specific settings
article_pages = [
{"file": "about/release-notes", "date": GA_DATE},
{"file": "rocm-for-ai/xdit-diffusion-inference", "os": ["linux"]},
]
external_toc_path = "./sphinx/_toc.yml"
@@ -136,9 +137,9 @@ extensions = [
"rocm_docs_custom.selector",
"rocm_docs_custom.table",
"rocm_docs_custom.icon",
"sphinxcontrib.datatemplates",
# "sphinx_reredirects",
# "sphinx_sitemap",
# "sphinxcontrib.datatemplates",
# "version-ref",
# "csv-to-list-table",
]
@@ -170,6 +171,9 @@ html_theme_options = {
}
}
html_title = f"AMD ROCm {ROCM_VERSION} preview"
html_static_path = ["sphinx/static/css", "sphinx/static/js"]
html_css_files = ["vllm-benchmark.css"]
html_js_files = ["vllm-benchmark.js"]
numfig = False
suppress_warnings = ["autosectionlabel.*"]

View File

@@ -0,0 +1,55 @@
xdit_diffusion_inference:
docker:
pull_tag: rocm/pytorch-xdit:v25.10
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
ROCm: 7.9.0
components:
TheRock: 7afbe45
rccl: 9b04b2a
composable_kernel: b7a806f
rocm-libraries: f104555
rocm-systems: 25922d0
torch: 2.10.0a0+gite9c9017
torchvision: 0.22.0a0+966da7e
triton: 3.5.0+git52e49c12
accelerate: 1.11.0.dev0
aiter: 0.1.5.post4.dev20+ga25e55e79
diffusers: 0.36.0.dev0
xfuser: 0.4.4
yunchang: 0.6.3.post1
model_groups:
- group: Hunyuan Video
tag: hunyuan
models:
- model: Hunyuan Video
model_name: hunyuanvideo
model_repo: tencent/HunyuanVideo
revision: refs/pr/18
url: https://huggingface.co/tencent/HunyuanVideo
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
mad_tag: pyt_xdit_hunyuanvideo
- group: Wan-AI
tag: wan
models:
- model: Wan2.1
model_name: wan2_1-i2v-14b-720p
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
github: https://github.com/Wan-Video/Wan2.1
mad_tag: pyt_xdit_wan_2_1
- model: Wan2.2
model_name: wan2_2-i2v-a14b
model_repo: Wan-AI/Wan2.2-I2V-A14B
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
github: https://github.com/Wan-Video/Wan2.2
mad_tag: pyt_xdit_wan_2_2
- group: FLUX
tag: flux
models:
- model: FLUX.1
model_name: FLUX.1-dev
model_repo: black-forest-labs/FLUX.1-dev
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
github: https://github.com/black-forest-labs/flux
mad_tag: pyt_xdit_flux

View File

@@ -0,0 +1,390 @@
.. meta::
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
prebuilt and optimized docker images.
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
************************
xDiT diffusion inference
************************
.. _xdit-video-diffusion:
.. datatemplate:yaml:: /data/rocm-for-ai/xdit-inference-models.yaml
{% set docker = data.xdit_diffusion_inference.docker %}
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
a prebuilt, optimized inference environment based on `xDiT
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion-based
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
and MI300X (gfx942) GPUs.
This image is based on ROCm {{docker.ROCm}} preview release via `TheRock <https://github.com/ROCm/TheRock>`_
and includes the following software components:
.. tab-set::
.. tab-item:: {{ docker.pull_tag }}
.. list-table::
:header-rows: 1
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
What's new
==========
- Initial ROCm-enabled xDiT Docker release for diffusion inference.
- Supported architectures: gfx942 and gfx950 (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X).
- Supported workloads: Wan 2.1, Wan 2.2, HunyuanVideo, and Flux models.
.. _xdit-video-diffusion-supported-models:
Supported models
================
The following models are supported for inference performance benchmarking.
Some instructions, commands, and recommendations in this documentation might
vary by model -- select one to get started.
.. datatemplate:yaml:: /data/rocm-for-ai/xdit-inference-models.yaml
{% set docker = data.xdit_diffusion_inference.docker %}
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length == 1 %}
<div class="col-12 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
.. note::
To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
external license agreement through a third party.
{% endfor %}
{% endfor %}
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA
auto-balancing, you can skip this step. Otherwise, complete the procedures in
the `System validation and optimization
<https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.html>`__
guide to properly configure your system settings before starting.
Pull the Docker image
=====================
.. datatemplate:yaml:: /data/rocm-for-ai/xdit-inference-models.yaml
{% set docker = data.xdit_diffusion_inference.docker %}
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
Pull the image using the following command:
.. code-block:: shell
docker pull {{ docker.pull_tag }}
Validate and benchmark
======================
Once the image has been downloaded you can follow these steps to
run benchmarks and generate outputs.
.. datatemplate:yaml:: /data/rocm-for-ai/xdit-inference-models.yaml
{% set model_groups = data.xdit_diffusion_inference.model_groups %}
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
The following commands are written for {{ model.model }}.
See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
{% endfor %}
{% endfor %}
.. _xdit-video-diffusion-setup:
Prepare the model
-----------------
.. note::
If you're using ROCm MAD to :ref:`run your model
<xdit-video-diffusion-run>`, you can skip this section. MAD will handle
starting the container and downloading required models inside the container.
You can either use an existing Hugging Face cache or download the model fresh inside the container.
.. datatemplate:yaml:: /data/rocm-for-ai/xdit-inference-models.yaml
{% set docker = data.xdit_diffusion_inference.docker %}
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: Option 1: Use existing Hugging Face cache
If you already have models downloaded on your host system, you can mount your existing cache.
1. Set your Hugging Face cache location.
.. code-block:: shell
export HF_HOME=/your/hf_cache/location
2. Download the model (if not already cached).
.. code-block:: shell
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
3. Launch the container with mounted cache.
.. code-block:: shell
docker run \
-it --rm \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--user root \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--ipc=host \
--network host \
--privileged \
--shm-size 128G \
--name pytorch-xdit \
-e HSA_NO_SCRATCH_RECLAIM=1 \
-e OMP_NUM_THREADS=16 \
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-e HF_HOME=/app/huggingface_models \
-v $HF_HOME:/app/huggingface_models \
{{ docker.pull_tag }}
.. tab-item:: Option 2: Download inside container
If you prefer to keep the container self-contained or don't have an existing cache.
1. Launch the container
.. code-block:: shell
docker run \
-it --rm \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--user root \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--ipc=host \
--network host \
--privileged \
--shm-size 128G \
--name pytorch-xdit \
-e HSA_NO_SCRATCH_RECLAIM=1 \
-e OMP_NUM_THREADS=16 \
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
{{ docker.pull_tag }}
2. Inside the container, set the Hugging Face cache location and download the model.
.. code-block:: shell
export HF_HOME=/app/huggingface_models
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
.. warning::
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
{% endfor %}
{% endfor %}
.. _xdit-video-diffusion-run:
Run inference
=============
You can benchmark models through `MAD <https://github.com/ROCm/MAD>`__-integrated automation or standalone
torchrun commands.
.. datatemplate:yaml:: /data/rocm-for-ai/xdit-inference-models.yaml
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
2. On the host machine, use this command to run the performance benchmark test on
the `{{model.model}} <{{ model.url }}>`_ model using one node.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
and ``{{ model.mad_tag }}_serving.csv``.
.. tab-item:: Standalone benchmarking
To run the benchmarks for {{ model.model }}, use the following command:
.. code-block:: shell
{% if model.model == "Hunyuan Video" %}
cd /app/Hunyuanvideo
mkdir results
torchrun --nproc_per_node=8 run.py \
--model tencent/HunyuanVideo \
--prompt "In the large cage, two puppies were wagging their tails at each other." \
--height 720 --width 1280 --num_frames 129 \
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
--ulysses_degree 8 \
--enable_tiling --enable_slicing \
--use_torch_compile \
--bench_output results
{% endif %}
{% if model.model == "Wan2.1" %}
cd Wan2.1
mkdir results
torchrun --nproc_per_node=8 run.py \
--task i2v-14B \
--size 720*1280 --frame_num 81 \
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
--image "/app/Wan2.1/examples/i2v_input.JPG" \
--ulysses_size 8 --ring_size 1 \
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
--offload_model 0 \
--vae_dtype bfloat16 \
--allow_tf32 \
--compile
{% endif %}
{% if model.model == "Wan2.2" %}
cd Wan2.2
mkdir results
torchrun --nproc_per_node=8 run.py \
--task i2v-A14B \
--size 720*1280 --frame_num 81 \
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
--image "/app/Wan2.2/examples/i2v_input.JPG" \
--ulysses_size 8 --ring_size 1 \
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
--offload_model 0 \
--vae_dtype bfloat16 \
--allow_tf32 \
--compile
{% endif %}
{% if model.model == "FLUX.1" %}
cd Flux
mkdir results
torchrun --nproc_per_node=8 /app/Flux/run.py \
--model black-forest-labs/FLUX.1-dev \
--seed 42 \
--prompt "A small cat" \
--height 1024 \
--width 1024 \
--num_inference_steps 25 \
--max_sequence_length 256 \
--warmup_steps 5 \
--no_use_resolution_binning \
--ulysses_degree 8 \
--use_torch_compile \
--num_repetitions 1 \
--benchmark_output_directory results
{% endif %}
The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- For a list of other ready-made Docker images for AI with ROCm, see `AMD
Infinity Hub
<https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`__.

View File

@@ -20,3 +20,5 @@ subtrees:
entries:
- file: install/pytorch-comfyui
title: Install PyTorch and ComfyUI
- file: rocm-for-ai/xdit-diffusion-inference
title: xDiT diffusion inference

View File

@@ -0,0 +1,212 @@
function ready(proc) {
// Check if page is loaded. If so, init.
if (document.readyState !== "loading") {
proc();
} else {
// Otherwise, wait for DOMContentLoaded event.
document.addEventListener("DOMContentLoaded", proc);
}
}
ready(() => {
const ModelPicker = {
// Selector strings for DOM elements
SELECTORS: {
CONTAINER: "#vllm-benchmark-ud-params-picker",
MODEL_GROUP_BTN: 'div[data-param-k="model-group"][data-param-v]',
MODEL_PARAM_BTN: 'div[data-param-k="model"][data-param-v]',
MODEL_DOC: "div.model-doc",
},
CSS_CLASSES: {
HIDDEN: "hidden",
},
ATTRIBUTES: {
PARAM_KEY: "data-param-k", // URL search parameter key (i.e., "model")
PARAM_VALUE: "data-param-v", // URL search param value (e.g., "pyt_vllm_llama-3.1-8b", "pyt_vllm_llama-3.1-70b") -- these are MAD model tags
PARAM_GROUP: "data-param-group", // Model group (e.g., "llama", "mistral")
PARAM_STATE: "data-param-state", // Selection state
},
// Cache DOM elements
elements: {
container: null,
modelGroups: null,
modelParams: null,
modelDocs: null,
},
data: {
availableModels: new Set(),
modelsByGroup: new Map(),
modelToGroupMap: new Map(),
formattedModelClassMap: new Map(), //TODO
},
init() {
this.elements.container = document.querySelector(
this.SELECTORS.CONTAINER,
);
if (!this.elements.container) return;
this.cacheDOMElements();
if (!this.validateElements()) return;
this.buildModelData();
this.bindEvents();
this.initializeState();
},
cacheDOMElements() {
const { CONTAINER, MODEL_GROUP_BTN, MODEL_PARAM_BTN, MODEL_DOC } =
this.SELECTORS;
this.elements = {
container: document.querySelector(CONTAINER),
modelGroups: document.querySelectorAll(MODEL_GROUP_BTN),
modelParams: document.querySelectorAll(MODEL_PARAM_BTN),
modelDocs: document.querySelectorAll(MODEL_DOC),
};
},
validateElements() {
const { modelGroups, modelParams } = this.elements;
if (!modelGroups.length || !modelParams.length) {
console.warn("Model picker is missing required elements");
return false;
}
return true;
},
buildModelData() {
const { PARAM_VALUE, PARAM_GROUP } = this.ATTRIBUTES;
this.elements.modelParams.forEach((model) => {
const modelTag = model.getAttribute(PARAM_VALUE);
const groupTag = model.getAttribute(PARAM_GROUP);
if (!modelTag || !groupTag) return;
this.data.availableModels.add(modelTag);
this.data.modelToGroupMap.set(modelTag, groupTag);
// FIXME: this is because Sphinx auto-formats class names to use dashes
this.data.formattedModelClassMap.set(
modelTag,
modelTag.replace(/[^a-zA-Z0-9]/g, "-"),
);
if (!this.data.modelsByGroup.has(groupTag)) {
this.data.modelsByGroup.set(groupTag, []);
}
this.data.modelsByGroup.get(groupTag).push(modelTag);
});
},
// Event listeners for user interactions
bindEvents() {
const handleInteraction = (event) => {
const target = event.target.closest(`[${this.ATTRIBUTES.PARAM_KEY}]`);
if (!target) return;
const paramType = target.getAttribute(this.ATTRIBUTES.PARAM_KEY);
const paramValue = target.getAttribute(this.ATTRIBUTES.PARAM_VALUE);
if (paramType === "model") {
const groupTag = target.getAttribute(this.ATTRIBUTES.PARAM_GROUP);
if (groupTag) this.updateUI(paramValue, groupTag);
} else if (paramType === "model-group") {
const firstModelInGroup = this.data.modelsByGroup.get(paramValue)
?.[0];
if (firstModelInGroup) this.updateUI(firstModelInGroup, paramValue);
}
};
this.elements.container.addEventListener("click", handleInteraction);
this.elements.container.addEventListener("keydown", (event) => {
if (event.key === "Enter" || event.key === " ") {
event.preventDefault();
handleInteraction(event);
}
});
},
// Update the page based on the selected model
updateUI(modelTag, groupTag) {
const validModel = this.setModelSearchParam(modelTag);
// Update model group buttons
this.elements.modelGroups.forEach((group) => {
const isSelected =
group.getAttribute(this.ATTRIBUTES.PARAM_VALUE) === groupTag;
group.setAttribute(
this.ATTRIBUTES.PARAM_STATE,
isSelected ? "selected" : "",
);
group.setAttribute("aria-selected", isSelected.toString());
});
// Update model buttons
this.elements.modelParams.forEach((model) => {
const isInSelectedGroup =
model.getAttribute(this.ATTRIBUTES.PARAM_GROUP) === groupTag;
const isSelectedModel =
model.getAttribute(this.ATTRIBUTES.PARAM_VALUE) === validModel;
model.classList.toggle(this.CSS_CLASSES.HIDDEN, !isInSelectedGroup);
model.setAttribute(
this.ATTRIBUTES.PARAM_STATE,
isSelectedModel ? "selected" : "",
);
model.setAttribute("aria-selected", isSelectedModel.toString());
});
// Update visibility of doc sections
const formattedClass = this.data.formattedModelClassMap.get(validModel);
if (formattedClass) {
this.elements.modelDocs.forEach((doc) => {
doc.classList.toggle(
this.CSS_CLASSES.HIDDEN,
!doc.classList.contains(formattedClass),
);
});
}
},
// Get the current model from the URL search parameters.
getModelSearchParam() {
return new URLSearchParams(location.search).get("model");
},
// Set the model in the URL search parameters, or fallback to the first available one.
setModelSearchParam(modelTag) {
const defaultModel = [...this.data.availableModels][0];
const model = this.data.availableModels.has(modelTag)
? modelTag
: defaultModel;
const searchParams = new URLSearchParams(location.search);
searchParams.set("model", model);
history.replaceState(
{},
"",
`${location.pathname}?${searchParams.toString()}`,
);
return model;
},
// Initialize the UI state based on the current URL search parameter or default values.
initializeState() {
const currentModel = this.getModelSearchParam();
const validModel = this.setModelSearchParam(currentModel);
const initialGroup = this.data.modelToGroupMap.get(validModel) ??
[...this.data.modelsByGroup.keys()][0];
if (initialGroup) {
this.updateUI(validModel, initialGroup);
}
},
};
ModelPicker.init();
});