From 4132a2609c49ba2ef735539d816dcaa87b1ccae7 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Mon, 27 Oct 2025 14:56:55 +0100 Subject: [PATCH] Add xdit diffusion docs (#5576) * Add xdit video diffusion base page. * Update supported accelerators. * Remove dependency on mad-tags. * Update docker pull section. * Update container launch instructions. * Improve launch instruction options and layout. * Add benchmark result outputs. * Fix wrong HunyuanVideo path * Finalize instructions. * Consistent title. * Make page and side-bar titles the same. * Updated wordlist. Removed note container reg HF. * Remove fp8_gemms in command and add release notes. * Update accelerators naming. * Add note regarding OOB performance. * Fix admonition box. * Overall fixes. --- .wordlist.txt | 6 + docs/conf.py | 1 + .../inference/xdit-inference-models.yaml | 38 +++ docs/how-to/rocm-for-ai/inference/index.rst | 4 +- .../inference/xdit-video-diffusion.rst | 322 ++++++++++++++++++ docs/sphinx/_toc.yml.in | 2 + 6 files changed, 372 insertions(+), 1 deletion(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst diff --git a/.wordlist.txt b/.wordlist.txt index aed9dc1cc..68185fbe9 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -220,6 +220,7 @@ href Hyperparameters HybridEngine Huggingface +Hunyuan IB ICD ICT @@ -531,6 +532,7 @@ UAC UC UCC UCX +ud UE UIF UMC @@ -842,6 +844,7 @@ pallas parallelization parallelizing param +params parameterization passthrough pe @@ -888,6 +891,7 @@ querySelectorAll queueing qwen radeon +rc rccl rdc rdma @@ -1052,6 +1056,8 @@ writebacks wrreq wzo xargs +xdit +xDiT xGMI xPacked xz diff --git a/docs/conf.py b/docs/conf.py index 5a6298e04..85c6863ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -175,6 +175,7 @@ article_pages = [ {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]}, {"file": 
"how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/xdit-video-diffusion", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml new file mode 100644 index 000000000..60f52aae7 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml @@ -0,0 +1,38 @@ +xdit_video_diffusion: + docker: + pull_tag: amdsiloai/pytorch-xdit:v25.9 + docker_hub_url: https://hub.docker.com/r/amdsiloai/pytorch-xdit + ROCm: 7.0.0rc + components: + TheRock: 7afbe45 + rccl: 9b04b2a + composable_kernel: b7a806f + rocm-libraries: f104555 + rocm-systems: 25922d0 + torch: 2.10.0a0+git3caf6da + torchvision: 0.22.0a0+966da7e + triton: 3.5.0+gitea06d636 + + model_groups: + - group: Hunyuan Video + tag: hunyuan + models: + - model: Hunyuan Video + model_name: hunyuanvideo + model_repo: tencent/HunyuanVideo + revision: refs/pr/18 + url: https://huggingface.co/tencent/HunyuanVideo + github: https://github.com/Tencent-Hunyuan/HunyuanVideo + - group: Wan-AI + tag: wan + models: + - model: Wan2.1 + model_name: wan2.1_i2v_14b_720p + model_repo: Wan-AI/Wan2.1-I2V-14B-720P + url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P + github: https://github.com/Wan-Video/Wan2.1 + - model: Wan2.2 + model_name: wan2.2-i2v-a14b + model_repo: Wan-AI/Wan2.2-I2V-A14B + url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B + github: https://github.com/Wan-Video/Wan2.2 \ No newline at end of file diff --git a/docs/how-to/rocm-for-ai/inference/index.rst b/docs/how-to/rocm-for-ai/inference/index.rst index 6eb705141..4f66fd82f 100644 --- 
a/docs/how-to/rocm-for-ai/inference/index.rst +++ b/docs/how-to/rocm-for-ai/inference/index.rst @@ -26,4 +26,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram - :doc:`SGLang inference performance testing ` -- :doc:`Deploying your model ` +- :doc:`xDiT video inference ` + +- :doc:`Deploying your model ` \ No newline at end of file diff --git a/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst b/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst new file mode 100644 index 000000000..af98cc187 --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/xdit-video-diffusion.rst @@ -0,0 +1,322 @@ +.. meta:: + :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using + prebuilt and optimized docker images. + :keywords: xDiT, diffusion, video, video generation, validate, benchmark + +******************** +xDiT video inference +******************** + +.. _xdit-video-diffusion: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_video_diffusion.docker %} + {% set model_groups = data.xdit_video_diffusion.model_groups%} + + The `amdsiloai/pytorch-xdit Docker <{{ docker.docker_hub_url }}>`_ image offers a prebuilt, optimized environment based on `xDiT `_ for + benchmarking diffusion model video generation on + AMD Instinct™ MI355X, MI350X (gfx950), and MI300X GPUs. + The image runs ROCm `{{docker.ROCm}}` based on `TheRock `_ + and includes the following components: + + .. tab-set:: + + .. tab-item:: {{ docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + +Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark. 
+ +What's new +========== + +- Initial release +- ROCm: 7.0.0rc +- Added support for AMD Instinct™ MI355X, MI350X (gfx950), and MI300X (gfx942) GPUs. +- Added support for Wan 2.1, Wan 2.2 and Hunyuan Video models with MIOpen optimizations. + +.. _xdit-video-diffusion-supported-models: + +Supported models +================ + +The following models are supported for inference performance benchmarking. +Some instructions, commands, and recommendations in this documentation might +vary by model -- select one to get started. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_video_diffusion.docker %} + {% set model_groups = data.xdit_video_diffusion.model_groups%} + + .. raw:: html + 
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.model_name}} + + .. note:: + + To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ + or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an + external license agreement through a third party. + + {% endfor %} + {% endfor %} + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_video_diffusion.docker %} + + For this tutorial, it's recommended to use the ``{{ docker.pull_tag }}`` Docker image. + Pull the image using the following command: + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + +Validate and benchmark +====================== + +Once the image has been downloaded, you can follow these steps to +run benchmarks and generate a video. + +.. warning:: + + If your host/OS ROCm installation is older than 6.4.2 (check with ``apt show rocm-libs``) you need to export + the ``HSA_NO_SCRATCH_RECLAIM=1`` environment variable inside the container, or the workload will crash. + If possible, ask your system administrator to upgrade ROCm. + +.. 
datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% for model_group in data.xdit_video_diffusion.model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.model_name}} + + The following commands are written for {{ model.model }}. + See :ref:`xdit-video-diffusion-supported-models` to switch to another available model. + + {% endfor %} + {% endfor %} + +Choose your setup method +------------------------ + +You can either use an existing Hugging Face cache or download the model fresh inside the container. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_video_diffusion.docker %} + {% set model_groups = data.xdit_video_diffusion.model_groups%} + + {% for model_group in model_groups %} + {% for model in model_group.models %} + .. container:: model-doc {{model.model_name}} + + .. tab-set:: + + .. tab-item:: Option 1: Use existing Hugging Face cache + + If you already have models downloaded on your host system, you can mount your existing cache. + + 1. Set your Hugging Face cache location. + + .. code-block:: shell + + export HF_HOME=/your/hf_cache/location + + 2. Download the model (if not already cached). + + .. code-block:: shell + + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + 3. Launch the container with mounted cache. + + .. code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + -e HF_HOME=/app/huggingface_models \ + -v $HF_HOME:/app/huggingface_models \ + {{ docker.pull_tag }} + + .. 
tab-item:: Option 2: Download inside container + + Use this option if you prefer to keep the container self-contained or don't have an existing cache. + + 1. Launch the container. + + .. code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + {{ docker.pull_tag }} + + 2. Inside the container, set the Hugging Face cache location and download the model. + + .. code-block:: shell + + export HF_HOME=/your/hf_cache/location + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + .. warning:: + + Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume. + {% endfor %} + {% endfor %} + +Run inference +============= + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set model_groups = data.xdit_video_diffusion.model_groups%} + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.model_name }} + + To run the benchmarks for {{ model.model }}, use the following command: + + .. code-block:: shell + {% if model.model == "Hunyuan Video" %} + cd /app/Hunyuanvideo + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --model tencent/HunyuanVideo \ + --prompt "In the large cage, two puppies were wagging their tails at each other." 
\ + --height 720 --width 1280 --num_frames 129 \ + --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ + --ulysses_degree 8 \ + --enable_tiling --enable_slicing \ + --use_torch_compile \ + --bench_output results + {% endif %} + {% if model.model == "Wan2.1" %} + cd Wan2.1 + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --task i2v-14B \ + --size 720*1280 --frame_num 81 \ + --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \ + --image "/app/Wan2.1/examples/i2v_input.JPG" \ + --ulysses_size 8 --ring_size 1 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ + --offload_model 0 \ + --vae_dtype bfloat16 + {% endif %} + {% if model.model == "Wan2.2" %} + cd Wan2.2 + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --task i2v-A14B \ + --size 720*1280 --frame_num 81 \ + --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \ + --image "/app/Wan2.2/examples/i2v_input.JPG" \ + --ulysses_size 8 --ring_size 1 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. 
The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ + --offload_model 0 \ + --vae_dtype bfloat16 + {% endif %} + + {% if model.model in ["Wan2.1", "Wan2.2"] %} + For additional performance improvements, consider adding the ``--compile`` flag to the above command. Note that this can significantly increase startup time on the first call. + {% endif %} + + The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% endif %} + + {% endfor %} + {% endfor %} + +Known limitations +================= + +- OOB tuning: Currently only Instinct MI300X has been tuned for in the gfx942 + series. Other gfx942 GPUs might not perform optimally out-of-the-box. \ No newline at end of file diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index a0a5084ff..253f4416f 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -117,6 +117,8 @@ subtrees: title: SGLang inference performance testing - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst title: SGLang distributed inference with Mooncake + - file: how-to/rocm-for-ai/inference/xdit-video-diffusion.rst + title: xDiT video inference - file: how-to/rocm-for-ai/inference/deploy-your-model.rst title: Deploy your model