From 459283da3ce80e513fdc8fb583cddefcf51f6088 Mon Sep 17 00:00:00 2001
From: peterjunpark
Date: Wed, 17 Dec 2025 10:20:10 -0500
Subject: [PATCH] xDiT diffusion inference v25.12 documentation update (#5786)

* Add xdit-diffusion ROCm docs page.
* Update template formatting and fix sphinx warnings
* Add System Validation section.
* Add sw component versions/commits.
* Update to use latest v25.10 image instead of v25.9
* Update commands and add FLUX instructions.
* Update Flux instructions. Change image tag. Describe as diffusion inference instead of specifically video.
* git rm xdit-video-diffusion.rst
* Docs for v25.12
* Add hyperlinks to components
* Command fixes
* -Diffusers suffix
* Simplify yaml file and cleanup main rst page.
* Spelling, added 'js'
* fix merge conflict fix

---------

Co-authored-by: Kristoffer
---
 .wordlist.txt                                 |   1 +
 .../inference/xdit-inference-models.yaml      | 154 ++++++++---------
 .../previous-versions/xdit-25.10.rst          |   2 +
 .../previous-versions/xdit-history.rst        |   4 +-
 docs/how-to/rocm-for-ai/inference/index.rst   |   3 +-
 .../inference/xdit-diffusion-inference.rst    | 160 ++++++++++--------
 6 files changed, 160 insertions(+), 164 deletions(-)

diff --git a/.wordlist.txt b/.wordlist.txt
index 0fee3d0cf..889606056 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -261,6 +261,7 @@ Ioffe
 JAX's
 JAXLIB
 Jinja
+js
 JSON
 Jupyter
 KFD
diff --git a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
index e88b4ef0b..8d462524a 100644
--- a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
@@ -1,109 +1,91 @@
-xdit_diffusion_inference:
-  docker:
-  - version: v25-11
-    pull_tag: rocm/pytorch-xdit:v25.11
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
-    ROCm: 7.10.0
-    supported_models:
-      - group: Hunyuan Video
-        models:
-          - Hunyuan Video
-      - group: Wan-AI
-        models:
-          - Wan2.1
-          - Wan2.2
-      - group: FLUX
-        models:
-          - FLUX.1
-    whats_new:
-      - "Minor bug fixes and clarifications to READMEs."
-      - "Bumps TheRock, AITER, Diffusers, xDiT versions."
-      - "Changes Aiter rounding mode for faster gfx942 FWD Attention."
-    components:
-      TheRock: 3e3f834
-      rccl: d23d18f
-      composable_kernel: 2570462
-      rocm-libraries: 0588f07
-      rocm-systems: 473025a
-      torch: 73adac
-      torchvision: f5c6c2e
-      triton: 7416ffc
-      accelerate: 34c1779
-      aiter: de14bec
-      diffusers: 40528e9
-      xfuser: 83978b5
-      yunchang: 2c9b712
-
-  - version: v25-10
-    pull_tag: rocm/pytorch-xdit:v25.10
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
-    ROCm: 7.9.0
-    supported_models:
-      - group: Hunyuan Video
-        models:
-          - Hunyuan Video
-      - group: Wan-AI
-        models:
-          - Wan2.1
-          - Wan2.2
-      - group: FLUX
-        models:
-          - FLUX.1
-    whats_new:
-      - "First official xDiT Docker Release for Diffusion Inference."
-      - "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
-      - "Support Wan 2.1, Wan 2.2, HunyuanVideo and Flux workloads."
- components: - TheRock: 7afbe45 - rccl: 9b04b2a - composable_kernel: b7a806f - rocm-libraries: f104555 - rocm-systems: 25922d0 - torch: 2.10.0a0+gite9c9017 - torchvision: 0.22.0a0+966da7e - triton: 3.5.0+git52e49c12 - accelerate: 1.11.0.dev0 - aiter: 0.1.5.post4.dev20+ga25e55e79 - diffusers: 0.36.0.dev0 - xfuser: 0.4.4 - yunchang: 0.6.3.post1 - - model_groups: +docker: + pull_tag: rocm/pytorch-xdit:v25.12 + docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit + ROCm: 7.10.0 + whats_new: + - "Adds T2V and TI2V support for Wan models." + - "Adds support for SD-3.5 T2I model." + components: + TheRock: + version: 3e3f834 + url: https://github.com/ROCm/TheRock + rccl: + version: d23d18f + url: https://github.com/ROCm/rccl + composable_kernel: + version: 2570462 + url: https://github.com/ROCm/composable_kernel + rocm-libraries: + version: 0588f07 + url: https://github.com/ROCm/rocm-libraries + rocm-systems: + version: 473025a + url: https://github.com/ROCm/rocm-systems + torch: + version: 73adac + url: https://github.com/pytorch/pytorch + torchvision: + version: f5c6c2e + url: https://github.com/pytorch/vision + triton: + version: 7416ffc + url: https://github.com/triton-lang/triton + accelerate: + version: 34c1779 + url: https://github.com/huggingface/accelerate + aiter: + version: de14bec + url: https://github.com/ROCm/aiter + diffusers: + version: 40528e9 + url: https://github.com/huggingface/diffusers + xfuser: + version: ccba9d5 + url: https://github.com/xdit-project/xDiT + yunchang: + version: 2c9b712 + url: https://github.com/feifeibear/long-context-attention + supported_models: - group: Hunyuan Video - tag: hunyuan + js_tag: hunyuan models: - model: Hunyuan Video - page_tag: hunyuan_tag - model_name: hunyuanvideo model_repo: tencent/HunyuanVideo revision: refs/pr/18 url: https://huggingface.co/tencent/HunyuanVideo github: https://github.com/Tencent-Hunyuan/HunyuanVideo mad_tag: pyt_xdit_hunyuanvideo + js_tag: hunyuan_tag - group: Wan-AI - tag: wan + js_tag: wan models: - model: Wan2.1 - page_tag: wan_21_tag - model_name: wan2_1-i2v-14b-720p - model_repo: Wan-AI/Wan2.1-I2V-14B-720P - url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P + model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers + url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers github: https://github.com/Wan-Video/Wan2.1 mad_tag: pyt_xdit_wan_2_1 + js_tag: wan_21_tag - model: Wan2.2 - page_tag: wan_22_tag - model_name: wan2_2-i2v-a14b - model_repo: Wan-AI/Wan2.2-I2V-A14B - url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B + model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers + url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers github: https://github.com/Wan-Video/Wan2.2 mad_tag: pyt_xdit_wan_2_2 + js_tag: wan_22_tag - group: FLUX - tag: flux + js_tag: flux models: - model: FLUX.1 - page_tag: flux_1_tag - model_name: FLUX.1-dev model_repo: black-forest-labs/FLUX.1-dev url: https://huggingface.co/black-forest-labs/FLUX.1-dev github: https://github.com/black-forest-labs/flux mad_tag: pyt_xdit_flux + js_tag: flux_1_tag + - group: Stable Diffusion + js_tag: stablediffusion + models: + - model: stable-diffusion-3.5-large + model_repo: stabilityai/stable-diffusion-3.5-large + url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large + github: https://github.com/Stability-AI/sd3.5 + mad_tag: pyt_xdit_sd_3_5 + js_tag: stable_diffusion_3_5_large_tag diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst 
b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
index 92c2e908a..9bbbd84a9 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 .. meta::
   :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X
                 accelerators using prebuilt and optimized docker images.
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
index a93c66c1e..28609ae59 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
@@ -17,7 +17,7 @@ benchmarking, see the version-specific documentation.

   * - ``rocm/pytorch-xdit:v25.11`` (latest)
     -
-      * ROCm 7.10.0 preview
+      * `ROCm 7.10.0 preview `__
       * TheRock 3e3f834
       * rccl d23d18f
       * composable_kernel 2570462
@@ -37,7 +37,7 @@

   * - ``rocm/pytorch-xdit:v25.10``
     -
-      * ROCm 7.9.0 preview
+      * `ROCm 7.9.0 preview `__
       * TheRock 7afbe45
       * rccl 9b04b2a
       * composable_kernel b7a806f
diff --git a/docs/how-to/rocm-for-ai/inference/index.rst b/docs/how-to/rocm-for-ai/inference/index.rst
index f12054b59..353c05b53 100644
--- a/docs/how-to/rocm-for-ai/inference/index.rst
+++ b/docs/how-to/rocm-for-ai/inference/index.rst
@@ -26,7 +26,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram

 - :doc:`SGLang inference performance testing `

-- :doc:`Deploying your model `
-
 - :doc:`xDiT diffusion inference `

+- :doc:`Deploying your model `
diff --git a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
index 6e71d8431..b9b1da113 100644
--- a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
@@ -11,11 +11,14 @@ xDiT diffusion inference

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
-   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+   {% set docker = data.docker %}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
+   a prebuilt, optimized environment based on `xDiT
+   <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
+   video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
+   and MI300X (gfx942) GPUs.

-   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
-   benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
   The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_ and includes the following components:
@@ -27,9 +30,9 @@ xDiT diffusion inference
      * - Software component
        - Version

-   {% for component_name, component_version in docker.components.items() %}
-      * - {{ component_name }}
-        - {{ component_version }}
+   {% for component_name, component_data in docker.components.items() %}
+      * - `{{ component_name }} <{{ component_data.url }}>`_
+        - {{ component_data.version }}
    {% endfor %}

 Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
@@ -37,10 +40,10 @@ For preview and development releases, see `amdsiloai/pytorch-xdit
Model
- {% for model_group in model_groups %} - {% if model_group.group in supported_lookup %} -
{{ model_group.group }}
- {% endif %} + {% for model_group in docker.supported_models %} +
{{ model_group.group }}
{% endfor %}
@@ -83,29 +77,24 @@ vary by model -- select one to get started.
Variant
- {% for model_group in model_groups %} - {% if model_group.group in supported_lookup %} - {% set supported_models = supported_lookup[model_group.group] %} + {% for model_group in docker.supported_models %} {% set models = model_group.models %} {% for model in models %} - {% if model.model in supported_models %} {% if models|length % 3 == 0 %} -
{{ model.model }}
+
{{ model.model }}
{% else %} -
{{ model.model }}
- {% endif %} +
{{ model.model }}
{% endif %} {% endfor %} - {% endif %} {% endfor %}
-   {% for model_group in model_groups %}
+   {% for model_group in docker.supported_models %}
    {% for model in model_group.models %}

-   .. container:: model-doc {{ model.page_tag }}
+   .. container:: model-doc {{ model.js_tag }}

       .. note::

@@ -136,7 +125,7 @@ Pull the Docker image

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set docker = data.docker %}

    For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image. Pull the image using the following command:

@@ -148,15 +137,17 @@ Pull the Docker image
 Validate and benchmark
 ======================

-Once the image has been downloaded you can follow these steps to
-run benchmarks and generate outputs.
-
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% for model_group in model_groups %}
+   {% set docker = data.docker %}
+
+   Once the image has been downloaded, you can follow these steps to
+   run benchmarks and generate outputs.
+
+   {% for model_group in docker.supported_models %}
    {% for model in model_group.models %}

-   .. container:: model-doc {{model.page_tag}}
+   .. container:: model-doc {{model.js_tag}}

       The following commands are written for {{ model.model }}. See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.

@@ -171,12 +162,11 @@ You can either use an existing Hugging Face cache or download the model fresh in

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
-   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+   {% set docker = data.docker %}

-   {% for model_group in model_groups %}
+   {% for model_group in docker.supported_models %}
    {% for model in model_group.models %}
-   .. container:: model-doc {{model.page_tag}}
+   .. container:: model-doc {{model.js_tag}}

       .. tab-set::

@@ -264,11 +254,12 @@ Run inference

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
-   {% for model_group in model_groups %}
+   {% set docker = data.docker %}
+
+   {% for model_group in docker.supported_models %}
    {% for model in model_group.models %}

-   .. container:: model-doc {{ model.page_tag }}
+   .. container:: model-doc {{ model.js_tag }}

       .. tab-set::

@@ -309,7 +300,7 @@ Run inference
                      mkdir results

                      torchrun --nproc_per_node=8 run.py \
-                        --model tencent/HunyuanVideo \
+                        --model {{ model.model_repo }} \
                         --prompt "In the large cage, two puppies were wagging their tails at each other."
\ --height 720 --width 1280 --num_frames 129 \ --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ @@ -317,40 +308,45 @@ Run inference --enable_tiling --enable_slicing \ --use_torch_compile \ --bench_output results + {% endif %} {% if model.model == "Wan2.1" %} - cd Wan2.1 + cd Wan mkdir results - torchrun --nproc_per_node=8 run.py \ - --task i2v-14B \ - --size 720*1280 --frame_num 81 \ - --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \ - --image "/app/Wan2.1/examples/i2v_input.JPG" \ - --ulysses_size 8 --ring_size 1 \ + torchrun --nproc_per_node=8 /app/Wan/run.py \ + --task i2v \ + --height 720 \ + --width 1280 \ + --model {{ model.model_repo }} \ + --img_file_path /app/Wan/i2v_input.JPG \ + --ulysses_degree 8 \ + --seed 42 \ + --num_frames 81 \ --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ - --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ - --offload_model 0 \ - --vae_dtype bfloat16 \ - --allow_tf32 \ - --compile + --num_repetitions 1 \ + --num_inference_steps 40 \ + --use_torch_compile + {% endif %} {% if model.model == "Wan2.2" %} - cd Wan2.2 + cd Wan mkdir results - torchrun --nproc_per_node=8 run.py \ - --task i2v-A14B \ - --size 720*1280 --frame_num 81 \ - --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \ - --image "/app/Wan2.2/examples/i2v_input.JPG" \ - --ulysses_size 8 --ring_size 1 \ + torchrun --nproc_per_node=8 /app/Wan/run.py \ + --task i2v \ + --height 720 \ + --width 1280 \ + --model {{ model.model_repo }} \ + --img_file_path /app/Wan/i2v_input.JPG \ + --ulysses_degree 8 \ + --seed 42 \ + --num_frames 81 \ --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ - --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ - --offload_model 0 \ - --vae_dtype bfloat16 \ - --allow_tf32 \ - --compile + --num_repetitions 1 \ + --num_inference_steps 40 \ + --use_torch_compile + {% endif %} {% if model.model == "FLUX.1" %} @@ -358,7 +354,7 @@ Run inference mkdir results torchrun --nproc_per_node=8 /app/Flux/run.py \ - --model black-forest-labs/FLUX.1-dev \ + --model {{ model.model_repo }} \ --seed 42 \ --prompt "A small cat" \ --height 1024 \ @@ -369,12 +365,28 @@ Run inference --no_use_resolution_binning \ --ulysses_degree 8 \ --use_torch_compile \ - --num_repetitions 1 \ - --benchmark_output_directory results + --num_repetitions 50 {% endif %} - The generated video will be stored under the results directory. 
For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
+                  {% if model.model == "stable-diffusion-3.5-large" %}
+                     cd StableDiffusion3.5
+                     mkdir results
+
+                     torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
+                        --model {{ model.model_repo }} \
+                        --num_inference_steps 28 \
+                        --prompt "A capybara holding a sign that reads Hello World" \
+                        --use_torch_compile \
+                        --pipefusion_parallel_degree 4 \
+                        --use_cfg_parallel \
+                        --num_repetitions 50 \
+                        --dtype torch.float16 \
+                        --output_path results
+
+                  {% endif %}
+
+   The generated video or image will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large" %}benchmark_results.csv{% endif %}

   {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}

@@ -385,4 +397,4 @@ Previous versions
 =================

 See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases
-of xDiT diffusion inference performance testing.
\ No newline at end of file
+of xDiT diffusion inference performance testing.
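
The documentation updated by this patch assumes the benchmark commands run inside the container, but the ``docker run`` invocation itself is outside the changed hunks. As a minimal sketch only — the device flags below are the standard ROCm container options and the Hugging Face cache mount is an assumption about the host layout, not commands taken from this patch — launching the v25.12 image could look like this:

.. code-block:: shell

   # Start the xDiT container with GPU access. The device flags are the
   # usual ROCm container options; the cache mount and HF_HOME path are
   # assumptions -- adjust them to your host layout.
   docker run -it --rm \
     --device=/dev/kfd \
     --device=/dev/dri \
     --group-add video \
     --ipc=host \
     --shm-size 64G \
     --security-opt seccomp=unconfined \
     -v "${HF_HOME:-$HOME/.cache/huggingface}:/root/.cache/huggingface" \
     -e HF_HOME=/root/.cache/huggingface \
     rocm/pytorch-xdit:v25.12

Inside the container, the per-model entry points referenced by the patch live under ``/app`` (for example ``/app/Flux/run.py``, ``/app/Wan/run.py``, and ``/app/StableDiffusion3.5/run.py``), so the ``torchrun`` commands above can be used as written once the model weights are available.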
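
The "Download the model" step lets you either reuse an existing Hugging Face cache or fetch weights fresh. As a sketch, assuming the ``huggingface_hub`` CLI is available in the container (the patch does not state this), pre-fetching the HunyuanVideo weights at the pinned revision from the model list could look like:

.. code-block:: shell

   # Log in first if the model is gated (e.g. black-forest-labs/FLUX.1-dev
   # requires accepting a license), then fetch the weights into the HF cache.
   # The pinned revision (refs/pr/18) comes from this patch's model list.
   huggingface-cli login
   huggingface-cli download tencent/HunyuanVideo --revision refs/pr/18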
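
For anyone extending the simplified data file, the restructured ``xdit-inference-models.yaml`` keys everything off a single ``docker`` mapping plus a ``supported_models`` list, with ``js_tag`` values driving the model-selector containers in the RST page. A hypothetical new entry (the model, URLs, and tags shown are placeholders, not something this release supports) would follow the same shape:

.. code-block:: yaml

   # Hypothetical entry: field names match the schema introduced by this
   # patch; the values below are illustrative only.
   supported_models:
     - group: Example Group
       js_tag: example
       models:
         - model: example-model
           model_repo: example-org/example-model
           url: https://huggingface.co/example-org/example-model
           github: https://github.com/example-org/example-model
           mad_tag: pyt_xdit_example
           js_tag: example_model_tag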