From 3a43bacddacf03c161fe046706ea3f91d7248f59 Mon Sep 17 00:00:00 2001
From: peterjunpark
Date: Mon, 22 Dec 2025 11:05:32 -0500
Subject: [PATCH] Update xdit diffusion inference history (#5808)

* Update xdit diffusion inference history

* fix

---
 .../xdit_25.11-inference-models.yaml          | 109 +++++
 .../previous-versions/vllm-history.rst        |   2 +-
 .../previous-versions/xdit-25.10.rst          |   8 +-
 .../previous-versions/xdit-25.11.rst          | 389 ++++++++++++++++++
 .../previous-versions/xdit-history.rst        |  38 +-
 .../training/benchmark-docker/jax-maxtext.rst |   2 -
 6 files changed, 514 insertions(+), 34 deletions(-)
 create mode 100644 docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
 create mode 100644 docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11.rst

diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
new file mode 100644
index 000000000..e88b4ef0b
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
@@ -0,0 +1,109 @@
+xdit_diffusion_inference:
+  docker:
+    - version: v25-11
+      pull_tag: rocm/pytorch-xdit:v25.11
+      docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
+      ROCm: 7.10.0
+      supported_models:
+        - group: Hunyuan Video
+          models:
+            - Hunyuan Video
+        - group: Wan-AI
+          models:
+            - Wan2.1
+            - Wan2.2
+        - group: FLUX
+          models:
+            - FLUX.1
+      whats_new:
+        - "Minor bug fixes and clarifications to READMEs."
+        - "Bumps TheRock, AITER, Diffusers, and xDiT versions."
+        - "Changes the AITER rounding mode for faster gfx942 forward (FWD) attention."
+      components:
+        TheRock: 3e3f834
+        rccl: d23d18f
+        composable_kernel: 2570462
+        rocm-libraries: 0588f07
+        rocm-systems: 473025a
+        torch: 73adac
+        torchvision: f5c6c2e
+        triton: 7416ffc
+        accelerate: 34c1779
+        aiter: de14bec
+        diffusers: 40528e9
+        xfuser: 83978b5
+        yunchang: 2c9b712
+
+    - version: v25-10
+      pull_tag: rocm/pytorch-xdit:v25.10
+      docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
+      ROCm: 7.9.0
+      supported_models:
+        - group: Hunyuan Video
+          models:
+            - Hunyuan Video
+        - group: Wan-AI
+          models:
+            - Wan2.1
+            - Wan2.2
+        - group: FLUX
+          models:
+            - FLUX.1
+      whats_new:
+        - "First official xDiT Docker release for diffusion inference."
+        - "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
+        - "Supports Wan 2.1, Wan 2.2, HunyuanVideo, and FLUX workloads."
+      components:
+        TheRock: 7afbe45
+        rccl: 9b04b2a
+        composable_kernel: b7a806f
+        rocm-libraries: f104555
+        rocm-systems: 25922d0
+        torch: 2.10.0a0+gite9c9017
+        torchvision: 0.22.0a0+966da7e
+        triton: 3.5.0+git52e49c12
+        accelerate: 1.11.0.dev0
+        aiter: 0.1.5.post4.dev20+ga25e55e79
+        diffusers: 0.36.0.dev0
+        xfuser: 0.4.4
+        yunchang: 0.6.3.post1
+
+  model_groups:
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          page_tag: hunyuan_tag
+          model_name: hunyuanvideo
+          model_repo: tencent/HunyuanVideo
+          revision: refs/pr/18
+          url: https://huggingface.co/tencent/HunyuanVideo
+          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
+          mad_tag: pyt_xdit_hunyuanvideo
+    - group: Wan-AI
+      tag: wan
+      models:
+        - model: Wan2.1
+          page_tag: wan_21_tag
+          model_name: wan2_1-i2v-14b-720p
+          model_repo: Wan-AI/Wan2.1-I2V-14B-720P
+          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
+          github: https://github.com/Wan-Video/Wan2.1
+          mad_tag: pyt_xdit_wan_2_1
+        - model: Wan2.2
+          page_tag: wan_22_tag
+          model_name: wan2_2-i2v-a14b
+          model_repo: Wan-AI/Wan2.2-I2V-A14B
+          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
+          github: https://github.com/Wan-Video/Wan2.2
+          mad_tag: pyt_xdit_wan_2_2
+    - group: FLUX
+      tag: flux
+      models:
+        - model: FLUX.1
+          page_tag: flux_1_tag
+          model_name: FLUX.1-dev
+          model_repo: black-forest-labs/FLUX.1-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+          github: https://github.com/black-forest-labs/flux
+          mad_tag: pyt_xdit_flux

diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
index c1e50d7e3..43e5b0dfc 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -31,7 +31,7 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub
-      * :doc:`Documentation `
+      * :doc:`Documentation `
       * `Docker Hub `__

   * - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``

diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
index 9bbbd84a9..5e41675ce 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
@@ -9,7 +9,7 @@
 xDiT diffusion inference
 ************************

-.. _xdit-video-diffusion:
+.. _xdit-video-diffusion-2510:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml

@@ -152,7 +152,7 @@ run benchmarks and generate outputs.
 {% endfor %}
 {% endfor %}

-.. _xdit-video-diffusion-setup:
+.. _xdit-video-diffusion-setup-2510:

 Prepare the model
 -----------------

@@ -160,7 +160,7 @@ Prepare the model
 .. note::

    If you're using ROCm MAD to :ref:`run your model
-   `, you can skip this section. MAD will handle
+   `, you can skip this section. MAD will handle
    starting the container and downloading required models inside the container.

 You can either use an existing Hugging Face cache or download the model fresh inside the container.

@@ -255,7 +255,7 @@ You can either use an existing Hugging Face cache or download the model fresh in

-.. _xdit-video-diffusion-run:
+.. _xdit-video-diffusion-run-2510:

 Run inference
 =============

diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11.rst
new file mode 100644
index 000000000..8804f6497
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11.rst
@@ -0,0 +1,389 @@
+:orphan:
+
+.. meta::
+   :description: Learn to validate diffusion model video generation on MI300X, MI325X, MI350X, and MI355X accelerators using
+                 prebuilt and optimized Docker images.
+   :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
+
+************************
+xDiT diffusion inference
+************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of the ROCm xDiT
+   diffusion inference documentation. See
+   :doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
+   version.
+
+.. _xdit-video-diffusion-2511:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
+   benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
+   The image runs ROCm **{{ docker.ROCm }}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
+   and includes the following components:
+
+   .. dropdown:: Software components
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Software component
+           - Version
+
+         {% for component_name, component_version in docker.components.items() %}
+         * - {{ component_name }}
+           - {{ component_version }}
+         {% endfor %}
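+
+   To confirm what the image actually ships, you can list the key packages
+   from inside a container. This quick check is a sketch rather than part of
+   the official workflow; the package names below are taken from the
+   components table:
+
+   .. code-block:: shell
+
+      # Print the installed versions of the main inference components.
+      docker run --rm {{ docker.pull_tag }} pip list | grep -iE 'torch|triton|aiter|diffusers|xfuser|yunchang'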
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
+
+What's new
+==========
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   {% for item in docker.whats_new %}
+   * {{ item }}
+   {% endfor %}
+
+.. _xdit-video-diffusion-supported-models-2511:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   {# Create a lookup for supported models #}
+   {% set supported_lookup = {} %}
+   {% for supported in docker.supported_models %}
+   {% set _ = supported_lookup.update({supported.group: supported.models}) %}
+   {% endfor %}
+
+   .. raw:: html
+
+      <div class="model-selector">
+         <div class="selector-row">
+            <div class="selector-label">Model</div>
+            <div class="selector-buttons">
+      {% for model_group in model_groups %}
+      {% if model_group.group in supported_lookup %}
+               <button class="model-group-button" data-group="{{ model_group.tag }}">{{ model_group.group }}</button>
+      {% endif %}
+      {% endfor %}
+            </div>
+         </div>
+         <div class="selector-row">
+            <div class="selector-label">Variant</div>
+            <div class="selector-buttons">
+      {% for model_group in model_groups %}
+      {% if model_group.group in supported_lookup %}
+      {% set supported_models = supported_lookup[model_group.group] %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+      {% if model.model in supported_models %}
+      {% if models|length % 3 == 0 %}
+               <button class="model-button three-col" data-model="{{ model.page_tag }}">{{ model.model }}</button>
+      {% else %}
+               <button class="model-button" data-model="{{ model.page_tag }}">{{ model.model }}</button>
+      {% endif %}
+      {% endif %}
+      {% endfor %}
+      {% endif %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+   {% for model_group in model_groups %}
+   {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.page_tag }}
+
+      .. note::
+
+         To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
+         or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
+         external license agreement through a third party.
+
+   {% endfor %}
+   {% endfor %}
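+
+Some models, such as FLUX.1-dev, are gated on Hugging Face and require you to
+accept a license agreement before downloading. If a download step later in
+this guide fails with an authorization error, log in with your access token
+first -- a minimal sketch, assuming you have already created a token with
+read access:
+
+.. code-block:: shell
+
+   # Authenticate the Hugging Face CLI with your personal access token.
+   huggingface-cli login --token "$YOUR_HF_TOKEN"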
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization ` guide to properly configure your system settings
+before starting.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+`. This suite of tests will help you verify and fine-tune your
+system's configuration.
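+
+As a quick example of one such check, NUMA auto-balancing should normally be
+disabled on the host for benchmarking. This sketch covers only that single
+setting -- see the guides above for the full procedure:
+
+.. code-block:: shell
+
+   # 0 means NUMA auto-balancing is disabled (the expected value).
+   cat /proc/sys/kernel/numa_balancing
+
+   # Disable it if the value is 1.
+   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'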
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+
+   For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
+   Pull the image using the following command:
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+Validate and benchmark
+======================
+
+Once the image has been downloaded, you can follow these steps to
+run benchmarks and generate outputs.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   {% for model_group in model_groups %}
+   {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.page_tag }}
+
+      The following commands are written for {{ model.model }}.
+      See :ref:`xdit-video-diffusion-supported-models-2511` to switch to another available model.
+
+   {% endfor %}
+   {% endfor %}
+
+Choose your setup method
+------------------------
+
+You can either use an existing Hugging Face cache or download the model fresh inside the container.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   {% for model_group in model_groups %}
+   {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.page_tag }}
+
+      .. tab-set::
+
+         .. tab-item:: Option 1: Use existing Hugging Face cache
+
+            If you already have models downloaded on your host system, you can mount your existing cache.
+
+            1. Set your Hugging Face cache location.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/your/hf_cache/location
+
+            2. Download the model (if not already cached).
+
+               .. code-block:: shell
+
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %}--revision {{ model.revision }}{% endif %}
+
+            3. Launch the container with the mounted cache.
+
+               .. code-block:: shell
+
+                  docker run \
+                     -it --rm \
+                     --cap-add=SYS_PTRACE \
+                     --security-opt seccomp=unconfined \
+                     --user root \
+                     --device=/dev/kfd \
+                     --device=/dev/dri \
+                     --group-add video \
+                     --ipc=host \
+                     --network host \
+                     --privileged \
+                     --shm-size 128G \
+                     --name pytorch-xdit \
+                     -e HSA_NO_SCRATCH_RECLAIM=1 \
+                     -e OMP_NUM_THREADS=16 \
+                     -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                     -e HF_HOME=/app/huggingface_models \
+                     -v $HF_HOME:/app/huggingface_models \
+                     {{ docker.pull_tag }}
+
+         .. tab-item:: Option 2: Download inside container
+
+            Use this option if you prefer to keep the container self-contained or don't have an existing cache.
+
+            1. Launch the container.
+
+               .. code-block:: shell
+
+                  docker run \
+                     -it --rm \
+                     --cap-add=SYS_PTRACE \
+                     --security-opt seccomp=unconfined \
+                     --user root \
+                     --device=/dev/kfd \
+                     --device=/dev/dri \
+                     --group-add video \
+                     --ipc=host \
+                     --network host \
+                     --privileged \
+                     --shm-size 128G \
+                     --name pytorch-xdit \
+                     -e HSA_NO_SCRATCH_RECLAIM=1 \
+                     -e OMP_NUM_THREADS=16 \
+                     -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                     {{ docker.pull_tag }}
+
+            2. Inside the container, set the Hugging Face cache location and download the model.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/app/huggingface_models
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %}--revision {{ model.revision }}{% endif %}
+
+            .. warning::
+
+               Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
+
+   {% endfor %}
+   {% endfor %}
+
+Run inference
+=============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   {% for model_group in model_groups %}
+   {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.page_tag }}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. On the host machine, use this command to run the performance benchmark test on
+               the `{{ model.model }} <{{ model.url }}>`_ model using one node.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                     --tags {{ model.mad_tag }} \
+                     --keep-model-dir \
+                     --live-output
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{ model.mad_tag }}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
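+
+            The CSV reports are plain comma-separated files. To eyeball the
+            results quickly, you can render them as an aligned table -- a
+            minimal sketch, assuming the report was written to the current
+            directory:
+
+            .. code-block:: shell
+
+               # Pretty-print the throughput report, one row per benchmark entry.
+               column -s, -t < {{ model.mad_tag }}_throughput.csv | less -S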
+
+         .. tab-item:: Standalone benchmarking
+
+            To run the benchmarks for {{ model.model }}, use the following command:
+
+            .. code-block:: shell
+
+               {% if model.model == "Hunyuan Video" %}
+               cd /app/Hunyuanvideo
+               mkdir results
+               torchrun --nproc_per_node=8 run.py \
+                  --model tencent/HunyuanVideo \
+                  --prompt "In the large cage, two puppies were wagging their tails at each other." \
+                  --height 720 --width 1280 --num_frames 129 \
+                  --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
+                  --ulysses_degree 8 \
+                  --enable_tiling --enable_slicing \
+                  --use_torch_compile \
+                  --bench_output results
+               {% endif %}
+               {% if model.model == "Wan2.1" %}
+               cd Wan2.1
+               mkdir results
+               torchrun --nproc_per_node=8 run.py \
+                  --task i2v-14B \
+                  --size 720*1280 --frame_num 81 \
+                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
+                  --image "/app/Wan2.1/examples/i2v_input.JPG" \
+                  --ulysses_size 8 --ring_size 1 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+                  --offload_model 0 \
+                  --vae_dtype bfloat16 \
+                  --allow_tf32 \
+                  --compile
+               {% endif %}
+               {% if model.model == "Wan2.2" %}
+               cd Wan2.2
+               mkdir results
+               torchrun --nproc_per_node=8 run.py \
+                  --task i2v-A14B \
+                  --size 720*1280 --frame_num 81 \
+                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
+                  --image "/app/Wan2.2/examples/i2v_input.JPG" \
+                  --ulysses_size 8 --ring_size 1 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+                  --offload_model 0 \
+                  --vae_dtype bfloat16 \
+                  --allow_tf32 \
+                  --compile
+               {% endif %}
+               {% if model.model == "FLUX.1" %}
+               cd Flux
+               mkdir results
+               torchrun --nproc_per_node=8 /app/Flux/run.py \
+                  --model black-forest-labs/FLUX.1-dev \
+                  --seed 42 \
+                  --prompt "A small cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 25 \
+                  --max_sequence_length 256 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --num_repetitions 1 \
+                  --benchmark_output_directory results
+               {% endif %}
+
+            The generated video or image will be stored under the ``results`` directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``.{% elif model.model == "FLUX.1" %}``results/timing.json``.{% endif %}
+            {% if model.model == "FLUX.1" %}You may also use ``run_usp.py``, which implements USP without modifying the default diffusers pipeline.{% endif %}
+
+   {% endfor %}
+   {% endfor %}
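+
+To inspect the recorded step runtimes programmatically, you can pretty-print
+the JSON reports that the Wan and FLUX benchmarks emit. This is a minimal
+sketch -- the exact keys in the file vary by model and release, so treat the
+output as free-form:
+
+.. code-block:: shell
+
+   # Pretty-print a FLUX timing report; adjust the path for Wan runs,
+   # for example results/outputs/rank0_*.json.
+   python3 -m json.tool results/timing.json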
+
+Previous versions
+=================
+
+See
+:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
+to find documentation for previous releases of xDiT diffusion inference
+performance testing.

diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
index 28609ae59..dd6158857 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
@@ -15,42 +15,26 @@ benchmarking, see the version-specific documentation.
      - Components
      - Resources

-   * - ``rocm/pytorch-xdit:v25.11`` (latest)
+   * - ``rocm/pytorch-xdit:v25.12`` (latest)
     -
       * `ROCm 7.10.0 preview `__
       * TheRock 3e3f834
-      * rccl d23d18f
-      * composable_kernel 2570462
-      * rocm-libraries 0588f07
-      * rocm-systems 473025a
-      * torch 73adac
-      * torchvision f5c6c2e
-      * triton 7416ffc
-      * accelerate 34c1779
-      * aiter de14bec
-      * diffusers 40528e9
-      * xfuser 83978b5
-      * yunchang 2c9b712
     -
       * :doc:`Documentation <../../xdit-diffusion-inference>`
-      * `Docker Hub `__
+      * `Docker Hub `__
+
+   * - ``rocm/pytorch-xdit:v25.11``
+     -
+       * `ROCm 7.10.0 preview `__
+       * TheRock 3e3f834
+     -
+       * :doc:`Documentation `
+       * `Docker Hub `__

    * - ``rocm/pytorch-xdit:v25.10``
      -
        * `ROCm 7.9.0 preview `__
        * TheRock 7afbe45
-       * rccl 9b04b2a
-       * composable_kernel b7a806f
-       * rocm-libraries f104555
-       * rocm-systems 25922d0
-       * torch 2.10.0a0+gite9c9017
-       * torchvision 0.22.0a0+966da7e
-       * triton 3.5.0+git52e49c12
-       * accelerate 1.11.0.dev0
-       * aiter 0.1.5.post4.dev20+ga25e55e79
-       * diffusers 0.36.0.dev0
-       * xfuser 0.4.4
-       * yunchang 0.6.3.post1
      -
        * :doc:`Documentation `
-       * `Docker Hub `__
+       * `Docker Hub `__

diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
index c205b2962..1903b39d4 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -322,8 +322,6 @@ benchmark results:
          sbatch -N {{ model.multinode_training_script }}

-   .. _maxtext-rocprofv3:
-
    .. rubric:: Profiling with rocprofv3

    If you need to collect a trace and the JAX profiler isn't working, use ``rocprofv3`` provided by the :doc:`ROCprofiler-SDK ` as a workaround. For example:
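
A representative ``rocprofv3`` invocation might look like the following. This
is a sketch rather than the exact command from the page above -- the traced
application and the tracing options depend on your workload:

.. code-block:: shell

   # Collect HIP API and kernel traces for a training script.
   rocprofv3 --hip-trace --kernel-trace -- python3 train.py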