From fe8dff691d48c7d40037aca79367726048e37e0e Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Wed, 11 Feb 2026 13:27:36 -0500 Subject: [PATCH] Update docs for xDiT diffusion inference 26.1 (#5955) * archive previous version * xDiT diffusion inference docker 26.1 --- .../xdit_25.13-inference-models.yaml | 105 ++++ .../inference/xdit-inference-models.yaml | 196 +++++++- .../previous-versions/xdit-25.12.rst | 6 +- .../previous-versions/xdit-25.13.rst | 474 ++++++++++++++++++ .../previous-versions/xdit-history.rst | 25 +- .../inference/xdit-diffusion-inference.rst | 178 +------ 6 files changed, 795 insertions(+), 189 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13.rst diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml new file mode 100644 index 000000000..67aadcd5a --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml @@ -0,0 +1,105 @@ +docker: + pull_tag: rocm/pytorch-xdit:v25.13 + docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef + ROCm: 7.11.0 + whats_new: + - "Flux.1 Kontext support" + - "Flux.2 Dev support" + - "Flux FP8 GEMM support" + - "Hybrid FP8 attention support for Wan models" + components: + TheRock: + version: 1728a81 + url: https://github.com/ROCm/TheRock + rccl: + version: d23d18f + url: https://github.com/ROCm/rccl + composable_kernel: + version: ab0101c + url: https://github.com/ROCm/composable_kernel + rocm-libraries: + version: a2f7c35 + url: https://github.com/ROCm/rocm-libraries + rocm-systems: + version: 659737c + url: https://github.com/ROCm/rocm-systems + torch: + version: 91be249 + url: https://github.com/ROCm/pytorch + torchvision: + version: b919bd0 + url: https://github.com/pytorch/vision + triton: + version: a272dfa + url: https://github.com/ROCm/triton + accelerate: + version: b521400f + url: https://github.com/huggingface/accelerate + aiter: + version: de14bec0 + url: https://github.com/ROCm/aiter + diffusers: + version: a1f36ee3e + url: https://github.com/huggingface/diffusers + xfuser: + version: adf2681 + url: https://github.com/xdit-project/xDiT + yunchang: + version: 2c9b712 + url: https://github.com/feifeibear/long-context-attention + supported_models: + - group: Hunyuan Video + js_tag: hunyuan + models: + - model: Hunyuan Video + model_repo: tencent/HunyuanVideo + revision: refs/pr/18 + url: https://huggingface.co/tencent/HunyuanVideo + github: https://github.com/Tencent-Hunyuan/HunyuanVideo + mad_tag: pyt_xdit_hunyuanvideo + js_tag: hunyuan_tag + - group: Wan-AI + js_tag: wan + models: + - model: Wan2.1 + model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers + url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers + github: https://github.com/Wan-Video/Wan2.1 + mad_tag: pyt_xdit_wan_2_1 + js_tag: wan_21_tag + - model: Wan2.2 + model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers + url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers + github: https://github.com/Wan-Video/Wan2.2 + mad_tag: pyt_xdit_wan_2_2 + js_tag: wan_22_tag + - group: FLUX + js_tag: flux + models: + - model: FLUX.1 + model_repo: black-forest-labs/FLUX.1-dev + url: https://huggingface.co/black-forest-labs/FLUX.1-dev 
+ github: https://github.com/black-forest-labs/flux + mad_tag: pyt_xdit_flux + js_tag: flux_1_tag + - model: FLUX.1 Kontext + model_repo: black-forest-labs/FLUX.1-Kontext-dev + url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev + github: https://github.com/black-forest-labs/flux + mad_tag: pyt_xdit_flux_kontext + js_tag: flux_1_kontext_tag + - model: FLUX.2 + model_repo: black-forest-labs/FLUX.2-dev + url: https://huggingface.co/black-forest-labs/FLUX.2-dev + github: https://github.com/black-forest-labs/flux2 + mad_tag: pyt_xdit_flux_2 + js_tag: flux_2_tag + - group: StableDiffusion + js_tag: stablediffusion + models: + - model: stable-diffusion-3.5-large + model_repo: stabilityai/stable-diffusion-3.5-large + url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large + github: https://github.com/Stability-AI/sd3.5 + mad_tag: pyt_xdit_sd_3_5 + js_tag: stable_diffusion_3_5_large_tag diff --git a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml index 67aadcd5a..8866b060d 100644 --- a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml @@ -1,14 +1,13 @@ docker: - pull_tag: rocm/pytorch-xdit:v25.13 - docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef + pull_tag: rocm/pytorch-xdit:v26.1 + docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit ROCm: 7.11.0 whats_new: - - "Flux.1 Kontext support" - - "Flux.2 Dev support" - - "Flux FP8 GEMM support" - - "Hybrid FP8 attention support for Wan models" + - "HunyuanVideo 1.5 support" + - "Z-Image Turbo support" + - "Wan model sharding" components: - TheRock: + TheRock: version: 1728a81 url: https://github.com/ROCm/TheRock rccl: @@ -39,10 +38,10 @@ docker: version: de14bec0 url: https://github.com/ROCm/aiter diffusers: - version: a1f36ee3e + version: 6708f5 url: https://github.com/huggingface/diffusers xfuser: - version: adf2681 + version: 0a3d7a url: https://github.com/xdit-project/xDiT yunchang: version: 2c9b712 @@ -58,6 +57,49 @@ docker: github: https://github.com/Tencent-Hunyuan/HunyuanVideo mad_tag: pyt_xdit_hunyuanvideo js_tag: hunyuan_tag + benchmark_command: + - cd /app/Hunyuanvideo + - mkdir results + - 'torchrun --nproc_per_node=8 run.py \' + - '--model {model_repo} \' + - '--prompt "In the large cage, two puppies were wagging their tails at each other." \' + - '--batch_size 1 \' + - '--height 720 --width 1280 \' + - '--seed 1168860793 \' + - '--num_frames 129 \' + - '--num_inference_steps 50 \' + - '--warmup_steps 1 \' + - '--n_repeats 1 \' + - '--sleep_dur 10 \' + - '--ulysses_degree 8 \' + - '--enable_tiling --enable_slicing \' + - '--guidance_scale 6.0 \' + - '--use_torch_compile \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' + - model: Hunyuan Video 1.5 + model_repo: hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v + url: https://huggingface.co/hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v + github: https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5 + mad_tag: pyt_xdit_hunyuanvideo_1_5 + js_tag: hunyuan_1_5_tag + benchmark_command: + - cd /app/Hunyuanvideo_1_5 + - mkdir results + - 'torchrun --nproc_per_node=8 /app/Hunyuanvideo_1_5/run.py \' + - '--model {model_repo} \' + - '--prompt "In the large cage, two puppies were wagging their tails at each other." 
\' + - '--task t2v \' + - '--height 720 --width 1280 \' + - '--seed 1168860793 \' + - '--num_frames 129 \' + - '--num_inference_steps 50 \' + - '--num_repetitions 1 \' + - '--ulysses_degree 8 \' + - '--enable_tiling --enable_slicing \' + - '--use_torch_compile \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' - group: Wan-AI js_tag: wan models: @@ -67,12 +109,48 @@ docker: github: https://github.com/Wan-Video/Wan2.1 mad_tag: pyt_xdit_wan_2_1 js_tag: wan_21_tag + benchmark_command: + - cd /app/Wan + - mkdir results + - 'torchrun --nproc_per_node=8 /app/Wan/run.py \' + - '--model {model_repo} \' + - '--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline''s intricate details and the refreshing atmosphere of the seaside." \' + - '--task i2v \' + - '--height 720 \' + - '--width 1280 \' + - '--img_file_path /app/Wan/i2v_input.JPG \' + - '--num_frames 81 \' + - '--ulysses_degree 8 \' + - '--seed 42 \' + - '--num_repetitions 1 \' + - '--num_inference_steps 40 \' + - '--use_torch_compile \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' - model: Wan2.2 model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers github: https://github.com/Wan-Video/Wan2.2 mad_tag: pyt_xdit_wan_2_2 js_tag: wan_22_tag + benchmark_command: + - cd /app/Wan + - mkdir results + - 'torchrun --nproc_per_node=8 /app/Wan/run.py \' + - '--model {model_repo} \' + - '--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline''s intricate details and the refreshing atmosphere of the seaside." 
\' + - '--task i2v \' + - '--height 720 \' + - '--width 1280 \' + - '--img_file_path /app/Wan/i2v_input.JPG \' + - '--num_frames 81 \' + - '--ulysses_degree 8 \' + - '--seed 42 \' + - '--num_repetitions 1 \' + - '--num_inference_steps 40 \' + - '--use_torch_compile \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' - group: FLUX js_tag: flux models: @@ -82,18 +160,79 @@ docker: github: https://github.com/black-forest-labs/flux mad_tag: pyt_xdit_flux js_tag: flux_1_tag + benchmark_command: + - cd /app/Flux + - mkdir results + - 'torchrun --nproc_per_node=8 /app/Flux/run.py \' + - '--model {model_repo} \' + - '--seed 42 \' + - '--prompt "A small cat" \' + - '--height 1024 \' + - '--width 1024 \' + - '--num_inference_steps 25 \' + - '--max_sequence_length 256 \' + - '--warmup_steps 5 \' + - '--no_use_resolution_binning \' + - '--ulysses_degree 8 \' + - '--use_torch_compile \' + - '--guidance_scale 0.0 \' + - '--num_repetitions 50 \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' - model: FLUX.1 Kontext model_repo: black-forest-labs/FLUX.1-Kontext-dev url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev github: https://github.com/black-forest-labs/flux mad_tag: pyt_xdit_flux_kontext js_tag: flux_1_kontext_tag + benchmark_command: + - cd /app/Flux + - mkdir results + - 'torchrun --nproc_per_node=8 /app/Flux/run_usp.py \' + - '--model {model_repo} \' + - '--seed 42 \' + - '--prompt "Add a cool hat to the cat" \' + - '--height 1024 \' + - '--width 1024 \' + - '--num_inference_steps 30 \' + - '--max_sequence_length 512 \' + - '--warmup_steps 5 \' + - '--no_use_resolution_binning \' + - '--ulysses_degree 8 \' + - '--use_torch_compile \' + - '--img_file_path /app/Flux/cat.png \' + - '--model_type flux_kontext \' + - '--guidance_scale 2.5 \' + - '--num_repetitions 25 \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' - model: FLUX.2 model_repo: black-forest-labs/FLUX.2-dev url: https://huggingface.co/black-forest-labs/FLUX.2-dev github: https://github.com/black-forest-labs/flux2 mad_tag: pyt_xdit_flux_2 js_tag: flux_2_tag + benchmark_command: + - cd /app/Flux + - mkdir results + - 'torchrun --nproc_per_node=8 /app/Flux/run_usp.py \' + - '--model {model_repo} \' + - '--seed 42 \' + - '--prompt "Add a cool hat to the cat" \' + - '--height 1024 \' + - '--width 1024 \' + - '--num_inference_steps 50 \' + - '--max_sequence_length 512 \' + - '--warmup_steps 5 \' + - '--no_use_resolution_binning \' + - '--ulysses_degree 8 \' + - '--use_torch_compile \' + - '--img_file_paths /app/Flux/cat.png \' + - '--model_type flux2 \' + - '--guidance_scale 4.0 \' + - '--num_repetitions 25 \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' - group: StableDiffusion js_tag: stablediffusion models: @@ -103,3 +242,42 @@ docker: github: https://github.com/Stability-AI/sd3.5 mad_tag: pyt_xdit_sd_3_5 js_tag: stable_diffusion_3_5_large_tag + benchmark_command: + - cd /app/StableDiffusion3.5 + - mkdir results + - 'torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \' + - '--model {model_repo} \' + - '--prompt "A capybara holding a sign that reads Hello World" \' + - '--num_repetitions 50 \' + - '--num_inference_steps 28 \' + - '--pipefusion_parallel_degree 4 \' + - '--use_cfg_parallel \' + - '--use_torch_compile \' + - '--dtype torch.float16 \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' + - group: Z-Image + js_tag: z_image + models: + - model: Z-Image Turbo + 
model_repo: Tongyi-MAI/Z-Image-Turbo + url: https://huggingface.co/Tongyi-MAI/Z-Image-Turbo + github: https://github.com/Tongyi-MAI/Z-Image + mad_tag: pyt_xdit_z_image_turbo + js_tag: z_image_turbo_tag + benchmark_command: + - cd /app/Z-Image + - mkdir results + - 'torchrun --nproc_per_node=2 /app/Z-Image/run.py \' + - '--model {model_repo} \' + - '--seed 42 \' + - '--prompt "A crowded beach" \' + - '--height 1088 \' + - '--width 1920 \' + - '--num_inference_steps 9 \' + - '--ulysses_degree 2 \' + - '--use_torch_compile \' + - '--guidance_scale 0.0 \' + - '--num_repetitions 50 \' + - '--attention_backend aiter \' + - '--benchmark_output_directory results' diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst index 66d279e4e..62b0be1c5 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst @@ -11,7 +11,7 @@ xDiT diffusion inference .. caution:: - This documentation does not reflect the latest version of ROCm vLLM + This documentation does not reflect the latest version of xDiT diffusion inference performance documentation. See :doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest version. @@ -293,7 +293,7 @@ Run inference --tags {{model.mad_tag}} \ --keep-model-dir \ --live-output - + MAD launches a Docker container with the name ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` @@ -379,7 +379,7 @@ Run inference {% endif %} {% if model.model == "stable-diffusion-3.5-large" %} - cd StableDiffusion3.5 + cd StableDiffusion3.5 mkdir results torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \ diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13.rst new file mode 100644 index 000000000..22a514f78 --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13.rst @@ -0,0 +1,474 @@ +:orphan: + +.. meta:: + :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using + prebuilt and optimized docker images. + :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark + +************************ +xDiT diffusion inference +************************ + +.. caution:: + + This documentation does not reflect the latest version of the xDiT diffusion + inference performance documentation. See + :doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest + version. + +.. _xdit-video-diffusion-2513: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml + + {% set docker = data.docker %} + + The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers + a prebuilt, optimized environment based on `xDiT + `_ for benchmarking diffusion model + video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X, + and MI300X (gfx942) GPUs. + + The image runs a preview version of ROCm using the new `TheRock + `__ build system and includes the following + components: + + .. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }} + + .. 
list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_data in docker.components.items() %} + * - `{{ component_name }} <{{ component_data.url }}>`_ + - {{ component_data.version }} + {% endfor %} + +Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark. +For preview and development releases, see `amdsiloai/pytorch-xdit `_. + +What's new +========== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml + + {% set docker = data.docker %} + + {% for item in docker.whats_new %} + * {{ item }} + {% endfor %} + +.. _xdit-video-diffusion-supported-models-2513: + +Supported models +================ + +The following models are supported for inference performance benchmarking. +Some instructions, commands, and recommendations in this documentation might +vary by model -- select one to get started. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml + + {% set docker = data.docker %} + + .. raw:: html + +
+      <!-- Model selector (markup omitted): a "Model" dropdown listing each
+           model group in docker.supported_models and a "Variant" dropdown
+           listing each group's models, keyed by js_tag to toggle the
+           matching model-doc containers below. -->
+ + {% for model_group in docker.supported_models %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.js_tag }} + + .. note:: + + To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ + or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an + external license agreement through a third party. + + {% endfor %} + {% endfor %} + +Performance measurements +======================== + +To evaluate performance, the `Performance results with AMD ROCm software +`__ +page provides reference throughput and serving measurements for inferencing popular AI models. + +.. important:: + + The performance data presented in `Performance results with AMD ROCm + software + `__ + only reflects the latest version of this inference benchmarking environment. + The listed measurements should not be interpreted as the peak performance + achievable by AMD Instinct GPUs or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml + + {% set docker = data.docker %} + + For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image. + Pull the image using the following command: + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + +Validate and benchmark +====================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml + + {% set docker = data.docker %} + + Once the image has been downloaded you can follow these steps to + run benchmarks and generate outputs. + + {% for model_group in docker.supported_models %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.js_tag}} + + The following commands are written for {{ model.model }}. + See :ref:`xdit-video-diffusion-supported-models-2513` to switch to another available model. + + {% endfor %} + {% endfor %} + +Choose your setup method +------------------------ + +You can either use an existing Hugging Face cache or download the model fresh inside the container. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml + + {% set docker = data.docker %} + + {% for model_group in docker.supported_models %} + {% for model in model_group.models %} + .. container:: model-doc {{model.js_tag}} + + .. tab-set:: + + .. tab-item:: Option 1: Use existing Hugging Face cache + + If you already have models downloaded on your host system, you can mount your existing cache. + + 1. Set your Hugging Face cache location. + + .. code-block:: shell + + export HF_HOME=/your/hf_cache/location + + 2. Download the model (if not already cached). + + .. 
code-block:: shell + + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + 3. Launch the container with mounted cache. + + .. code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e HSA_NO_SCRATCH_RECLAIM=1 \ + -e OMP_NUM_THREADS=16 \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + -e HF_HOME=/app/huggingface_models \ + -v $HF_HOME:/app/huggingface_models \ + {{ docker.pull_tag }} + + .. tab-item:: Option 2: Download inside container + + If you prefer to keep the container self-contained or don't have an existing cache. + + 1. Launch the container + + .. code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e HSA_NO_SCRATCH_RECLAIM=1 \ + -e OMP_NUM_THREADS=16 \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + {{ docker.pull_tag }} + + 2. Inside the container, set the Hugging Face cache location and download the model. + + .. code-block:: shell + + export HF_HOME=/app/huggingface_models + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + .. warning:: + + Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume. + {% endfor %} + {% endfor %} + +Run inference +============= + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.13-inference-models.yaml + + {% set docker = data.docker %} + + {% for model_group in docker.supported_models %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.js_tag }} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. On the host machine, use this command to run the performance benchmark test on + the `{{model.model}} <{{ model.url }}>`_ model using one node. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{model.mad_tag}} \ + --keep-model-dir \ + --live-output + + MAD launches a Docker container with the name + ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the + model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` + and ``{{ model.mad_tag }}_serving.csv``. + + .. tab-item:: Standalone benchmarking + + To run the benchmarks for {{ model.model }}, use the following command: + + .. code-block:: shell + {% if model.model == "Hunyuan Video" %} + cd /app/Hunyuanvideo + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --model {{ model.model_repo }} \ + --prompt "In the large cage, two puppies were wagging their tails at each other." 
\ + --height 720 --width 1280 --num_frames 129 \ + --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ + --ulysses_degree 8 \ + --enable_tiling --enable_slicing \ + --use_torch_compile \ + --bench_output results + + {% endif %} + {% if model.model == "Wan2.1" %} + cd /app/Wan + mkdir results + + torchrun --nproc_per_node=8 /app/Wan/run.py \ + --task i2v \ + --height 720 \ + --width 1280 \ + --model {{ model.model_repo }} \ + --img_file_path /app/Wan/i2v_input.JPG \ + --ulysses_degree 8 \ + --seed 42 \ + --num_frames 81 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --num_repetitions 1 \ + --num_inference_steps 40 \ + --use_torch_compile + + {% endif %} + {% if model.model == "Wan2.2" %} + cd /app/Wan + mkdir results + + torchrun --nproc_per_node=8 /app/Wan/run.py \ + --task i2v \ + --height 720 \ + --width 1280 \ + --model {{ model.model_repo }} \ + --img_file_path /app/Wan/i2v_input.JPG \ + --ulysses_degree 8 \ + --seed 42 \ + --num_frames 81 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." 
\ + --num_repetitions 1 \ + --num_inference_steps 40 \ + --use_torch_compile + + {% endif %} + + {% if model.model == "FLUX.1" %} + cd /app/Flux + mkdir results + + torchrun --nproc_per_node=8 /app/Flux/run.py \ + --model {{ model.model_repo }} \ + --seed 42 \ + --prompt "A small cat" \ + --height 1024 \ + --width 1024 \ + --num_inference_steps 25 \ + --max_sequence_length 256 \ + --warmup_steps 5 \ + --no_use_resolution_binning \ + --ulysses_degree 8 \ + --use_torch_compile \ + --num_repetitions 50 + + {% endif %} + + {% if model.model == "FLUX.1 Kontext" %} + cd /app/Flux + mkdir results + + torchrun --nproc_per_node=8 /app/Flux/run_usp.py \ + --model {{ model.model_repo }} \ + --seed 42 \ + --prompt "Add a cool hat to the cat" \ + --height 1024 \ + --width 1024 \ + --num_inference_steps 30 \ + --max_sequence_length 512 \ + --warmup_steps 5 \ + --no_use_resolution_binning \ + --ulysses_degree 8 \ + --use_torch_compile \ + --img_file_path /app/Flux/cat.png \ + --model_type flux_kontext \ + --guidance_scale 2.5 \ + --num_repetitions 25 + + {% endif %} + + {% if model.model == "FLUX.2" %} + cd /app/Flux + mkdir results + + torchrun --nproc_per_node=8 /app/Flux/run_usp.py \ + --model {{ model.model_repo }} \ + --seed 42 \ + --prompt "Add a cool hat to the cat" \ + --height 1024 \ + --width 1024 \ + --num_inference_steps 50 \ + --max_sequence_length 512 \ + --warmup_steps 5 \ + --no_use_resolution_binning \ + --ulysses_degree 8 \ + --use_torch_compile \ + --img_file_paths /app/Flux/cat.png \ + --model_type flux2 \ + --guidance_scale 4.0 \ + --num_repetitions 25 + + {% endif %} + + {% if model.model == "stable-diffusion-3.5-large" %} + cd /app/StableDiffusion3.5 + mkdir results + + torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \ + --model {{ model.model_repo }} \ + --num_inference_steps 28 \ + --prompt "A capybara holding a sign that reads Hello World" \ + --use_torch_compile \ + --pipefusion_parallel_degree 4 \ + --use_cfg_parallel \ + --num_repetitions 50 \ + --dtype torch.float16 \ + --output_path results + + {% endif %} + + The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %} + + {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %} + + {% endfor %} + {% endfor %} + +Previous versions +================= + +See +:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history` +to find documentation for previous releases of xDiT diffusion inference +performance testing. diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst index efc54fed5..81c032112 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst @@ -15,33 +15,40 @@ benchmarking, see the version-specific documentation. 
- Components - Resources - * - ``rocm/pytorch-xdit:v25.13`` (latest) - - + * - ``rocm/pytorch-xdit:v26.1`` + - * TheRock 1728a81 - - + - * :doc:`Documentation <../../xdit-diffusion-inference>` + * `Docker Hub `__ + + * - ``rocm/pytorch-xdit:v25.13`` + - + * TheRock 1728a81 + - + * :doc:`Documentation ` * `Docker Hub `__ * - ``rocm/pytorch-xdit:v25.12`` - - + - * `ROCm 7.10.0 preview `__ * TheRock 3e3f834 - - + - * :doc:`Documentation ` * `Docker Hub `__ * - ``rocm/pytorch-xdit:v25.11`` - - + - * `ROCm 7.10.0 preview `__ * TheRock 3e3f834 - - + - * :doc:`Documentation ` * `Docker Hub `__ * - ``rocm/pytorch-xdit:v25.10`` - - + - * `ROCm 7.9.0 preview `__ * TheRock 7afbe45 - - + - * :doc:`Documentation ` * `Docker Hub `__ diff --git a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst index c22c7df46..ac7dfc67d 100644 --- a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst +++ b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst @@ -13,15 +13,10 @@ xDiT diffusion inference {% set docker = data.docker %} - The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers - a prebuilt, optimized environment based on `xDiT - `_ for benchmarking diffusion model - video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X, - and MI300X (gfx942) GPUs. - - The image runs a preview version of ROCm using the new `TheRock - `__ build system and includes the following - components: + The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT `_ for + benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinctâ„¢ MI300X, MI325X, MI350X, and MI355X) GPUs. + The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock `_ + and includes the following components: .. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }} @@ -105,22 +100,6 @@ vary by model -- select one to get started. {% endfor %} {% endfor %} -Performance measurements -======================== - -To evaluate performance, the `Performance results with AMD ROCm software -`__ -page provides reference throughput and serving measurements for inferencing popular AI models. - -.. important:: - - The performance data presented in `Performance results with AMD ROCm - software - `__ - only reflects the latest version of this inference benchmarking environment. - The listed measurements should not be interpreted as the peak performance - achievable by AMD Instinct GPUs or ROCm software. - System validation ================= @@ -300,7 +279,7 @@ Run inference --tags {{model.mad_tag}} \ --keep-model-dir \ --live-output - + MAD launches a Docker container with the name ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` @@ -311,152 +290,15 @@ Run inference To run the benchmarks for {{ model.model }}, use the following command: .. code-block:: shell - {% if model.model == "Hunyuan Video" %} - cd /app/Hunyuanvideo - mkdir results - torchrun --nproc_per_node=8 run.py \ - --model {{ model.model_repo }} \ - --prompt "In the large cage, two puppies were wagging their tails at each other." 
\ - --height 720 --width 1280 --num_frames 129 \ - --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ - --ulysses_degree 8 \ - --enable_tiling --enable_slicing \ - --use_torch_compile \ - --bench_output results + {{ model.benchmark_command + | map('replace', '{model_repo}', model.model_repo) + | map('trim') + | join('\n ') }} - {% endif %} - {% if model.model == "Wan2.1" %} - cd /app/Wan - mkdir results - - torchrun --nproc_per_node=8 /app/Wan/run.py \ - --task i2v \ - --height 720 \ - --width 1280 \ - --model {{ model.model_repo }} \ - --img_file_path /app/Wan/i2v_input.JPG \ - --ulysses_degree 8 \ - --seed 42 \ - --num_frames 81 \ - --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ - --num_repetitions 1 \ - --num_inference_steps 40 \ - --use_torch_compile - - {% endif %} - {% if model.model == "Wan2.2" %} - cd /app/Wan - mkdir results - - torchrun --nproc_per_node=8 /app/Wan/run.py \ - --task i2v \ - --height 720 \ - --width 1280 \ - --model {{ model.model_repo }} \ - --img_file_path /app/Wan/i2v_input.JPG \ - --ulysses_degree 8 \ - --seed 42 \ - --num_frames 81 \ - --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." 
\ - --num_repetitions 1 \ - --num_inference_steps 40 \ - --use_torch_compile - - {% endif %} - - {% if model.model == "FLUX.1" %} - cd /app/Flux - mkdir results - - torchrun --nproc_per_node=8 /app/Flux/run.py \ - --model {{ model.model_repo }} \ - --seed 42 \ - --prompt "A small cat" \ - --height 1024 \ - --width 1024 \ - --num_inference_steps 25 \ - --max_sequence_length 256 \ - --warmup_steps 5 \ - --no_use_resolution_binning \ - --ulysses_degree 8 \ - --use_torch_compile \ - --num_repetitions 50 - - {% endif %} - - {% if model.model == "FLUX.1 Kontext" %} - cd /app/Flux - mkdir results - - torchrun --nproc_per_node=8 /app/Flux/run_usp.py \ - --model {{ model.model_repo }} \ - --seed 42 \ - --prompt "Add a cool hat to the cat" \ - --height 1024 \ - --width 1024 \ - --num_inference_steps 30 \ - --max_sequence_length 512 \ - --warmup_steps 5 \ - --no_use_resolution_binning \ - --ulysses_degree 8 \ - --use_torch_compile \ - --img_file_path /app/Flux/cat.png \ - --model_type flux_kontext \ - --guidance_scale 2.5 \ - --num_repetitions 25 - - {% endif %} - - {% if model.model == "FLUX.2" %} - cd /app/Flux - mkdir results - - torchrun --nproc_per_node=8 /app/Flux/run_usp.py \ - --model {{ model.model_repo }} \ - --seed 42 \ - --prompt "Add a cool hat to the cat" \ - --height 1024 \ - --width 1024 \ - --num_inference_steps 50 \ - --max_sequence_length 512 \ - --warmup_steps 5 \ - --no_use_resolution_binning \ - --ulysses_degree 8 \ - --use_torch_compile \ - --img_file_paths /app/Flux/cat.png \ - --model_type flux2 \ - --guidance_scale 4.0 \ - --num_repetitions 25 - - {% endif %} - - {% if model.model == "stable-diffusion-3.5-large" %} - cd /app/StableDiffusion3.5 - mkdir results - - torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \ - --model {{ model.model_repo }} \ - --num_inference_steps 28 \ - --prompt "A capybara holding a sign that reads Hello World" \ - --use_torch_compile \ - --pipefusion_parallel_degree 4 \ - --use_cfg_parallel \ - --num_repetitions 50 \ - --dtype torch.float16 \ - --output_path results - - {% endif %} - - The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %} + The generated video will be stored under the results directory. {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %} {% endfor %} {% endfor %} - -Previous versions -================= - -See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases -of xDiT diffusion inference performance testing.
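
The rewritten "Standalone benchmarking" tab above no longer hard-codes one
shell block per model: each model's ``benchmark_command`` now lives in
``xdit-inference-models.yaml`` and is rendered through the Jinja filter chain
``map('replace', '{model_repo}', model.model_repo) | map('trim') | join(...)``.
The following is a minimal sketch of what that chain does, using plain Jinja2
and a hypothetical, trimmed-down stand-in for one model entry rather than the
docs' actual ``datatemplate`` plumbing::

   from jinja2 import Environment

   # Hypothetical stand-in for one model entry in xdit-inference-models.yaml.
   model = {
       "model_repo": "black-forest-labs/FLUX.1-dev",
       "benchmark_command": [
           "cd /app/Flux",
           "mkdir results",
           "torchrun --nproc_per_node=8 /app/Flux/run.py \\",
           "--model {model_repo} \\",
           "--num_repetitions 50",
       ],
   }

   template = Environment().from_string(
       "{{ model.benchmark_command"
       " | map('replace', '{model_repo}', model.model_repo)"  # substitute repo id
       " | map('trim')"                                       # strip stray whitespace
       " | join('\\n   ') }}"                                 # one command line per item
   )

   print(template.render(model=model))
   # cd /app/Flux
   #    mkdir results
   #    torchrun --nproc_per_node=8 /app/Flux/run.py \
   #    --model black-forest-labs/FLUX.1-dev \
   #    --num_repetitions 50

Because ``map('trim')`` strips each item before joining, the YAML entries can
carry incidental trailing whitespace while the rendered shell block stays
clean, and the ``\`` continuations survive so the command can be pasted as-is.
This keeps each model's command in one place: the archived 25.13 page retains
its hard-coded per-model blocks, while the live page renders straight from data.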