diff --git a/.wordlist.txt b/.wordlist.txt index 4e7bf9634..a29867301 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -138,6 +138,7 @@ ESXi EP EoS etcd +equalto fas FBGEMM FiLM @@ -226,6 +227,8 @@ href Hyperparameters HybridEngine Huggingface +Hunyuan +HunyuanVideo IB ICD ICT @@ -541,6 +544,7 @@ UAC UC UCC UCX +ud UE UIF UMC @@ -852,6 +856,7 @@ pallas parallelization parallelizing param +params parameterization passthrough pe @@ -898,6 +903,7 @@ querySelectorAll queueing qwen radeon +rc rccl rdc rdma @@ -959,6 +965,7 @@ scalability scalable scipy seealso +selectattr selectedTag sendmsg seqs @@ -1062,6 +1069,8 @@ writebacks wrreq wzo xargs +xdit +xDiT xGMI xPacked xz diff --git a/docs/conf.py b/docs/conf.py index 497148972..6ce2cfcd6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -145,6 +145,7 @@ article_pages = [ {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml new file mode 100644 index 000000000..4aee5d0d5 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml @@ -0,0 +1,55 @@ +xdit_diffusion_inference: + docker: + pull_tag: rocm/pytorch-xdit:v25.10 + docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit + ROCm: 7.9.0 + components: + TheRock: 7afbe45 + rccl: 9b04b2a + composable_kernel: b7a806f + rocm-libraries: f104555 + rocm-systems: 25922d0 + torch: 2.10.0a0+gite9c9017 + torchvision: 0.22.0a0+966da7e + triton: 3.5.0+git52e49c12 + accelerate: 1.11.0.dev0 + aiter: 0.1.5.post4.dev20+ga25e55e79 + diffusers: 0.36.0.dev0 + xfuser: 0.4.4 + yunchang: 0.6.3.post1 + + model_groups: + - group: Hunyuan Video + tag: hunyuan + models: + - model: Hunyuan Video + model_name: hunyuanvideo + model_repo: tencent/HunyuanVideo + revision: refs/pr/18 + url: https://huggingface.co/tencent/HunyuanVideo + github: https://github.com/Tencent-Hunyuan/HunyuanVideo + mad_tag: pyt_xdit_hunyuanvideo + - group: Wan-AI + tag: wan + models: + - model: Wan2.1 + model_name: wan2_1-i2v-14b-720p + model_repo: Wan-AI/Wan2.1-I2V-14B-720P + url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P + github: https://github.com/Wan-Video/Wan2.1 + mad_tag: pyt_xdit_wan_2_1 + - model: Wan2.2 + model_name: wan2_2-i2v-a14b + model_repo: Wan-AI/Wan2.2-I2V-A14B + url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B + github: https://github.com/Wan-Video/Wan2.2 + mad_tag: pyt_xdit_wan_2_2 + - group: FLUX + tag: flux + models: + - model: FLUX.1 + model_name: FLUX.1-dev + model_repo: black-forest-labs/FLUX.1-dev + url: https://huggingface.co/black-forest-labs/FLUX.1-dev + github: https://github.com/black-forest-labs/flux + mad_tag: pyt_xdit_flux diff --git 
a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml new file mode 100644 index 000000000..e88b4ef0b --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml @@ -0,0 +1,109 @@ +xdit_diffusion_inference: + docker: + - version: v25-11 + pull_tag: rocm/pytorch-xdit:v25.11 + docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit + ROCm: 7.10.0 + supported_models: + - group: Hunyuan Video + models: + - Hunyuan Video + - group: Wan-AI + models: + - Wan2.1 + - Wan2.2 + - group: FLUX + models: + - FLUX.1 + whats_new: + - "Minor bug fixes and clarifications to READMEs." + - "Bumps TheRock, AITER, Diffusers, xDiT versions." + - "Changes Aiter rounding mode for faster gfx942 FWD Attention." + components: + TheRock: 3e3f834 + rccl: d23d18f + composable_kernel: 2570462 + rocm-libraries: 0588f07 + rocm-systems: 473025a + torch: 73adac + torchvision: f5c6c2e + triton: 7416ffc + accelerate: 34c1779 + aiter: de14bec + diffusers: 40528e9 + xfuser: 83978b5 + yunchang: 2c9b712 + + - version: v25-10 + pull_tag: rocm/pytorch-xdit:v25.10 + docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit + ROCm: 7.9.0 + supported_models: + - group: Hunyuan Video + models: + - Hunyuan Video + - group: Wan-AI + models: + - Wan2.1 + - Wan2.2 + - group: FLUX + models: + - FLUX.1 + whats_new: + - "First official xDiT Docker Release for Diffusion Inference." + - "Supports gfx942 and gfx950 series (AMD Instinctâ„¢ MI300X, MI325X, MI350X, and MI355X)." + - "Support Wan 2.1, Wan 2.2, HunyuanVideo and Flux workloads." + components: + TheRock: 7afbe45 + rccl: 9b04b2a + composable_kernel: b7a806f + rocm-libraries: f104555 + rocm-systems: 25922d0 + torch: 2.10.0a0+gite9c9017 + torchvision: 0.22.0a0+966da7e + triton: 3.5.0+git52e49c12 + accelerate: 1.11.0.dev0 + aiter: 0.1.5.post4.dev20+ga25e55e79 + diffusers: 0.36.0.dev0 + xfuser: 0.4.4 + yunchang: 0.6.3.post1 + + model_groups: + - group: Hunyuan Video + tag: hunyuan + models: + - model: Hunyuan Video + page_tag: hunyuan_tag + model_name: hunyuanvideo + model_repo: tencent/HunyuanVideo + revision: refs/pr/18 + url: https://huggingface.co/tencent/HunyuanVideo + github: https://github.com/Tencent-Hunyuan/HunyuanVideo + mad_tag: pyt_xdit_hunyuanvideo + - group: Wan-AI + tag: wan + models: + - model: Wan2.1 + page_tag: wan_21_tag + model_name: wan2_1-i2v-14b-720p + model_repo: Wan-AI/Wan2.1-I2V-14B-720P + url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P + github: https://github.com/Wan-Video/Wan2.1 + mad_tag: pyt_xdit_wan_2_1 + - model: Wan2.2 + page_tag: wan_22_tag + model_name: wan2_2-i2v-a14b + model_repo: Wan-AI/Wan2.2-I2V-A14B + url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B + github: https://github.com/Wan-Video/Wan2.2 + mad_tag: pyt_xdit_wan_2_2 + - group: FLUX + tag: flux + models: + - model: FLUX.1 + page_tag: flux_1_tag + model_name: FLUX.1-dev + model_repo: black-forest-labs/FLUX.1-dev + url: https://huggingface.co/black-forest-labs/FLUX.1-dev + github: https://github.com/black-forest-labs/flux + mad_tag: pyt_xdit_flux diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst new file mode 100644 index 000000000..aa38b65fa --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst @@ -0,0 +1,396 @@ +.. 
meta::
+   :description: Learn to validate diffusion model video generation on MI300X, MI325X, MI350X, and MI355X
+                 accelerators using prebuilt and optimized Docker images.
+   :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
+
+************************
+xDiT diffusion inference
+************************
+
+.. _xdit-video-diffusion:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
+   a prebuilt, optimized inference environment based on `xDiT
+   <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion-based
+   video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
+   and MI300X (gfx942) GPUs.
+   This image is based on the ROCm {{docker.ROCm}} preview release via `TheRock <https://github.com/ROCm/TheRock>`_
+   and includes the following software components:
+
+   .. tab-set::
+
+      .. tab-item:: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit `_.
+
+What's new
+==========
+
+- Initial ROCm-enabled xDiT Docker release for diffusion inference.
+- Supported architectures: gfx942 and gfx950 (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X).
+- Supported workloads: Wan 2.1, Wan 2.2, HunyuanVideo, and Flux models.
+
+.. _xdit-video-diffusion-supported-models:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   .. raw:: html
+
+      <!-- Model and variant selection dropdowns; element IDs, classes, and data attributes are placeholders. -->
+      <div class="model-select">
+        <label for="model-group-select">Model</label>
+        <select id="model-group-select">
+          {% for model_group in model_groups %}
+          <option value="{{ model_group.tag }}">{{ model_group.group }}</option>
+          {% endfor %}
+        </select>
+      </div>
+      <div class="model-select">
+        <label for="model-variant-select">Variant</label>
+        <select id="model-variant-select">
+          {% for model_group in model_groups %}
+          {% set models = model_group.models %}
+          {% for model in models %}
+          {% if models|length == 1 %}
+          <option value="{{ model.mad_tag }}" data-group="{{ model_group.tag }}">{{ model.model }}</option>
+          {% else %}
+          <option value="{{ model.mad_tag }}" data-group="{{ model_group.tag }}">{{ model.model }}</option>
+          {% endif %}
+          {% endfor %}
+          {% endfor %}
+        </select>
+      </div>
+ + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + .. note:: + + To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ + or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an + external license agreement through a third party. + + {% endfor %} + {% endfor %} + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA +auto-balancing, you can skip this step. Otherwise, complete the procedures in +the `System validation and optimization +`__ +guide to properly configure your system settings before starting. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml + + {% set docker = data.xdit_diffusion_inference.docker %} + + For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image. + Pull the image using the following command: + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + +Validate and benchmark +====================== + +Once the image has been downloaded you can follow these steps to +run benchmarks and generate outputs. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml + + {% set model_groups = data.xdit_diffusion_inference.model_groups %} + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.mad_tag}} + + The following commands are written for {{ model.model }}. + See :ref:`xdit-video-diffusion-supported-models` to switch to another available model. + + {% endfor %} + {% endfor %} + +.. _xdit-video-diffusion-setup: + +Prepare the model +----------------- + +.. note:: + + If you're using ROCm MAD to :ref:`run your model + `, you can skip this section. MAD will handle + starting the container and downloading required models inside the container. + +You can either use an existing Hugging Face cache or download the model fresh inside the container. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml + + {% set docker = data.xdit_diffusion_inference.docker %} + {% set model_groups = data.xdit_diffusion_inference.model_groups%} + + {% for model_group in model_groups %} + {% for model in model_group.models %} + .. container:: model-doc {{model.mad_tag}} + + .. tab-set:: + + .. tab-item:: Option 1: Use existing Hugging Face cache + + If you already have models downloaded on your host system, you can mount your existing cache. + + 1. Set your Hugging Face cache location. + + .. code-block:: shell + + export HF_HOME=/your/hf_cache/location + + 2. Download the model (if not already cached). + + .. code-block:: shell + + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + 3. Launch the container with mounted cache. + + .. 
code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e HSA_NO_SCRATCH_RECLAIM=1 \ + -e OMP_NUM_THREADS=16 \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + -e HF_HOME=/app/huggingface_models \ + -v $HF_HOME:/app/huggingface_models \ + {{ docker.pull_tag }} + + .. tab-item:: Option 2: Download inside container + + If you prefer to keep the container self-contained or don't have an existing cache. + + 1. Launch the container + + .. code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e HSA_NO_SCRATCH_RECLAIM=1 \ + -e OMP_NUM_THREADS=16 \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + {{ docker.pull_tag }} + + 2. Inside the container, set the Hugging Face cache location and download the model. + + .. code-block:: shell + + export HF_HOME=/app/huggingface_models + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + .. warning:: + + Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume. + {% endfor %} + {% endfor %} + +.. _xdit-video-diffusion-run: + +Run inference +============= + +You can benchmark models through `MAD `__-integrated automation or standalone +torchrun commands. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml + + {% set model_groups = data.xdit_diffusion_inference.model_groups%} + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. On the host machine, use this command to run the performance benchmark test on + the `{{model.model}} <{{ model.url }}>`_ model using one node. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{model.mad_tag}} \ + --keep-model-dir \ + --live-output + + MAD launches a Docker container with the name + ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the + model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` + and ``{{ model.mad_tag }}_serving.csv``. + + .. tab-item:: Standalone benchmarking + + To run the benchmarks for {{ model.model }}, use the following command: + + .. code-block:: shell + {% if model.model == "Hunyuan Video" %} + cd /app/Hunyuanvideo + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --model tencent/HunyuanVideo \ + --prompt "In the large cage, two puppies were wagging their tails at each other." 
\ + --height 720 --width 1280 --num_frames 129 \ + --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ + --ulysses_degree 8 \ + --enable_tiling --enable_slicing \ + --use_torch_compile \ + --bench_output results + {% endif %} + {% if model.model == "Wan2.1" %} + cd Wan2.1 + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --task i2v-14B \ + --size 720*1280 --frame_num 81 \ + --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \ + --image "/app/Wan2.1/examples/i2v_input.JPG" \ + --ulysses_size 8 --ring_size 1 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ + --offload_model 0 \ + --vae_dtype bfloat16 \ + --allow_tf32 \ + --compile + {% endif %} + {% if model.model == "Wan2.2" %} + cd Wan2.2 + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --task i2v-A14B \ + --size 720*1280 --frame_num 81 \ + --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \ + --image "/app/Wan2.2/examples/i2v_input.JPG" \ + --ulysses_size 8 --ring_size 1 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ + --offload_model 0 \ + --vae_dtype bfloat16 \ + --allow_tf32 \ + --compile + {% endif %} + + {% if model.model == "FLUX.1" %} + cd Flux + mkdir results + + torchrun --nproc_per_node=8 /app/Flux/run.py \ + --model black-forest-labs/FLUX.1-dev \ + --seed 42 \ + --prompt "A small cat" \ + --height 1024 \ + --width 1024 \ + --num_inference_steps 25 \ + --max_sequence_length 256 \ + --warmup_steps 5 \ + --no_use_resolution_binning \ + --ulysses_degree 8 \ + --use_torch_compile \ + --num_repetitions 1 \ + --benchmark_output_directory results + + {% endif %} + + The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %} + + {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %} + + {% endfor %} + {% endfor %} + +Further reading +=============== + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- For a list of other ready-made Docker images for AI with ROCm, see `AMD + Infinity Hub + `__. 
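+.. tip::
+
+   If you launched the container without a mounted volume (Option 2 above), generated videos and
+   benchmark logs exist only on the container's filesystem. A minimal way to keep them is to copy
+   the results directory to the host before removing the container -- for example, assuming the
+   container is still running under the name ``pytorch-xdit`` and using the Hunyuan Video output
+   path shown above:
+
+   .. code-block:: shell
+
+      # Copy the results directory (generated video and benchmark logs) to the host.
+      # Adjust the source path for other models, for example /app/Wan2.1/results,
+      # /app/Wan2.2/results, or /app/Flux/results.
+      docker cp pytorch-xdit:/app/Hunyuanvideo/results ./xdit-results
+
+      # Inspect the copied outputs.
+      ls -lh ./xdit-results
+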
+ +Previous versions +================= + +See :doc:`xdit-history` to find documentation for previous releases +of xDiT diffusion inference performance testing. diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst new file mode 100644 index 000000000..a93c66c1e --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst @@ -0,0 +1,56 @@ +:orphan: + +************************************************************ +xDiT diffusion inference performance testing version history +************************************************************ + +This table lists previous versions of the ROCm xDiT diffusion inference performance +testing environment. For detailed information about available models for +benchmarking, see the version-specific documentation. + +.. list-table:: + :header-rows: 1 + + * - Docker image tag + - Components + - Resources + + * - ``rocm/pytorch-xdit:v25.11`` (latest) + - + * ROCm 7.10.0 preview + * TheRock 3e3f834 + * rccl d23d18f + * composable_kernel 2570462 + * rocm-libraries 0588f07 + * rocm-systems 473025a + * torch 73adac + * torchvision f5c6c2e + * triton 7416ffc + * accelerate 34c1779 + * aiter de14bec + * diffusers 40528e9 + * xfuser 83978b5 + * yunchang 2c9b712 + - + * :doc:`Documentation <../../xdit-diffusion-inference>` + * `Docker Hub `__ + + * - ``rocm/pytorch-xdit:v25.10`` + - + * ROCm 7.9.0 preview + * TheRock 7afbe45 + * rccl 9b04b2a + * composable_kernel b7a806f + * rocm-libraries f104555 + * rocm-systems 25922d0 + * torch 2.10.0a0+gite9c9017 + * torchvision 0.22.0a0+966da7e + * triton 3.5.0+git52e49c12 + * accelerate 1.11.0.dev0 + * aiter 0.1.5.post4.dev20+ga25e55e79 + * diffusers 0.36.0.dev0 + * xfuser 0.4.4 + * yunchang 0.6.3.post1 + - + * :doc:`Documentation ` + * `Docker Hub `__ diff --git a/docs/how-to/rocm-for-ai/inference/index.rst b/docs/how-to/rocm-for-ai/inference/index.rst index 6eb705141..f12054b59 100644 --- a/docs/how-to/rocm-for-ai/inference/index.rst +++ b/docs/how-to/rocm-for-ai/inference/index.rst @@ -27,3 +27,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram - :doc:`SGLang inference performance testing ` - :doc:`Deploying your model ` + +- :doc:`xDiT diffusion inference ` + diff --git a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst new file mode 100644 index 000000000..6e71d8431 --- /dev/null +++ b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst @@ -0,0 +1,388 @@ +.. meta:: + :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using + prebuilt and optimized docker images. + :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark + +************************ +xDiT diffusion inference +************************ + +.. _xdit-video-diffusion: + +.. 
datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
+   benchmarking diffusion-based video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
+   The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
+   and includes the following components:
+
+   .. dropdown:: Software components
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Software component
+           - Version
+
+         {% for component_name, component_version in docker.components.items() %}
+         * - {{ component_name }}
+           - {{ component_version }}
+         {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit `_.
+
+What's new
+==========
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   {% for item in docker.whats_new %}
+   * {{ item }}
+   {% endfor %}
+
+.. _xdit-video-diffusion-supported-models:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   {# Create a lookup for supported models #}
+   {% set supported_lookup = {} %}
+   {% for supported in docker.supported_models %}
+   {% set _ = supported_lookup.update({supported.group: supported.models}) %}
+   {% endfor %}
+
+   .. raw:: html
+
+      <!-- Model and variant selection dropdowns; element IDs, classes, and data attributes are placeholders. -->
+      <div class="model-select">
+        <label for="model-group-select">Model</label>
+        <select id="model-group-select">
+          {% for model_group in model_groups %}
+          {% if model_group.group in supported_lookup %}
+          <option value="{{ model_group.tag }}">{{ model_group.group }}</option>
+          {% endif %}
+          {% endfor %}
+        </select>
+      </div>
+      <div class="model-select">
+        <label for="model-variant-select">Variant</label>
+        <select id="model-variant-select">
+          {% for model_group in model_groups %}
+          {% if model_group.group in supported_lookup %}
+          {% set supported_models = supported_lookup[model_group.group] %}
+          {% set models = model_group.models %}
+          {% for model in models %}
+          {% if model.model in supported_models %}
+          {% if models|length % 3 == 0 %}
+          <option value="{{ model.page_tag }}" data-group="{{ model_group.tag }}">{{ model.model }}</option>
+          {% else %}
+          <option value="{{ model.page_tag }}" data-group="{{ model_group.tag }}">{{ model.model }}</option>
+          {% endif %}
+          {% endif %}
+          {% endfor %}
+          {% endif %}
+          {% endfor %}
+        </select>
+      </div>
+ + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.page_tag }} + + .. note:: + + To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ + or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an + external license agreement through a third party. + + {% endfor %} + {% endfor %} + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %} + + For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image. + Pull the image using the following command: + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + +Validate and benchmark +====================== + +Once the image has been downloaded you can follow these steps to +run benchmarks and generate outputs. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.page_tag}} + + The following commands are written for {{ model.model }}. + See :ref:`xdit-video-diffusion-supported-models` to switch to another available model. + + {% endfor %} + {% endfor %} + +Choose your setup method +------------------------ + +You can either use an existing Hugging Face cache or download the model fresh inside the container. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %} + {% set model_groups = data.xdit_diffusion_inference.model_groups%} + + {% for model_group in model_groups %} + {% for model in model_group.models %} + .. container:: model-doc {{model.page_tag}} + + .. tab-set:: + + .. tab-item:: Option 1: Use existing Hugging Face cache + + If you already have models downloaded on your host system, you can mount your existing cache. + + 1. Set your Hugging Face cache location. + + .. code-block:: shell + + export HF_HOME=/your/hf_cache/location + + 2. Download the model (if not already cached). + + .. code-block:: shell + + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + 3. Launch the container with mounted cache. + + .. 
code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e HSA_NO_SCRATCH_RECLAIM=1 \ + -e OMP_NUM_THREADS=16 \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + -e HF_HOME=/app/huggingface_models \ + -v $HF_HOME:/app/huggingface_models \ + {{ docker.pull_tag }} + + .. tab-item:: Option 2: Download inside container + + If you prefer to keep the container self-contained or don't have an existing cache. + + 1. Launch the container + + .. code-block:: shell + + docker run \ + -it --rm \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --user root \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --ipc=host \ + --network host \ + --privileged \ + --shm-size 128G \ + --name pytorch-xdit \ + -e HSA_NO_SCRATCH_RECLAIM=1 \ + -e OMP_NUM_THREADS=16 \ + -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + {{ docker.pull_tag }} + + 2. Inside the container, set the Hugging Face cache location and download the model. + + .. code-block:: shell + + export HF_HOME=/app/huggingface_models + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + .. warning:: + + Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume. + {% endfor %} + {% endfor %} + +Run inference +============= + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml + + {% set model_groups = data.xdit_diffusion_inference.model_groups%} + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.page_tag }} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. On the host machine, use this command to run the performance benchmark test on + the `{{model.model}} <{{ model.url }}>`_ model using one node. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{model.mad_tag}} \ + --keep-model-dir \ + --live-output + + MAD launches a Docker container with the name + ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the + model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv`` + and ``{{ model.mad_tag }}_serving.csv``. + + .. tab-item:: Standalone benchmarking + + To run the benchmarks for {{ model.model }}, use the following command: + + .. code-block:: shell + {% if model.model == "Hunyuan Video" %} + cd /app/Hunyuanvideo + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --model tencent/HunyuanVideo \ + --prompt "In the large cage, two puppies were wagging their tails at each other." 
\ + --height 720 --width 1280 --num_frames 129 \ + --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ + --ulysses_degree 8 \ + --enable_tiling --enable_slicing \ + --use_torch_compile \ + --bench_output results + {% endif %} + {% if model.model == "Wan2.1" %} + cd Wan2.1 + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --task i2v-14B \ + --size 720*1280 --frame_num 81 \ + --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \ + --image "/app/Wan2.1/examples/i2v_input.JPG" \ + --ulysses_size 8 --ring_size 1 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ + --offload_model 0 \ + --vae_dtype bfloat16 \ + --allow_tf32 \ + --compile + {% endif %} + {% if model.model == "Wan2.2" %} + cd Wan2.2 + mkdir results + + torchrun --nproc_per_node=8 run.py \ + --task i2v-A14B \ + --size 720*1280 --frame_num 81 \ + --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \ + --image "/app/Wan2.2/examples/i2v_input.JPG" \ + --ulysses_size 8 --ring_size 1 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \ + --offload_model 0 \ + --vae_dtype bfloat16 \ + --allow_tf32 \ + --compile + {% endif %} + + {% if model.model == "FLUX.1" %} + cd Flux + mkdir results + + torchrun --nproc_per_node=8 /app/Flux/run.py \ + --model black-forest-labs/FLUX.1-dev \ + --seed 42 \ + --prompt "A small cat" \ + --height 1024 \ + --width 1024 \ + --num_inference_steps 25 \ + --max_sequence_length 256 \ + --warmup_steps 5 \ + --no_use_resolution_binning \ + --ulysses_degree 8 \ + --use_torch_compile \ + --num_repetitions 1 \ + --benchmark_output_directory results + + {% endif %} + + The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %} + + {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %} + + {% endfor %} + {% endfor %} + +Previous versions +================= + +See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases +of xDiT diffusion inference performance testing. 
\ No newline at end of file diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 9f0d3954c..d46a111b6 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -117,6 +117,8 @@ subtrees: title: SGLang inference performance testing - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst title: SGLang distributed inference with Mooncake + - file: how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst + title: xDiT diffusion inference - file: how-to/rocm-for-ai/inference/deploy-your-model.rst title: Deploy your model