diff --git a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
index 67aadcd5a..8866b060d 100644
--- a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
@@ -1,14 +1,13 @@
 docker:
-  pull_tag: rocm/pytorch-xdit:v25.13
-  docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
+  pull_tag: rocm/pytorch-xdit:v26.1
+  docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
   ROCm: 7.11.0
   whats_new:
-    - "Flux.1 Kontext support"
-    - "Flux.2 Dev support"
-    - "Flux FP8 GEMM support"
-    - "Hybrid FP8 attention support for Wan models"
+    - "HunyuanVideo 1.5 support"
+    - "Z-Image Turbo support"
+    - "Wan model sharding"
   components:
-    TheRock:
+    TheRock:
       version: 1728a81
       url: https://github.com/ROCm/TheRock
     rccl:
@@ -39,10 +38,10 @@ docker:
       version: de14bec0
       url: https://github.com/ROCm/aiter
     diffusers:
-      version: a1f36ee3e
+      version: 6708f5
       url: https://github.com/huggingface/diffusers
     xfuser:
-      version: adf2681
+      version: 0a3d7a
       url: https://github.com/xdit-project/xDiT
     yunchang:
       version: 2c9b712
@@ -58,6 +57,49 @@ docker:
       github: https://github.com/Tencent-Hunyuan/HunyuanVideo
       mad_tag: pyt_xdit_hunyuanvideo
       js_tag: hunyuan_tag
+      benchmark_command:
+        - cd /app/Hunyuanvideo
+        - mkdir results
+        - 'torchrun --nproc_per_node=8 run.py \'
+        - '--model {model_repo} \'
+        - '--prompt "In the large cage, two puppies were wagging their tails at each other." \'
+        - '--batch_size 1 \'
+        - '--height 720 --width 1280 \'
+        - '--seed 1168860793 \'
+        - '--num_frames 129 \'
+        - '--num_inference_steps 50 \'
+        - '--warmup_steps 1 \'
+        - '--n_repeats 1 \'
+        - '--sleep_dur 10 \'
+        - '--ulysses_degree 8 \'
+        - '--enable_tiling --enable_slicing \'
+        - '--guidance_scale 6.0 \'
+        - '--use_torch_compile \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
+    - model: Hunyuan Video 1.5
+      model_repo: hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v
+      url: https://huggingface.co/hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v
+      github: https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5
+      mad_tag: pyt_xdit_hunyuanvideo_1_5
+      js_tag: hunyuan_1_5_tag
+      benchmark_command:
+        - cd /app/Hunyuanvideo_1_5
+        - mkdir results
+        - 'torchrun --nproc_per_node=8 /app/Hunyuanvideo_1_5/run.py \'
+        - '--model {model_repo} \'
+        - '--prompt "In the large cage, two puppies were wagging their tails at each other." \'
+        - '--task t2v \'
+        - '--height 720 --width 1280 \'
+        - '--seed 1168860793 \'
+        - '--num_frames 129 \'
+        - '--num_inference_steps 50 \'
+        - '--num_repetitions 1 \'
+        - '--ulysses_degree 8 \'
+        - '--enable_tiling --enable_slicing \'
+        - '--use_torch_compile \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
 - group: Wan-AI
   js_tag: wan
   models:
@@ -67,12 +109,48 @@ docker:
       github: https://github.com/Wan-Video/Wan2.1
       mad_tag: pyt_xdit_wan_2_1
       js_tag: wan_21_tag
+      benchmark_command:
+        - cd /app/Wan
+        - mkdir results
+        - 'torchrun --nproc_per_node=8 /app/Wan/run.py \'
+        - '--model {model_repo} \'
+        - '--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline''s intricate details and the refreshing atmosphere of the seaside." \'
+        - '--task i2v \'
+        - '--height 720 \'
+        - '--width 1280 \'
+        - '--img_file_path /app/Wan/i2v_input.JPG \'
+        - '--num_frames 81 \'
+        - '--ulysses_degree 8 \'
+        - '--seed 42 \'
+        - '--num_repetitions 1 \'
+        - '--num_inference_steps 40 \'
+        - '--use_torch_compile \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
     - model: Wan2.2
       model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
      url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
      github: https://github.com/Wan-Video/Wan2.2
      mad_tag: pyt_xdit_wan_2_2
      js_tag: wan_22_tag
+      benchmark_command:
+        - cd /app/Wan
+        - mkdir results
+        - 'torchrun --nproc_per_node=8 /app/Wan/run.py \'
+        - '--model {model_repo} \'
+        - '--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline''s intricate details and the refreshing atmosphere of the seaside." \'
+        - '--task i2v \'
+        - '--height 720 \'
+        - '--width 1280 \'
+        - '--img_file_path /app/Wan/i2v_input.JPG \'
+        - '--num_frames 81 \'
+        - '--ulysses_degree 8 \'
+        - '--seed 42 \'
+        - '--num_repetitions 1 \'
+        - '--num_inference_steps 40 \'
+        - '--use_torch_compile \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
 - group: FLUX
   js_tag: flux
   models:
@@ -82,18 +160,79 @@ docker:
       github: https://github.com/black-forest-labs/flux
       mad_tag: pyt_xdit_flux
       js_tag: flux_1_tag
+      benchmark_command:
+        - cd /app/Flux
+        - mkdir results
+        - 'torchrun --nproc_per_node=8 /app/Flux/run.py \'
+        - '--model {model_repo} \'
+        - '--seed 42 \'
+        - '--prompt "A small cat" \'
+        - '--height 1024 \'
+        - '--width 1024 \'
+        - '--num_inference_steps 25 \'
+        - '--max_sequence_length 256 \'
+        - '--warmup_steps 5 \'
+        - '--no_use_resolution_binning \'
+        - '--ulysses_degree 8 \'
+        - '--use_torch_compile \'
+        - '--guidance_scale 0.0 \'
+        - '--num_repetitions 50 \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
     - model: FLUX.1 Kontext
       model_repo: black-forest-labs/FLUX.1-Kontext-dev
      url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
      github: https://github.com/black-forest-labs/flux
      mad_tag: pyt_xdit_flux_kontext
      js_tag: flux_1_kontext_tag
+      benchmark_command:
+        - cd /app/Flux
+        - mkdir results
+        - 'torchrun --nproc_per_node=8 /app/Flux/run_usp.py \'
+        - '--model {model_repo} \'
+        - '--seed 42 \'
+        - '--prompt "Add a cool hat to the cat" \'
+        - '--height 1024 \'
+        - '--width 1024 \'
+        - '--num_inference_steps 30 \'
+        - '--max_sequence_length 512 \'
+        - '--warmup_steps 5 \'
+        - '--no_use_resolution_binning \'
+        - '--ulysses_degree 8 \'
+        - '--use_torch_compile \'
+        - '--img_file_path /app/Flux/cat.png \'
+        - '--model_type flux_kontext \'
+        - '--guidance_scale 2.5 \'
+        - '--num_repetitions 25 \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
     - model: FLUX.2
       model_repo: black-forest-labs/FLUX.2-dev
      url: https://huggingface.co/black-forest-labs/FLUX.2-dev
      github: https://github.com/black-forest-labs/flux2
      mad_tag: pyt_xdit_flux_2
      js_tag: flux_2_tag
+      benchmark_command:
+        - cd /app/Flux
+        - mkdir results
+        - 'torchrun --nproc_per_node=8 /app/Flux/run_usp.py \'
+        - '--model {model_repo} \'
+        - '--seed 42 \'
+        - '--prompt "Add a cool hat to the cat" \'
+        - '--height 1024 \'
+        - '--width 1024 \'
+        - '--num_inference_steps 50 \'
+        - '--max_sequence_length 512 \'
+        - '--warmup_steps 5 \'
+        - '--no_use_resolution_binning \'
+        - '--ulysses_degree 8 \'
+        - '--use_torch_compile \'
+        - '--img_file_paths /app/Flux/cat.png \'
+        - '--model_type flux2 \'
+        - '--guidance_scale 4.0 \'
+        - '--num_repetitions 25 \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
 - group: StableDiffusion
   js_tag: stablediffusion
   models:
@@ -103,3 +242,42 @@ docker:
       github: https://github.com/Stability-AI/sd3.5
       mad_tag: pyt_xdit_sd_3_5
       js_tag: stable_diffusion_3_5_large_tag
+      benchmark_command:
+        - cd /app/StableDiffusion3.5
+        - mkdir results
+        - 'torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \'
+        - '--model {model_repo} \'
+        - '--prompt "A capybara holding a sign that reads Hello World" \'
+        - '--num_repetitions 50 \'
+        - '--num_inference_steps 28 \'
+        - '--pipefusion_parallel_degree 4 \'
+        - '--use_cfg_parallel \'
+        - '--use_torch_compile \'
+        - '--dtype torch.float16 \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
+- group: Z-Image
+  js_tag: z_image
+  models:
+    - model: Z-Image Turbo
+      model_repo: Tongyi-MAI/Z-Image-Turbo
+      url: https://huggingface.co/Tongyi-MAI/Z-Image-Turbo
+      github: https://github.com/Tongyi-MAI/Z-Image
+      mad_tag: pyt_xdit_z_image_turbo
+      js_tag: z_image_turbo_tag
+      benchmark_command:
+        - cd /app/Z-Image
+        - mkdir results
+        - 'torchrun --nproc_per_node=2 /app/Z-Image/run.py \'
+        - '--model {model_repo} \'
+        - '--seed 42 \'
+        - '--prompt "A crowded beach" \'
+        - '--height 1088 \'
+        - '--width 1920 \'
+        - '--num_inference_steps 9 \'
+        - '--ulysses_degree 2 \'
+        - '--use_torch_compile \'
+        - '--guidance_scale 0.0 \'
+        - '--num_repetitions 50 \'
+        - '--attention_backend aiter \'
+        - '--benchmark_output_directory results'
diff --git a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
index c22c7df46..ac7dfc67d 100644
--- a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
@@ -13,15 +13,10 @@ xDiT diffusion inference
    {% set docker = data.docker %}
 
-   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
-   a prebuilt, optimized environment based on `xDiT
-   <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
-   video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
-   and MI300X (gfx942) GPUs.
-
-   The image runs a preview version of ROCm using the new `TheRock
-   <https://github.com/ROCm/TheRock>`__ build system and includes the following
-   components:
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
+   benchmarking video and image generation with diffusion models on gfx942- and gfx950-series GPUs (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X).
+   The image runs ROCm **{{ docker.ROCm }}** (preview), built with `TheRock <https://github.com/ROCm/TheRock>`_,
+   and includes the following components:
 
    .. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }}
 
@@ -105,22 +100,6 @@ vary by model -- select one to get started.
    {% endfor %}
 {% endfor %}
 
-Performance measurements
-========================
-
-To evaluate performance, the `Performance results with AMD ROCm software
-`__
-page provides reference throughput and serving measurements for inferencing popular AI models.
-
-.. important::
-
-   The performance data presented in `Performance results with AMD ROCm
-   software
-   `__
-   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance
-   achievable by AMD Instinct GPUs or ROCm software.
-
 System validation
 =================
 
@@ -300,7 +279,7 @@ Run inference
             --tags {{model.mad_tag}} \
             --keep-model-dir \
             --live-output
-
+
          MAD launches a Docker container with the name ``container_ci-{{model.mad_tag}}``.
          The throughput and serving reports of the model are collected in the following paths:
          ``{{ model.mad_tag }}_throughput.csv``
@@ -311,152 +290,15 @@ Run inference
          To run the benchmarks for {{ model.model }}, use the following command:
 
          .. code-block:: shell
 
-            {% if model.model == "Hunyuan Video" %}
-            cd /app/Hunyuanvideo
-            mkdir results
-            torchrun --nproc_per_node=8 run.py \
-            --model {{ model.model_repo }} \
-            --prompt "In the large cage, two puppies were wagging their tails at each other." \
-            --height 720 --width 1280 --num_frames 129 \
-            --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
-            --ulysses_degree 8 \
-            --enable_tiling --enable_slicing \
-            --use_torch_compile \
-            --bench_output results
+            {{ model.benchmark_command
+            | map('replace', '{model_repo}', model.model_repo)
+            | map('trim')
+            | join('\n            ') }}
 
-            {% endif %}
-            {% if model.model == "Wan2.1" %}
-            cd /app/Wan
-            mkdir results
-
-            torchrun --nproc_per_node=8 /app/Wan/run.py \
-            --task i2v \
-            --height 720 \
-            --width 1280 \
-            --model {{ model.model_repo }} \
-            --img_file_path /app/Wan/i2v_input.JPG \
-            --ulysses_degree 8 \
-            --seed 42 \
-            --num_frames 81 \
-            --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
-            --num_repetitions 1 \
-            --num_inference_steps 40 \
-            --use_torch_compile
-
-            {% endif %}
-            {% if model.model == "Wan2.2" %}
-            cd /app/Wan
-            mkdir results
-
-            torchrun --nproc_per_node=8 /app/Wan/run.py \
-            --task i2v \
-            --height 720 \
-            --width 1280 \
-            --model {{ model.model_repo }} \
-            --img_file_path /app/Wan/i2v_input.JPG \
-            --ulysses_degree 8 \
-            --seed 42 \
-            --num_frames 81 \
-            --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
-            --num_repetitions 1 \
-            --num_inference_steps 40 \
-            --use_torch_compile
-
-            {% endif %}
-
-            {% if model.model == "FLUX.1" %}
-            cd /app/Flux
-            mkdir results
-
-            torchrun --nproc_per_node=8 /app/Flux/run.py \
-            --model {{ model.model_repo }} \
-            --seed 42 \
-            --prompt "A small cat" \
-            --height 1024 \
-            --width 1024 \
-            --num_inference_steps 25 \
-            --max_sequence_length 256 \
-            --warmup_steps 5 \
-            --no_use_resolution_binning \
-            --ulysses_degree 8 \
-            --use_torch_compile \
-            --num_repetitions 50
-
-            {% endif %}
-
-            {% if model.model == "FLUX.1 Kontext" %}
-            cd /app/Flux
-            mkdir results
-
-            torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
-            --model {{ model.model_repo }} \
-            --seed 42 \
-            --prompt "Add a cool hat to the cat" \
-            --height 1024 \
-            --width 1024 \
-            --num_inference_steps 30 \
-            --max_sequence_length 512 \
-            --warmup_steps 5 \
-            --no_use_resolution_binning \
-            --ulysses_degree 8 \
-            --use_torch_compile \
-            --img_file_path /app/Flux/cat.png \
-            --model_type flux_kontext \
-            --guidance_scale 2.5 \
-            --num_repetitions 25
-
-            {% endif %}
-
-            {% if model.model == "FLUX.2" %}
-            cd /app/Flux
-            mkdir results
-
-            torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
-            --model {{ model.model_repo }} \
-            --seed 42 \
-            --prompt "Add a cool hat to the cat" \
-            --height 1024 \
-            --width 1024 \
-            --num_inference_steps 50 \
-            --max_sequence_length 512 \
-            --warmup_steps 5 \
-            --no_use_resolution_binning \
-            --ulysses_degree 8 \
-            --use_torch_compile \
-            --img_file_paths /app/Flux/cat.png \
-            --model_type flux2 \
-            --guidance_scale 4.0 \
-            --num_repetitions 25
-
-            {% endif %}
-
-            {% if model.model == "stable-diffusion-3.5-large" %}
-            cd /app/StableDiffusion3.5
-            mkdir results
-
-            torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
-            --model {{ model.model_repo }} \
-            --num_inference_steps 28 \
-            --prompt "A capybara holding a sign that reads Hello World" \
-            --use_torch_compile \
-            --pipefusion_parallel_degree 4 \
-            --use_cfg_parallel \
-            --num_repetitions 50 \
-            --dtype torch.float16 \
-            --output_path results
-
-            {% endif %}
-
-         The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %}
+         The generated video or image is stored in the ``results`` directory. {% if model.model == "FLUX.1" %}You can also use ``run_usp.py``, which implements USP (Unified Sequence Parallelism) without modifying the default Diffusers pipeline. {% endif %}
 
    {% endfor %}
 {% endfor %}
-
-Previous versions
-=================
-
-See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases
-of xDiT diffusion inference performance testing.
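
The new ``benchmark_command`` lists are rendered into the per-model shell snippets by the Jinja filter chain added in ``xdit-diffusion-inference.rst``. The following is a minimal sketch of that rendering step using only standard Jinja2 filters; the ``model`` dictionary is a trimmed, hypothetical stand-in for a single YAML entry, not the full data file, and the three-space join separator stands in for whatever indentation the real template matches to its ``.. code-block:: shell`` directive.

.. code-block:: python

   from jinja2 import Template

   # Hypothetical, trimmed stand-in for one model entry in
   # xdit-inference-models.yaml -- only the fields the template touches.
   model = {
       "model_repo": "Tongyi-MAI/Z-Image-Turbo",
       "benchmark_command": [
           "cd /app/Z-Image",
           "mkdir results",
           "torchrun --nproc_per_node=2 /app/Z-Image/run.py \\",
           "--model {model_repo} \\",
           "--num_inference_steps 9 \\",
           "--benchmark_output_directory results",
       ],
   }

   # The same filter chain used in the .rst template: substitute the repository
   # name into the '{model_repo}' placeholder, trim stray whitespace, and join
   # the lines with a newline plus the code-block indentation.
   template = Template(
       "{{ model.benchmark_command"
       " | map('replace', '{model_repo}', model.model_repo)"
       " | map('trim')"
       " | join('\\n   ') }}"
   )
   print(template.render(model=model))

Building the docs with Sphinx remains the authoritative check; this only mirrors the template logic so the substitution and trimming behavior can be inspected without a full docs build.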
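Because the page template dereferences several fields on every model entry (``model``, ``model_repo``, ``url``, ``github``, ``mad_tag``, ``js_tag``, and now ``benchmark_command``), a quick structural check of the YAML can catch a missing key before a docs build does. The sketch below is hypothetical tooling, not part of this change set; it keys on ``mad_tag``, which every model entry visible in this diff carries, so it does not need to assume the exact nesting of the group lists.

.. code-block:: python

   import sys

   import yaml  # PyYAML

   # Fields that xdit-diffusion-inference.rst dereferences for each model entry.
   REQUIRED = {"model", "model_repo", "url", "github", "mad_tag", "js_tag", "benchmark_command"}


   def iter_model_entries(node):
       """Yield every mapping that looks like a model entry, wherever it nests."""
       if isinstance(node, dict):
           if "mad_tag" in node:
               yield node
           for value in node.values():
               yield from iter_model_entries(value)
       elif isinstance(node, list):
           for item in node:
               yield from iter_model_entries(item)


   def main(path):
       with open(path) as handle:
           data = yaml.safe_load(handle)
       failures = 0
       for entry in iter_model_entries(data):
           name = entry.get("model", "<unnamed>")
           missing = REQUIRED - entry.keys()
           if missing:
               print(f"{name}: missing {sorted(missing)}")
               failures += 1
           # The template substitutes '{model_repo}' into each command line,
           # so the placeholder is only safe when model_repo is present.
           commands = entry.get("benchmark_command") or []
           if any("{model_repo}" in line for line in commands) and "model_repo" not in entry:
               print(f"{name}: uses the {{model_repo}} placeholder but has no model_repo")
               failures += 1
       return failures


   if __name__ == "__main__":
       sys.exit(1 if main(sys.argv[1]) else 0)

Invoked as, for example, ``python check_xdit_models.py docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml`` (script name illustrative), it exits nonzero when any entry is missing a field the template expects.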