Update docs for xDiT diffusion inference 25.13 Docker release (#5820)

* archive previous version

* add xdit 25.13

* update history index

* add perf results section
peterjunpark
2025-12-29 08:44:45 -05:00
committed by GitHub
parent e0b8ec4dfb
commit c67fac78bd
8 changed files with 619 additions and 30 deletions

View File

@@ -194,6 +194,10 @@ article_pages = [
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},

View File

@@ -1,7 +1,7 @@
xdit_diffusion_inference:
docker:
pull_tag: rocm/pytorch-xdit:v25.10
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e
ROCm: 7.9.0
components:
TheRock: 7afbe45

View File

@@ -2,7 +2,7 @@ xdit_diffusion_inference:
docker:
- version: v25-11
pull_tag: rocm/pytorch-xdit:v25.11
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216
ROCm: 7.10.0
supported_models:
- group: Hunyuan Video

View File

@@ -0,0 +1,91 @@
docker:
pull_tag: rocm/pytorch-xdit:v25.12
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256
ROCm: 7.10.0
whats_new:
- "Adds T2V and TI2V support for Wan models."
- "Adds support for SD-3.5 T2I model."
components:
TheRock:
version: 3e3f834
url: https://github.com/ROCm/TheRock
rccl:
version: d23d18f
url: https://github.com/ROCm/rccl
composable_kernel:
version: 2570462
url: https://github.com/ROCm/composable_kernel
rocm-libraries:
version: 0588f07
url: https://github.com/ROCm/rocm-libraries
rocm-systems:
version: 473025a
url: https://github.com/ROCm/rocm-systems
torch:
version: 73adac
url: https://github.com/pytorch/pytorch
torchvision:
version: f5c6c2e
url: https://github.com/pytorch/vision
triton:
version: 7416ffc
url: https://github.com/triton-lang/triton
accelerate:
version: 34c1779
url: https://github.com/huggingface/accelerate
aiter:
version: de14bec
url: https://github.com/ROCm/aiter
diffusers:
version: 40528e9
url: https://github.com/huggingface/diffusers
xfuser:
version: ccba9d5
url: https://github.com/xdit-project/xDiT
yunchang:
version: 2c9b712
url: https://github.com/feifeibear/long-context-attention
supported_models:
- group: Hunyuan Video
js_tag: hunyuan
models:
- model: Hunyuan Video
model_repo: tencent/HunyuanVideo
revision: refs/pr/18
url: https://huggingface.co/tencent/HunyuanVideo
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
mad_tag: pyt_xdit_hunyuanvideo
js_tag: hunyuan_tag
- group: Wan-AI
js_tag: wan
models:
- model: Wan2.1
model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
github: https://github.com/Wan-Video/Wan2.1
mad_tag: pyt_xdit_wan_2_1
js_tag: wan_21_tag
- model: Wan2.2
model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
github: https://github.com/Wan-Video/Wan2.2
mad_tag: pyt_xdit_wan_2_2
js_tag: wan_22_tag
- group: FLUX
js_tag: flux
models:
- model: FLUX.1
model_repo: black-forest-labs/FLUX.1-dev
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
github: https://github.com/black-forest-labs/flux
mad_tag: pyt_xdit_flux
js_tag: flux_1_tag
- group: Stable Diffusion
js_tag: stablediffusion
models:
- model: stable-diffusion-3.5-large
model_repo: stabilityai/stable-diffusion-3.5-large
url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
github: https://github.com/Stability-AI/sd3.5
mad_tag: pyt_xdit_sd_3_5
js_tag: stable_diffusion_3_5_large_tag

View File

@@ -1,46 +1,48 @@
docker:
pull_tag: rocm/pytorch-xdit:v25.12
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
ROCm: 7.10.0
pull_tag: rocm/pytorch-xdit:v25.13
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
ROCm: 7.11.0
whats_new:
- "Adds T2V and TI2V support for Wan models."
- "Adds support for SD-3.5 T2I model."
- "Flux.1 Kontext support"
- "Flux.2 Dev support"
- "Flux FP8 GEMM support"
- "Hybrid FP8 attention support for Wan models"
components:
TheRock:
version: 3e3f834
version: 1728a81
url: https://github.com/ROCm/TheRock
rccl:
version: d23d18f
url: https://github.com/ROCm/rccl
composable_kernel:
version: 2570462
version: ab0101c
url: https://github.com/ROCm/composable_kernel
rocm-libraries:
version: 0588f07
version: a2f7c35
url: https://github.com/ROCm/rocm-libraries
rocm-systems:
version: 473025a
version: 659737c
url: https://github.com/ROCm/rocm-systems
torch:
version: 73adac
url: https://github.com/pytorch/pytorch
version: 91be249
url: https://github.com/ROCm/pytorch
torchvision:
version: f5c6c2e
version: b919bd0
url: https://github.com/pytorch/vision
triton:
version: 7416ffc
url: https://github.com/triton-lang/triton
version: a272dfa
url: https://github.com/ROCm/triton
accelerate:
version: 34c1779
version: b521400f
url: https://github.com/huggingface/accelerate
aiter:
version: de14bec
version: de14bec0
url: https://github.com/ROCm/aiter
diffusers:
version: 40528e9
version: a1f36ee3e
url: https://github.com/huggingface/diffusers
xfuser:
version: ccba9d5
version: adf2681
url: https://github.com/xdit-project/xDiT
yunchang:
version: 2c9b712
@@ -80,7 +82,19 @@ docker:
github: https://github.com/black-forest-labs/flux
mad_tag: pyt_xdit_flux
js_tag: flux_1_tag
- group: Stable Diffusion
- model: FLUX.1 Kontext
model_repo: black-forest-labs/FLUX.1-Kontext-dev
url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
github: https://github.com/black-forest-labs/flux
mad_tag: pyt_xdit_flux_kontext
js_tag: flux_1_kontext_tag
- model: FLUX.2
model_repo: black-forest-labs/FLUX.2-dev
url: https://huggingface.co/black-forest-labs/FLUX.2-dev
github: https://github.com/black-forest-labs/flux2
mad_tag: pyt_xdit_flux_2
js_tag: flux_2_tag
- group: StableDiffusion
js_tag: stablediffusion
models:
- model: stable-diffusion-3.5-large

View File

@@ -0,0 +1,411 @@
:orphan:
.. meta::
:description: Learn to validate diffusion model video and image generation on MI300X, MI350X, and MI355X accelerators using
prebuilt, optimized Docker images.
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
************************
xDiT diffusion inference
************************
.. caution::
This documentation does not reflect the latest version of the ROCm xDiT
diffusion inference performance documentation. See
:doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
version.
.. _xdit-video-diffusion-2512:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
{% set docker = data.docker %}
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
a prebuilt, optimized environment based on `xDiT
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
and MI300X (gfx942) GPUs.
The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
and includes the following components:
.. dropdown:: Software components
.. list-table::
:header-rows: 1
* - Software component
- Version
{% for component_name, component_data in docker.components.items() %}
* - `{{ component_name }} <{{ component_data.url }}>`_
- {{ component_data.version }}
{% endfor %}
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
What's new
==========
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
{% set docker = data.docker %}
{% for item in docker.whats_new %}
* {{ item }}
{% endfor %}
.. _xdit-video-diffusion-supported-models-2512:
Supported models
================
The following models are supported for inference performance benchmarking.
Some instructions, commands, and recommendations in this documentation might
vary by model -- select one to get started.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
{% set docker = data.docker %}
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in docker.supported_models %}
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.js_tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in docker.supported_models %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
{% for model_group in docker.supported_models %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.js_tag }}
.. note::
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization,
granted through an external license agreement with a third party, before use.
{% endfor %}
{% endfor %}
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
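As a quick spot check, you can confirm the NUMA auto-balancing setting mentioned above before launching any containers. This is a minimal sketch; the linked system validation guide remains the authoritative procedure.
.. code-block:: shell
# A value of 0 means NUMA auto-balancing is disabled, the setting
# generally recommended for these benchmarks.
cat /proc/sys/kernel/numa_balancing
# Disable it if the file reports 1.
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'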
Pull the Docker image
=====================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
{% set docker = data.docker %}
For this tutorial, use the ``{{ docker.pull_tag }}`` Docker image.
Pull the image using the following command:
.. code-block:: shell
docker pull {{ docker.pull_tag }}
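Optionally, verify that the image is now available locally (an illustrative check):
.. code-block:: shell
docker images rocm/pytorch-xdit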
Validate and benchmark
======================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
{% set docker = data.docker %}
Once the image has been downloaded, you can follow these steps to
run benchmarks and generate outputs.
{% for model_group in docker.supported_models %}
{% for model in model_group.models %}
.. container:: model-doc {{model.js_tag}}
The following commands are written for {{ model.model }}.
See :ref:`xdit-video-diffusion-supported-models-2512` to switch to another available model.
{% endfor %}
{% endfor %}
Choose your setup method
------------------------
You can either use an existing Hugging Face cache or download the model fresh inside the container.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
{% set docker = data.docker %}
{% for model_group in docker.supported_models %}
{% for model in model_group.models %}
.. container:: model-doc {{model.js_tag}}
.. tab-set::
.. tab-item:: Option 1: Use existing Hugging Face cache
If you already have models downloaded on your host system, you can mount your existing cache.
1. Set your Hugging Face cache location.
.. code-block:: shell
export HF_HOME=/your/hf_cache/location
2. Download the model (if not already cached).
.. code-block:: shell
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
3. Launch the container with mounted cache.
.. code-block:: shell
docker run \
-it --rm \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--user root \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--ipc=host \
--network host \
--privileged \
--shm-size 128G \
--name pytorch-xdit \
-e HSA_NO_SCRATCH_RECLAIM=1 \
-e OMP_NUM_THREADS=16 \
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-e HF_HOME=/app/huggingface_models \
-v $HF_HOME:/app/huggingface_models \
{{ docker.pull_tag }}
.. tab-item:: Option 2: Download inside container
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
1. Launch the container.
.. code-block:: shell
docker run \
-it --rm \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--user root \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--ipc=host \
--network host \
--privileged \
--shm-size 128G \
--name pytorch-xdit \
-e HSA_NO_SCRATCH_RECLAIM=1 \
-e OMP_NUM_THREADS=16 \
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
{{ docker.pull_tag }}
2. Inside the container, set the Hugging Face cache location and download the model.
.. code-block:: shell
export HF_HOME=/app/huggingface_models
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
.. warning::
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume; one approach is sketched after these setup options.
{% endfor %}
{% endfor %}
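As noted in the warning above, a named Docker volume is one way to persist downloaded models across container runs without binding a host directory. A minimal sketch follows; the volume name ``xdit-hf-cache`` is illustrative.
.. code-block:: shell
# Create a named volume once; it survives container removal.
docker volume create xdit-hf-cache
# Reuse the Option 2 "docker run" command, adding these flags so the
# Hugging Face cache lives on the volume:
#   -e HF_HOME=/app/huggingface_models
#   -v xdit-hf-cache:/app/huggingface_models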
Run inference
=============
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
{% set docker = data.docker %}
{% for model_group in docker.supported_models %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.js_tag }}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
1. Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
2. On the host machine, use this command to run the performance benchmark test on
the `{{model.model}} <{{ model.url }}>`_ model using one node.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
and ``{{ model.mad_tag }}_serving.csv``.
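After a run completes, you can take a quick look at the throughput report from the host. This is an illustrative one-liner; quoting and column layout vary by model and MAD version.
.. code-block:: shell
# Render the comma-separated report as an aligned table.
column -s, -t < {{ model.mad_tag }}_throughput.csv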
.. tab-item:: Standalone benchmarking
To run the benchmarks for {{ model.model }}, use the following command:
.. code-block:: shell
{% if model.model == "Hunyuan Video" %}
cd /app/Hunyuanvideo
mkdir results
torchrun --nproc_per_node=8 run.py \
--model {{ model.model_repo }} \
--prompt "In the large cage, two puppies were wagging their tails at each other." \
--height 720 --width 1280 --num_frames 129 \
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
--ulysses_degree 8 \
--enable_tiling --enable_slicing \
--use_torch_compile \
--bench_output results
{% endif %}
{% if model.model == "Wan2.1" %}
cd Wan
mkdir results
torchrun --nproc_per_node=8 /app/Wan/run.py \
--task i2v \
--height 720 \
--width 1280 \
--model {{ model.model_repo }} \
--img_file_path /app/Wan/i2v_input.JPG \
--ulysses_degree 8 \
--seed 42 \
--num_frames 81 \
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
--num_repetitions 1 \
--num_inference_steps 40 \
--use_torch_compile
{% endif %}
{% if model.model == "Wan2.2" %}
cd Wan
mkdir results
torchrun --nproc_per_node=8 /app/Wan/run.py \
--task i2v \
--height 720 \
--width 1280 \
--model {{ model.model_repo }} \
--img_file_path /app/Wan/i2v_input.JPG \
--ulysses_degree 8 \
--seed 42 \
--num_frames 81 \
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
--num_repetitions 1 \
--num_inference_steps 40 \
--use_torch_compile
{% endif %}
{% if model.model == "FLUX.1" %}
cd Flux
mkdir results
torchrun --nproc_per_node=8 /app/Flux/run.py \
--model {{ model.model_repo }} \
--seed 42 \
--prompt "A small cat" \
--height 1024 \
--width 1024 \
--num_inference_steps 25 \
--max_sequence_length 256 \
--warmup_steps 5 \
--no_use_resolution_binning \
--ulysses_degree 8 \
--use_torch_compile \
--num_repetitions 50
{% endif %}
{% if model.model == "stable-diffusion-3.5-large" %}
cd StableDiffusion3.5
mkdir results
torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
--model {{ model.model_repo }} \
--num_inference_steps 28 \
--prompt "A capybara holding a sign that reads Hello World" \
--use_torch_compile \
--pipefusion_parallel_degree 4 \
--use_cfg_parallel \
--num_repetitions 50 \
--dtype torch.float16 \
--output_path results
{% endif %}
The generated output (video or image, depending on the model) is stored under the ``results`` directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %}
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py``, which implements USP (unified sequence parallelism) without modifying the default Diffusers pipeline. {% endif %}
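{% if model.model == "FLUX.1" %}
For reference, a ``run_usp.py`` invocation can mirror the ``run.py`` flags above. The following is a hedged sketch; the flags are assumed to match ``run.py``'s, so verify them against ``run_usp.py --help``.
.. code-block:: shell
torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
--model {{ model.model_repo }} \
--seed 42 \
--prompt "A small cat" \
--height 1024 \
--width 1024 \
--num_inference_steps 25 \
--max_sequence_length 256 \
--warmup_steps 5 \
--no_use_resolution_binning \
--ulysses_degree 8 \
--use_torch_compile \
--num_repetitions 50
{% endif %}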
{% endfor %}
{% endfor %}
Previous versions
=================
See
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
to find documentation for previous releases of xDiT diffusion inference
performance testing.

View File

@@ -15,12 +15,20 @@ benchmarking, see the version-specific documentation.
- Components
- Resources
* - ``rocm/pytorch-xdit:v25.12`` (latest)
* - ``rocm/pytorch-xdit:v25.13`` (latest)
-
* `ROCm 7.11.0 preview <https://rocm.docs.amd.com/en/7.11.0-preview/about/release-notes.html>`__
* TheRock 1728a81
-
* :doc:`Documentation <../../xdit-diffusion-inference>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef>`__
* - ``rocm/pytorch-xdit:v25.12``
-
* `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
* TheRock 3e3f834
-
* :doc:`Documentation <../../xdit-diffusion-inference>`
* :doc:`Documentation <xdit-25.12>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256>`__
* - ``rocm/pytorch-xdit:v25.11``

View File

@@ -22,7 +22,7 @@ xDiT diffusion inference
The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
and includes the following components:
.. dropdown:: Software components
.. dropdown:: Software components - {{ docker.pull_tag.split(':')|last }}
.. list-table::
:header-rows: 1
@@ -40,7 +40,6 @@ For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.d
What's new
==========
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
{% set docker = data.docker %}
@@ -105,6 +104,22 @@ vary by model -- select one to get started.
{% endfor %}
{% endfor %}
Performance measurements
========================
To evaluate performance, the `Performance results with AMD ROCm software
<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
page provides reference throughput and serving measurements for inferencing popular AI models.
.. important::
The performance data presented in `Performance results with AMD ROCm
software
<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance
achievable by AMD Instinct GPUs or ROCm software.
System validation
=================
@@ -311,7 +326,7 @@ Run inference
{% endif %}
{% if model.model == "Wan2.1" %}
cd Wan
cd /app/Wan
mkdir results
torchrun --nproc_per_node=8 /app/Wan/run.py \
@@ -330,7 +345,7 @@ Run inference
{% endif %}
{% if model.model == "Wan2.2" %}
cd Wan
cd /app/Wan
mkdir results
torchrun --nproc_per_node=8 /app/Wan/run.py \
@@ -350,7 +365,7 @@ Run inference
{% endif %}
{% if model.model == "FLUX.1" %}
cd Flux
cd /app/Flux
mkdir results
torchrun --nproc_per_node=8 /app/Flux/run.py \
@@ -369,8 +384,54 @@ Run inference
{% endif %}
{% if model.model == "FLUX.1 Kontext" %}
cd /app/Flux
mkdir results
torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
--model {{ model.model_repo }} \
--seed 42 \
--prompt "Add a cool hat to the cat" \
--height 1024 \
--width 1024 \
--num_inference_steps 30 \
--max_sequence_length 512 \
--warmup_steps 5 \
--no_use_resolution_binning \
--ulysses_degree 8 \
--use_torch_compile \
--img_file_path /app/Flux/cat.png \
--model_type flux_kontext \
--guidance_scale 2.5 \
--num_repetitions 25
{% endif %}
{% if model.model == "FLUX.2" %}
cd /app/Flux
mkdir results
torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
--model {{ model.model_repo }} \
--seed 42 \
--prompt "Add a cool hat to the cat" \
--height 1024 \
--width 1024 \
--num_inference_steps 50 \
--max_sequence_length 512 \
--warmup_steps 5 \
--no_use_resolution_binning \
--ulysses_degree 8 \
--use_torch_compile \
--img_file_paths /app/Flux/cat.png \
--model_type flux2 \
--guidance_scale 4.0 \
--num_repetitions 25
{% endif %}
{% if model.model == "stable-diffusion-3.5-large" %}
cd StableDiffusion3.5
cd /app/StableDiffusion3.5
mkdir results
torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
@@ -386,7 +447,7 @@ Run inference
{% endif %}
The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %}
The generated output (video or image, depending on the model) is stored under the ``results`` directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %}
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py``, which implements USP (unified sequence parallelism) without modifying the default Diffusers pipeline. {% endif %}