From c67fac78bdc1eb7ee010c2da581ecf9b31111742 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Mon, 29 Dec 2025 08:44:45 -0500 Subject: [PATCH] Update docs for xDiT diffusion inference 25.13 Docker release (#5820) * archive previous version * add xdit 25.13 * update history index * add perf results section --- docs/conf.py | 4 + .../xdit_25.10-inference-models.yaml | 2 +- .../xdit_25.11-inference-models.yaml | 2 +- .../xdit_25.12-inference-models.yaml | 91 ++++ .../inference/xdit-inference-models.yaml | 52 ++- .../previous-versions/xdit-25.12.rst | 411 ++++++++++++++++++ .../previous-versions/xdit-history.rst | 12 +- .../inference/xdit-diffusion-inference.rst | 75 +++- 8 files changed, 619 insertions(+), 30 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml create mode 100644 docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst diff --git a/docs/conf.py b/docs/conf.py index 42d494cee..9104017e5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -194,6 +194,10 @@ article_pages = [ {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml index 4aee5d0d5..d2e33f1b9 100644 --- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml @@ -1,7 +1,7 @@ xdit_diffusion_inference: docker: pull_tag: rocm/pytorch-xdit:v25.10 - docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit + docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e ROCm: 7.9.0 components: TheRock: 7afbe45 diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml index e88b4ef0b..f5959dc33 100644 --- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml @@ -2,7 +2,7 @@ xdit_diffusion_inference: docker: - version: v25-11 pull_tag: rocm/pytorch-xdit:v25.11 - docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit + docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216 ROCm: 7.10.0 supported_models: - group: Hunyuan Video diff --git 
a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml new file mode 100644 index 000000000..99d2cab6f --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml @@ -0,0 +1,91 @@ +docker: + pull_tag: rocm/pytorch-xdit:v25.12 + docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256 + ROCm: 7.10.0 + whats_new: + - "Adds T2V and TI2V support for Wan models." + - "Adds support for SD-3.5 T2I model." + components: + TheRock: + version: 3e3f834 + url: https://github.com/ROCm/TheRock + rccl: + version: d23d18f + url: https://github.com/ROCm/rccl + composable_kernel: + version: 2570462 + url: https://github.com/ROCm/composable_kernel + rocm-libraries: + version: 0588f07 + url: https://github.com/ROCm/rocm-libraries + rocm-systems: + version: 473025a + url: https://github.com/ROCm/rocm-systems + torch: + version: 73adac + url: https://github.com/pytorch/pytorch + torchvision: + version: f5c6c2e + url: https://github.com/pytorch/vision + triton: + version: 7416ffc + url: https://github.com/triton-lang/triton + accelerate: + version: 34c1779 + url: https://github.com/huggingface/accelerate + aiter: + version: de14bec + url: https://github.com/ROCm/aiter + diffusers: + version: 40528e9 + url: https://github.com/huggingface/diffusers + xfuser: + version: ccba9d5 + url: https://github.com/xdit-project/xDiT + yunchang: + version: 2c9b712 + url: https://github.com/feifeibear/long-context-attention + supported_models: + - group: Hunyuan Video + js_tag: hunyuan + models: + - model: Hunyuan Video + model_repo: tencent/HunyuanVideo + revision: refs/pr/18 + url: https://huggingface.co/tencent/HunyuanVideo + github: https://github.com/Tencent-Hunyuan/HunyuanVideo + mad_tag: pyt_xdit_hunyuanvideo + js_tag: hunyuan_tag + - group: Wan-AI + js_tag: wan + models: + - model: Wan2.1 + model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers + url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers + github: https://github.com/Wan-Video/Wan2.1 + mad_tag: pyt_xdit_wan_2_1 + js_tag: wan_21_tag + - model: Wan2.2 + model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers + url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers + github: https://github.com/Wan-Video/Wan2.2 + mad_tag: pyt_xdit_wan_2_2 + js_tag: wan_22_tag + - group: FLUX + js_tag: flux + models: + - model: FLUX.1 + model_repo: black-forest-labs/FLUX.1-dev + url: https://huggingface.co/black-forest-labs/FLUX.1-dev + github: https://github.com/black-forest-labs/flux + mad_tag: pyt_xdit_flux + js_tag: flux_1_tag + - group: Stable Diffusion + js_tag: stablediffusion + models: + - model: stable-diffusion-3.5-large + model_repo: stabilityai/stable-diffusion-3.5-large + url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large + github: https://github.com/Stability-AI/sd3.5 + mad_tag: pyt_xdit_sd_3_5 + js_tag: stable_diffusion_3_5_large_tag diff --git a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml index 8d462524a..67aadcd5a 100644 --- a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml +++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml @@ -1,46 +1,48 @@ docker: - pull_tag: rocm/pytorch-xdit:v25.12 - docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit - ROCm: 
7.10.0
+ pull_tag: rocm/pytorch-xdit:v25.13
+ docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
+ ROCm: 7.11.0
whats_new:
- - "Adds T2V and TI2V support for Wan models."
- - "Adds support for SD-3.5 T2I model."
+ - "Flux.1 Kontext support"
+ - "Flux.2 Dev support"
+ - "Flux FP8 GEMM support"
+ - "Hybrid FP8 attention support for Wan models"
components: TheRock:
- version: 3e3f834
+ version: 1728a81
url: https://github.com/ROCm/TheRock
rccl: version: d23d18f url: https://github.com/ROCm/rccl
composable_kernel:
- version: 2570462
+ version: ab0101c
url: https://github.com/ROCm/composable_kernel
rocm-libraries:
- version: 0588f07
+ version: a2f7c35
url: https://github.com/ROCm/rocm-libraries
rocm-systems:
- version: 473025a
+ version: 659737c
url: https://github.com/ROCm/rocm-systems
torch:
- version: 73adac
- url: https://github.com/pytorch/pytorch
+ version: 91be249
+ url: https://github.com/ROCm/pytorch
torchvision:
- version: f5c6c2e
+ version: b919bd0
url: https://github.com/pytorch/vision
triton:
- version: 7416ffc
- url: https://github.com/triton-lang/triton
+ version: a272dfa
+ url: https://github.com/ROCm/triton
accelerate:
- version: 34c1779
+ version: b521400f
url: https://github.com/huggingface/accelerate
aiter:
- version: de14bec
+ version: de14bec0
url: https://github.com/ROCm/aiter
diffusers:
- version: 40528e9
+ version: a1f36ee3e
url: https://github.com/huggingface/diffusers
xfuser:
- version: ccba9d5
+ version: adf2681
url: https://github.com/xdit-project/xDiT
yunchang: version: 2c9b712
@@ -80,7 +82,19 @@ docker:
github: https://github.com/black-forest-labs/flux
mad_tag: pyt_xdit_flux
js_tag: flux_1_tag
- - group: Stable Diffusion
+ - model: FLUX.1 Kontext
+ model_repo: black-forest-labs/FLUX.1-Kontext-dev
+ url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
+ github: https://github.com/black-forest-labs/flux
+ mad_tag: pyt_xdit_flux_kontext
+ js_tag: flux_1_kontext_tag
+ - model: FLUX.2
+ model_repo: black-forest-labs/FLUX.2-dev
+ url: https://huggingface.co/black-forest-labs/FLUX.2-dev
+ github: https://github.com/black-forest-labs/flux2
+ mad_tag: pyt_xdit_flux_2
+ js_tag: flux_2_tag
+ - group: StableDiffusion
js_tag: stablediffusion
models:
- model: stable-diffusion-3.5-large
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst
new file mode 100644 index 000000000..66d279e4e
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst
@@ -0,0 +1,411 @@
+:orphan:
+
+.. meta::
+ :description: Learn to validate diffusion model video and image generation on MI300X, MI350X, and MI355X accelerators using
+ prebuilt and optimized Docker images.
+ :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
+
+************************
+xDiT diffusion inference
+************************
+
+.. caution::
+
+ This documentation does not reflect the latest version of the ROCm xDiT diffusion
+ inference performance documentation. See
+ :doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
+ version.
+
+.. _xdit-video-diffusion-2512:
+
+..
datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+ {% set docker = data.docker %}
+
+ The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
+ a prebuilt, optimized environment based on `xDiT
+ <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
+ video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
+ and MI300X (gfx942) GPUs.
+
+ The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
+ and includes the following components:
+
+ .. dropdown:: Software components
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Software component
+ - Version
+
+ {% for component_name, component_data in docker.components.items() %}
+ * - `{{ component_name }} <{{ component_data.url }}>`_
+ - {{ component_data.version }}
+ {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
+
+What's new
+==========
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+ {% set docker = data.docker %}
+
+ {% for item in docker.whats_new %}
+ * {{ item }}
+ {% endfor %}
+
+.. _xdit-video-diffusion-supported-models-2512:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+ {% set docker = data.docker %}
+
+ .. raw:: html
+
<!-- Model/variant selector dropdowns (markup elided in this excerpt): a "Model"
     dropdown with one option per {{ model_group.group }} in docker.supported_models,
     and a "Variant" dropdown with one option per {{ model.model }} in each group;
     the option markup branches on {% if models|length % 3 == 0 %}. -->
+ + {% for model_group in docker.supported_models %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.js_tag }} + + .. note:: + + To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ + or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an + external license agreement through a third party. + + {% endfor %} + {% endfor %} + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml + + {% set docker = data.docker %} + + For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image. + Pull the image using the following command: + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + +Validate and benchmark +====================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml + + {% set docker = data.docker %} + + Once the image has been downloaded you can follow these steps to + run benchmarks and generate outputs. + + {% for model_group in docker.supported_models %} + {% for model in model_group.models %} + + .. container:: model-doc {{model.js_tag}} + + The following commands are written for {{ model.model }}. + See :ref:`xdit-video-diffusion-supported-models` to switch to another available model. + + {% endfor %} + {% endfor %} + +Choose your setup method +------------------------ + +You can either use an existing Hugging Face cache or download the model fresh inside the container. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml + + {% set docker = data.docker %} + + {% for model_group in docker.supported_models %} + {% for model in model_group.models %} + .. container:: model-doc {{model.js_tag}} + + .. tab-set:: + + .. tab-item:: Option 1: Use existing Hugging Face cache + + If you already have models downloaded on your host system, you can mount your existing cache. + + 1. Set your Hugging Face cache location. + + .. code-block:: shell + + export HF_HOME=/your/hf_cache/location + + 2. Download the model (if not already cached). + + .. code-block:: shell + + huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %} + + 3. Launch the container with mounted cache. + + .. 
code-block:: shell
+
+ docker run \
+ -it --rm \
+ --cap-add=SYS_PTRACE \
+ --security-opt seccomp=unconfined \
+ --user root \
+ --device=/dev/kfd \
+ --device=/dev/dri \
+ --group-add video \
+ --ipc=host \
+ --network host \
+ --privileged \
+ --shm-size 128G \
+ --name pytorch-xdit \
+ -e HSA_NO_SCRATCH_RECLAIM=1 \
+ -e OMP_NUM_THREADS=16 \
+ -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+ -e HF_HOME=/app/huggingface_models \
+ -v $HF_HOME:/app/huggingface_models \
+ {{ docker.pull_tag }}
+
+ .. tab-item:: Option 2: Download inside container
+
+ Use this option if you prefer to keep the container self-contained or don't have an existing cache.
+
+ 1. Launch the container.
+
+ .. code-block:: shell
+
+ docker run \
+ -it --rm \
+ --cap-add=SYS_PTRACE \
+ --security-opt seccomp=unconfined \
+ --user root \
+ --device=/dev/kfd \
+ --device=/dev/dri \
+ --group-add video \
+ --ipc=host \
+ --network host \
+ --privileged \
+ --shm-size 128G \
+ --name pytorch-xdit \
+ -e HSA_NO_SCRATCH_RECLAIM=1 \
+ -e OMP_NUM_THREADS=16 \
+ -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+ {{ docker.pull_tag }}
+
+ 2. Inside the container, set the Hugging Face cache location and download the model.
+
+ .. code-block:: shell
+
+ export HF_HOME=/app/huggingface_models
+ huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+ .. warning::
+
+ Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
+ {% endfor %}
+ {% endfor %}
+
+Run inference
+=============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+ {% set docker = data.docker %}
+
+ {% for model_group in docker.supported_models %}
+ {% for model in model_group.models %}
+
+ .. container:: model-doc {{ model.js_tag }}
+
+ .. tab-set::
+
+ .. tab-item:: MAD-integrated benchmarking
+
+ 1. Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
+ directory and install the required packages on the host machine.
+
+ .. code-block:: shell
+
+ git clone https://github.com/ROCm/MAD
+ cd MAD
+ pip install -r requirements.txt
+
+ 2. On the host machine, use this command to run the performance benchmark test on
+ the `{{model.model}} <{{ model.url }}>`_ model using one node.
+
+ .. code-block:: shell
+
+ export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+ madengine run \
+ --tags {{model.mad_tag}} \
+ --keep-model-dir \
+ --live-output
+
+ MAD launches a Docker container with the name
+ ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+ model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+ and ``{{ model.mad_tag }}_serving.csv``.
+
+ .. tab-item:: Standalone benchmarking
+
+ To run the benchmarks for {{ model.model }}, use the following command:
+
+ .. code-block:: shell
+ {% if model.model == "Hunyuan Video" %}
+ cd /app/Hunyuanvideo
+ mkdir results
+
+ torchrun --nproc_per_node=8 run.py \
+ --model {{ model.model_repo }} \
+ --prompt "In the large cage, two puppies were wagging their tails at each other."
\ + --height 720 --width 1280 --num_frames 129 \ + --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \ + --ulysses_degree 8 \ + --enable_tiling --enable_slicing \ + --use_torch_compile \ + --bench_output results + + {% endif %} + {% if model.model == "Wan2.1" %} + cd Wan + mkdir results + + torchrun --nproc_per_node=8 /app/Wan/run.py \ + --task i2v \ + --height 720 \ + --width 1280 \ + --model {{ model.model_repo }} \ + --img_file_path /app/Wan/i2v_input.JPG \ + --ulysses_degree 8 \ + --seed 42 \ + --num_frames 81 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --num_repetitions 1 \ + --num_inference_steps 40 \ + --use_torch_compile + + {% endif %} + {% if model.model == "Wan2.2" %} + cd Wan + mkdir results + + torchrun --nproc_per_node=8 /app/Wan/run.py \ + --task i2v \ + --height 720 \ + --width 1280 \ + --model {{ model.model_repo }} \ + --img_file_path /app/Wan/i2v_input.JPG \ + --ulysses_degree 8 \ + --seed 42 \ + --num_frames 81 \ + --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \ + --num_repetitions 1 \ + --num_inference_steps 40 \ + --use_torch_compile + + {% endif %} + + {% if model.model == "FLUX.1" %} + cd Flux + mkdir results + + torchrun --nproc_per_node=8 /app/Flux/run.py \ + --model {{ model.model_repo }} \ + --seed 42 \ + --prompt "A small cat" \ + --height 1024 \ + --width 1024 \ + --num_inference_steps 25 \ + --max_sequence_length 256 \ + --warmup_steps 5 \ + --no_use_resolution_binning \ + --ulysses_degree 8 \ + --use_torch_compile \ + --num_repetitions 50 + + {% endif %} + + {% if model.model == "stable-diffusion-3.5-large" %} + cd StableDiffusion3.5 + mkdir results + + torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \ + --model {{ model.model_repo }} \ + --num_inference_steps 28 \ + --prompt "A capybara holding a sign that reads Hello World" \ + --use_torch_compile \ + --pipefusion_parallel_degree 4 \ + --use_cfg_parallel \ + --num_repetitions 50 \ + --dtype torch.float16 \ + --output_path results + + {% endif %} + + The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %} + + {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. 
{% endif %}
+
+ {% endfor %}
+ {% endfor %}
+
+Previous versions
+=================
+
+See
+:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
+to find documentation for previous releases of xDiT diffusion inference
+performance testing.
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
index dd6158857..a8c1e924a 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
@@ -15,12 +15,20 @@ benchmarking, see the version-specific documentation.
- Components
- Resources
- * - ``rocm/pytorch-xdit:v25.12`` (latest)
+ * - ``rocm/pytorch-xdit:v25.13`` (latest)
+ -
+ * `ROCm 7.11.0 preview `__
+ * TheRock 1728a81
+ -
+ * :doc:`Documentation <../../xdit-diffusion-inference>`
+ * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef>`__
+
+ * - ``rocm/pytorch-xdit:v25.12``
-
* `ROCm 7.10.0 preview `__
* TheRock 3e3f834
-
- * :doc:`Documentation <../../xdit-diffusion-inference>`
+ * :doc:`Documentation <xdit-25.12>`
+ * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256>`__
* - ``rocm/pytorch-xdit:v25.11``
diff --git a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
index b9b1da113..d55412a2d 100644
--- a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
@@ -22,7 +22,7 @@ xDiT diffusion inference
The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_ and includes the following components:
- .. dropdown:: Software components
+ .. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }}
.. list-table:: :header-rows: 1
@@ -40,7 +40,6 @@ For preview and development releases, see `amdsiloai/pytorch-xdit `__
+page provides reference throughput and serving measurements for inferencing popular AI models.
+
+.. important::
+
+ The performance data presented in `Performance results with AMD ROCm
+ software
+ `__
+ only reflects the latest version of this inference benchmarking environment.
+ The listed measurements should not be interpreted as the peak performance
+ achievable by AMD Instinct GPUs or ROCm software.
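When comparing your own runs against those published numbers, it can help to first confirm exactly which image build you are benchmarking. The following is a minimal, hedged sketch using only standard Docker and PyTorch commands; it assumes the ``rocm/pytorch-xdit`` image used throughout this guide, and the digest it reports can be checked against the Docker Hub layer page linked from this documentation.

.. code-block:: shell

   # List locally pulled rocm/pytorch-xdit images together with their content digests.
   docker images --digests rocm/pytorch-xdit

   # Inside the running container: print the PyTorch build and the HIP/ROCm version it targets.
   python3 -c "import torch; print(torch.__version__, torch.version.hip)"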
+ System validation ================= @@ -311,7 +326,7 @@ Run inference {% endif %} {% if model.model == "Wan2.1" %} - cd Wan + cd /app/Wan mkdir results torchrun --nproc_per_node=8 /app/Wan/run.py \ @@ -330,7 +345,7 @@ Run inference {% endif %} {% if model.model == "Wan2.2" %} - cd Wan + cd /app/Wan mkdir results torchrun --nproc_per_node=8 /app/Wan/run.py \ @@ -350,7 +365,7 @@ Run inference {% endif %} {% if model.model == "FLUX.1" %} - cd Flux + cd /app/Flux mkdir results torchrun --nproc_per_node=8 /app/Flux/run.py \ @@ -369,8 +384,54 @@ Run inference {% endif %} + {% if model.model == "FLUX.1 Kontext" %} + cd /app/Flux + mkdir results + + torchrun --nproc_per_node=8 /app/Flux/run_usp.py \ + --model {{ model.model_repo }} \ + --seed 42 \ + --prompt "Add a cool hat to the cat" \ + --height 1024 \ + --width 1024 \ + --num_inference_steps 30 \ + --max_sequence_length 512 \ + --warmup_steps 5 \ + --no_use_resolution_binning \ + --ulysses_degree 8 \ + --use_torch_compile \ + --img_file_path /app/Flux/cat.png \ + --model_type flux_kontext \ + --guidance_scale 2.5 \ + --num_repetitions 25 + + {% endif %} + + {% if model.model == "FLUX.2" %} + cd /app/Flux + mkdir results + + torchrun --nproc_per_node=8 /app/Flux/run_usp.py \ + --model {{ model.model_repo }} \ + --seed 42 \ + --prompt "Add a cool hat to the cat" \ + --height 1024 \ + --width 1024 \ + --num_inference_steps 50 \ + --max_sequence_length 512 \ + --warmup_steps 5 \ + --no_use_resolution_binning \ + --ulysses_degree 8 \ + --use_torch_compile \ + --img_file_paths /app/Flux/cat.png \ + --model_type flux2 \ + --guidance_scale 4.0 \ + --num_repetitions 25 + + {% endif %} + {% if model.model == "stable-diffusion-3.5-large" %} - cd StableDiffusion3.5 + cd /app/StableDiffusion3.5 mkdir results torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \ @@ -386,7 +447,7 @@ Run inference {% endif %} - The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %} + The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %} {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
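A closing note on the ``run_usp.py`` alternative that the FLUX.1 section mentions: this patch only shows ``run_usp.py`` invocations for FLUX.1 Kontext and FLUX.2. The sketch below is hedged guesswork for plain FLUX.1 that simply mirrors the FLUX.1 ``run.py`` flags shown in the guide; whether ``run_usp.py`` also requires the ``--model_type`` selector used by the Kontext and FLUX.2 examples is an assumption to verify against the script's ``--help`` output before use.

.. code-block:: shell

   cd /app/Flux
   mkdir -p results

   # Sketch only: flags mirror the FLUX.1 run.py example and are not confirmed
   # for run_usp.py; check `python /app/Flux/run_usp.py --help` first.
   torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
       --model black-forest-labs/FLUX.1-dev \
       --seed 42 \
       --prompt "A small cat" \
       --height 1024 \
       --width 1024 \
       --num_inference_steps 25 \
       --max_sequence_length 256 \
       --warmup_steps 5 \
       --no_use_resolution_binning \
       --ulysses_degree 8 \
       --use_torch_compile \
       --num_repetitions 50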