From 2d79b3c4bdfd7ff6befd509eab616d3bd6a385c2 Mon Sep 17 00:00:00 2001 From: amd-hsivasun Date: Tue, 30 Sep 2025 17:34:45 +0000 Subject: [PATCH 01/24] [Ex CI] Added rocm-cmake dependency --- .azuredevops/components/hipBLASLt.yml | 1 + .azuredevops/components/hipSPARSELt.yml | 2 ++ .azuredevops/components/rocBLAS.yml | 1 + 3 files changed, 4 insertions(+) diff --git a/.azuredevops/components/hipBLASLt.yml b/.azuredevops/components/hipBLASLt.yml index ba324cddf..0eea974d5 100644 --- a/.azuredevops/components/hipBLASLt.yml +++ b/.azuredevops/components/hipBLASLt.yml @@ -77,6 +77,7 @@ parameters: - clr - hipBLAS-common - llvm-project + - rocm-cmake - rocminfo - rocm_smi_lib - rocprofiler-register diff --git a/.azuredevops/components/hipSPARSELt.yml b/.azuredevops/components/hipSPARSELt.yml index 0544d019f..35f871c71 100644 --- a/.azuredevops/components/hipSPARSELt.yml +++ b/.azuredevops/components/hipSPARSELt.yml @@ -54,6 +54,7 @@ parameters: - hipSPARSE - llvm-project - rocBLAS + - rocm-cmake - rocm_smi_lib - rocminfo - rocprofiler-register @@ -67,6 +68,7 @@ parameters: - llvm-project - hipBLAS-common - hipBLASLt + - rocm-cmake - rocBLAS - rocminfo - rocprofiler-register diff --git a/.azuredevops/components/rocBLAS.yml b/.azuredevops/components/rocBLAS.yml index ab6765a07..603cbe65b 100644 --- a/.azuredevops/components/rocBLAS.yml +++ b/.azuredevops/components/rocBLAS.yml @@ -70,6 +70,7 @@ parameters: - hipBLAS-common - hipBLASLt - llvm-project + - rocm-cmake - rocminfo - rocprofiler-register - rocm_smi_lib From 2e1b4dd5ee5cb2e84b7ad9414ecddf12f3e8bda9 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Tue, 30 Sep 2025 14:53:38 -0400 Subject: [PATCH 02/24] Add multi-node setup instructions for training perf Dockers (#5449) --------- Co-authored-by: Jeffrey Novotny --- .wordlist.txt | 1 + .../benchmark-docker/pytorch-inference.rst | 4 +- .../benchmark-docker/sglang-distributed.rst | 6 +- .../inference/benchmark-docker/vllm.rst | 12 +- docs/how-to/rocm-for-ai/install.rst | 2 +- .../how-to/rocm-for-ai/system-setup/index.rst | 40 +++ .../system-setup/multi-node-setup.rst | 320 ++++++++++++++++++ .../prerequisite-system-validation.rst | 2 - .../system-health-check.rst | 10 +- .../training/benchmark-docker/jax-maxtext.rst | 90 +---- .../training/benchmark-docker/megatron-lm.rst | 86 +---- .../benchmark-docker/primus-megatron.rst | 27 +- .../benchmark-docker/primus-pytorch.rst | 10 +- .../benchmark-docker/pytorch-training.rst | 15 +- docs/sphinx/_toc.yml.in | 11 +- 15 files changed, 444 insertions(+), 192 deletions(-) create mode 100644 docs/how-to/rocm-for-ai/system-setup/index.rst create mode 100644 docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst rename docs/how-to/rocm-for-ai/{training => system-setup}/prerequisite-system-validation.rst (99%) rename docs/how-to/rocm-for-ai/{ => system-setup}/system-health-check.rst (96%) diff --git a/.wordlist.txt b/.wordlist.txt index 982320da9..70cdba47a 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -43,6 +43,7 @@ Blit Blockwise Bluefield Bootloader +Broadcom CAS CCD CDNA diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst index ad8db53c4..21ee1b647 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst @@ -16,7 +16,7 @@ PyTorch inference performance testing The `ROCm PyTorch Docker `_ image offers a prebuilt, optimized environment 
for testing model inference performance on AMD Instinct™ MI300X series - accelerators. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD) + GPUs. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD) tool with the ROCm PyTorch container to test inference performance on various models efficiently. .. _pytorch-inference-benchmark-available-models: @@ -175,7 +175,7 @@ Further reading - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. - To learn more about system settings and management practices to configure your system for - AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. + AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization `_. - For application performance optimization strategies for HPC and AI workloads, including inference with vLLM, see :doc:`../../inference-optimization/workload`. diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst index 2828dba95..17e5ea54b 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst @@ -23,7 +23,7 @@ improved efficiency and throughput. serving engine for large language models (LLMs) and vision models. The ROCm-enabled `SGLang base Docker image <{{ docker.docker_hub_url }}>`__ bundles SGLang with PyTorch, which is optimized for AMD Instinct MI300X series - accelerators. It includes the following software components: + GPUs. It includes the following software components: .. list-table:: :header-rows: 1 @@ -37,7 +37,7 @@ improved efficiency and throughput. {% endfor %} The following guides on setting up and running SGLang and Mooncake for disaggregated -distributed inference on a Slurm cluster using AMD Instinct MI300X series accelerators backed by +distributed inference on a Slurm cluster using AMD Instinct MI300X series GPUs backed by Mellanox CX-7 NICs. Prerequisites @@ -236,7 +236,7 @@ Further reading - See the base upstream Docker image on `Docker Hub `__. - To learn more about system settings and management practices to configure your system for - MI300X series accelerators, see `AMD Instinct MI300X system optimization `__. + MI300X series GPUs, see `AMD Instinct MI300X system optimization `__. - For application performance optimization strategies for HPC and AI workloads, including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`. diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst index ac7bf7fb8..70121c67f 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst @@ -14,9 +14,9 @@ vLLM inference performance testing The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a prebuilt, optimized environment for validating large language model (LLM) - inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM + inference performance on AMD Instinct™ MI300X series GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for MI300X series - accelerators and includes the following components: + GPUs and includes the following components: .. 
list-table:: :header-rows: 1 @@ -31,7 +31,7 @@ vLLM inference performance testing With this Docker image, you can quickly test the :ref:`expected inference performance numbers ` for -MI300X series accelerators. +MI300X series GPUs. What's new ========== @@ -101,7 +101,7 @@ Supported models See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model. Some models require access authorization prior to use via an external license agreement through a third party. {% if model.precision == "float8" and model.model_repo.startswith("amd") %} - This model uses FP8 quantization via `AMD Quark `__ for efficient inference on AMD accelerators. + This model uses FP8 quantization via `AMD Quark `__ for efficient inference on AMD GPUs. {% endif %} {% endfor %} @@ -121,7 +121,7 @@ page provides reference throughput and serving measurements for inferencing popu The performance data presented in `Performance results with AMD ROCm software `_ only reflects the latest version of this inference benchmarking environment. - The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software. System validation ================= @@ -423,7 +423,7 @@ Further reading - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. - To learn more about system settings and management practices to configure your system for - AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. + AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization `_. - See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for a brief introduction to vLLM and optimization strategies. diff --git a/docs/how-to/rocm-for-ai/install.rst b/docs/how-to/rocm-for-ai/install.rst index 218f97145..ff7011e2d 100644 --- a/docs/how-to/rocm-for-ai/install.rst +++ b/docs/how-to/rocm-for-ai/install.rst @@ -57,4 +57,4 @@ Next steps ========== After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks -to test the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started. +to test the optimal performance of your AMD hardware. See :doc:`system-setup/index` to get started. diff --git a/docs/how-to/rocm-for-ai/system-setup/index.rst b/docs/how-to/rocm-for-ai/system-setup/index.rst new file mode 100644 index 000000000..466c1ba2f --- /dev/null +++ b/docs/how-to/rocm-for-ai/system-setup/index.rst @@ -0,0 +1,40 @@ +.. meta:: + :description: System setup and validation steps for AI training and inference on ROCm + :keywords: AMD Instinct, ROCm, GPU, AI, training, inference, benchmarking, performance, validation + +************************************* +System setup for AI workloads on ROCm +************************************* + +Before you begin training or inference on AMD Instinct™ GPUs, complete +the following system setup and validation steps to ensure optimal performance. + +Prerequisite system validation +============================== + +First, confirm that your system meets all software and hardware prerequisites. +See :doc:`prerequisite-system-validation`. + +Docker images for AMD Instinct GPUs +=================================== + +AMD provides prebuilt Docker images for AMD Instinct™ MI300X and MI325X +GPUs. 
These images include ROCm-enabled deep learning frameworks and +essential software components. They support single-node and multi-node configurations +and are ready for training and inference workloads out of the box. + +Multi-node training +------------------- + +For instructions on enabling multi-node training, see :doc:`multi-node-setup`. + +System optimization and validation +================================== + +Before running workloads, verify that the system is configured correctly and +operating at peak efficiency. Recommended steps include: + +- Disabling NUMA auto-balancing +- Running system benchmarks to validate hardware performance + +For details on running system health checks, see :doc:`system-health-check`. diff --git a/docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst b/docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst new file mode 100644 index 000000000..739a9c8e8 --- /dev/null +++ b/docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst @@ -0,0 +1,320 @@ +.. meta:: + :description: Multi-node setup for AI training + :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training + +.. _rocm-for-ai-multi-node-setup: + +********************************* +Multi-node setup for AI workloads +********************************* + +AMD provides ready-to-use Docker images for AMD Instinct™ MI300X and MI325X +GPUs containing ROCm-capable deep learning frameworks and essential +software components. These Docker images can run and leverage multiple nodes if +they are available. This page describes how to enable the multi-node training +of AI workloads on AMD Instinct GPUs. + +Prerequisites +============= + +Before starting, ensure your environment meets the following requirements: + +* Multi-node networking: your cluster should have a configured multi-node network. For setup + instructions, see the `Multi-node network configuration for AMD Instinct + accelerators + `__ + guide in the Instinct documentation. + +* ROCm Docker container to simplify environment setup for AI workloads. See the following resources to get started: + + * :doc:`Training a model with Megatron-LM and ROCm <../training/benchmark-docker/megatron-lm>` + + * :doc:`Training a model with PyTorch and ROCm <../training/benchmark-docker/pytorch-training>` + + * :doc:`Training a model with JAX MaxText and ROCm <../training/benchmark-docker/jax-maxtext>` + +* Slurm workload manager to run the :ref:`provided examples `. + +Install required packages +========================= + +To run multi-node workloads, ensure you have all the required packages installed based on your +network device. For example, on Ubuntu systems: + +.. code-block:: shell + + apt install -y iproute2 + + apt install -y linux-headers-"$(uname -r)" libelf-dev + + apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev + +Compile and install the RoCE library +------------------------------------ + +If you're using Broadcom NICs, you need to compile and install the RoCE (RDMA +over Converged Ethernet) library. See `RoCE cluster network configuration guide +for AMD Instinct accelerators +`__ +for more information. + +See the `Ethernet networking guide for AMD +Instinct MI300X GPU clusters: Compiling Broadcom NIC software from source +`_ for more details. 
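Before building anything, it helps to record the RoCE driver and library
versions already present on the host, since the version you build in the
container must match them. The following is a minimal sketch, not part of the
official build steps; it assumes the ``bnxt_re`` kernel module is loaded on
the host and that the standard InfiniBand userspace tools are installed.

.. code-block:: shell

   # Kernel driver version loaded on the host
   modinfo bnxt_re | grep -i ^version

   # Userspace RoCE libraries already installed on the host
   ls -l /usr/lib64/libbnxt_re-rdmav*.so* /usr/lib/*/libbnxt_re-rdmav*.so* 2>/dev/null

   # Confirm the NICs are visible to the verbs layer
   ibv_devinfo | grep -E 'hca_id|fw_ver'

Compare the recorded versions against the library you build in the steps
below before running ``make install``.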
+ +.. important:: + + It is crucial to install the exact same version of the RoCE library that + is installed on your host system. Also, ensure that the path to these + libraries on the host is correctly mounted into your Docker container. + Failure to do so can lead to compatibility issues and communication + failures. + +1. Set ``BUILD_DIR`` to the path on the host system where the Broadcom drivers and ``bnxt_rocelib`` source are located. + Then, navigate to the ``bnxt_rocelib`` directory. + + .. code-block:: shell + + export BUILD_DIR=/path/to/your/broadcom_drivers_on_host + cd $BUILD_DIR/drivers_linux/bnxt_rocelib/ + +2. The ``bnxt_rocelib`` directory contains a version of ``libbnxt_re`` in a zipped ``.tar.gz`` file. + + .. code-block:: shell + + tar -xf libbnxt_re-a.b.c.d.tar.gz + cd libbnxt_re-a.b.c.d + +3. Compile and install the RoCE library. + + .. code-block:: shell + + sh autogen.sh + ./configure + make + find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; + make install all + sh -c "echo /usr/local/lib >> /etc/ld.so.conf" + ldconfig + cp -f bnxt_re.driver /etc/libibverbs.d/ + find . -name "*.so" -exec md5sum {} \; + BUILT_MD5SUM=$(find . -name "libbnxt_re-rdmav*.so" -exec md5sum {} \; | cut -d " " -f 1) + +Environment setup +================= + +Before running multi-node workloads, set these essential environment variables: + +Master address +-------------- + +By default, ``localhost`` is used for single-node configurations. Change +``localhost`` to the master node's resolvable hostname or IP address: + +.. code-block:: bash + + export MASTER_ADDR="${MASTER_ADDR:-localhost}" + +Number of nodes +--------------- + +Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``): + +.. code-block:: bash + + export NNODES="${NNODES:-}" + +Node ranks +---------- + +Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on). +Node ranks should be unique across all nodes in the cluster. + +.. code-block:: bash + + export NODE_RANK="${NODE_RANK:-}" + +Network interface +----------------- + +Update the network interface in the script to match your system's network interface. To +find your network interface, run the following (outside of any Docker container): + +.. code-block:: bash + + ip a + +Look for an active interface (status "UP") with an IP address in the same subnet as +your other nodes. Then, update the following variable in the script, for +example: + +.. code-block:: bash + + export NCCL_SOCKET_IFNAME=ens50f0np0 + +This variable specifies which network interface to use for inter-node communication. +Setting this variable to the incorrect interface can result in communication failures +or significantly reduced performance. + +.. tip:: + + This command sets ``NCCL_SOCKET_IFNAME``'s value to the last RDMA interface. + + .. code-block:: bash + + export NCCL_SOCKET_IFNAME=$(rdma link show | awk '{print $NF}' | sort | tail -n1) + +RDMA/IB interface +----------------- + +Set the RDMA interfaces to be used for communication. NICs can come from different vendors and the names of the RDMA interface can be different. To get the list of all the RDMA/IB devices, run: + +.. code-block:: bash + + ibv_devices + +The command below gets the list of all RDMA/IB devices and puts them in a +comma-separated format. If +(``rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7``) are your RDMA +interfaces, then set: + +.. 
code-block:: bash

   # If using Broadcom NIC
   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
   # If using Mellanox NIC
   # export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9

.. tip::

   Alternatively, to choose the RDMA interfaces automatically, you can use the
   following. This command sorts the RDMA interfaces and selects the first
   eight.

   .. code-block:: bash

      export NCCL_IB_HCA=$(ibv_devices | awk 'NR>2 {print $1}' | sort | head -n 8 | paste -sd,)

Global ID index
---------------

Update the global ID index if you're using RoCE.

.. code-block:: bash

   export NCCL_IB_GID_INDEX=3

.. _multi-node-setup-training-examples:

Multi-node training examples
============================

The following examples use the Slurm workload manager to launch jobs on
multiple nodes. To run these scripts as-is, you must have a Slurm environment
configured. The scripts are designed to work with both Broadcom Thor 2 and
Mellanox NICs by automatically installing the required libraries and setting
the necessary environment variables. For systems with Broadcom NICs, the
scripts assume the host's RoCE library is located in the ``/opt`` directory.

The following benchmarking examples demonstrate training a Llama 3 8B model
across multiple 8-GPU nodes, using fully sharded data parallelism (FSDP) for
intra-node parallelism and data parallelism (DP) for inter-node parallelism.

.. _rocm-for-ai-multi-node-setup-jax-train-example:

JAX MaxText
-----------

1. Download the desired multi-node benchmarking script from ``__.

   .. code-block:: shell

      wget https://raw.githubusercontent.com/ROCm/MAD/refs/heads/develop/scripts/jax-maxtext/gpu-rocm/llama3_8b_multinode.sh

   Or clone the ``__ repository.

   .. code-block:: shell

      git clone https://github.com/ROCm/MAD
      cd MAD/scripts/jax-maxtext/gpu-rocm

2. Run the benchmark for multi-node training, where ``<num_nodes>`` is the
   number of nodes to train on.

   .. code-block:: shell

      sbatch -N <num_nodes> llama3_8b_multinode.sh

.. _rocm-for-ai-multi-node-setup-pyt-train-example:

PyTorch training
----------------

.. note::

   The ROCm PyTorch Training Docker image now focuses on :doc:`Training a model
   with Primus and PyTorch <../training/benchmark-docker/primus-pytorch>`. The
   following example refers to the legacy workflow :ref:`Training a
   model with PyTorch `.

1. Download the ``run_multinode_train.sh`` benchmarking script from ``__.

   .. code-block:: shell

      wget https://raw.githubusercontent.com/ROCm/MAD/refs/heads/develop/scripts/pytorch_train/run_multinode_train.sh

   Or clone the ``__ repository.

   .. code-block:: shell

      git clone https://github.com/ROCm/MAD
      cd MAD/scripts/pytorch_train

2. Run the benchmark for multi-node training, where ``<num_nodes>`` is the
   number of nodes to train on.

   .. code-block:: shell

      sbatch -N <num_nodes> run_multinode_train.sh

.. seealso::

   See :ref:`Training a model with PyTorch ` for more examples and information.

Megatron-LM
-----------

.. note::

   The Megatron-LM Docker image now focuses on :ref:`Training a model with
   Primus and Megatron `. The
   following example refers to the legacy Megatron-LM :ref:`Training a model
   with Megatron-LM ` and might have
   limited support.

1. Download the ``train_llama_slurm.sh`` benchmarking script from
   ``__.

2. Set the network interface parameters according to the guidelines above, then run the script.

   .. 
code-block:: shell

      cd <path/to/Megatron-LM>
      export NETWORK_INTERFACE=$NCCL_SOCKET_IFNAME
      export NCCL_IB_HCA=$NCCL_IB_HCA
      export IMAGE=docker.io/rocm/megatron-lm:latest  # or your preferred image
      export DATA_CACHE_PATH=/nfs/mounted/repo

      sbatch -N <num_nodes> examples/llama/train_llama_slurm.sh

3. For example, to run a Llama 3 8B workload in BF16 precision, use the following command.

   .. code-block:: shell

      MODEL_NAME=llama3 sbatch -N 8 examples/llama/train_llama_slurm.sh 8 2 128 8192 0 0
      # Other parameters, such as TP and the FP8 datatype, can be adjusted in the script.

Further reading
===============

* `Multi-node network configuration for AMD Instinct accelerators `__

* `Ethernet networking guide for AMD Instinct MI300X GPU clusters: Compiling Broadcom NIC software from source `__
diff --git a/docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst b/docs/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.rst
similarity index 99%
rename from docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst
rename to docs/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.rst
index 68ce4e493..60aedecfe 100644
--- a/docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.rst
@@ -1,5 +1,3 @@
-:orphan:
-
 .. meta::
    :description: Prerequisite system validation before using ROCm for AI.
    :keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax
diff --git a/docs/how-to/rocm-for-ai/system-health-check.rst b/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
similarity index 96%
rename from docs/how-to/rocm-for-ai/system-health-check.rst
rename to docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
index 2014556bc..79563b61f 100644
--- a/docs/how-to/rocm-for-ai/system-health-check.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
@@ -1,12 +1,14 @@
+:orphan:
+
 .. meta::
    :description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
    :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference

 .. _rocm-for-ai-system-health-bench:

-************************
-System health benchmarks
-************************
+*****************************************
+System health benchmarks for AI workloads
+*****************************************

 Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).
@@ -62,7 +64,7 @@ RCCL tests

 The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU communication. The
 ``__ suite benchmarks the performance and verifies the correctness of these collective operations.
-This helps ensure optimal scaling for multi-accelerator tasks.
+This helps ensure optimal scaling for multi-GPU tasks.

 1. 
To get started, build RCCL-tests using the official instructions in the README at ``__ or use the diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst index 49d8acdc0..eb56f4dce 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst @@ -10,10 +10,10 @@ MaxText is a high-performance, open-source framework built on the Google JAX machine learning library to train LLMs at scale. The MaxText framework for ROCm is an optimized fork of the upstream ``__ enabling efficient AI workloads -on AMD MI300X series accelerators. +on AMD MI300X series GPUs. The MaxText for ROCm training Docker image -provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators, +provides a prebuilt environment for training on AMD Instinct MI300X and MI325X GPUs, including essential components like JAX, XLA, ROCm libraries, and MaxText utilities. It includes the following software components: @@ -69,7 +69,7 @@ Supported models ================ The following models are pre-optimized for performance on AMD Instinct MI300 -series accelerators. Some instructions, commands, and available training +series GPUs. Some instructions, commands, and available training configurations in this documentation might vary by model -- select one to get started. @@ -134,85 +134,11 @@ doesn’t validate configurations and run conditions outside those described. .. _amd-maxtext-multi-node-setup-v257: -Multi-node setup ----------------- +Multi-node configuration +------------------------ -For multi-node environments, ensure you have all the necessary packages for -your network device, such as, RDMA. If you're not using a multi-node setup -with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`. - -1. Install the following packages to build and install the RDMA driver. - - .. code-block:: shell - - sudo apt install iproute2 -y - sudo apt install -y linux-headers-"$(uname-r)" libelf-dev - sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev - - Refer to your NIC manufacturer's documentation for further steps on - compiling and installing the RoCE driver. For example, for Broadcom, - see `Compiling Broadcom NIC software from source `_ - in `Ethernet networking guide for AMD Instinct MI300X GPU clusters `_. - -2. Set the following environment variables. - - a. Master address - - Change ``localhost`` to the master node's resolvable hostname or IP address: - - .. code-block:: bash - - export MASTER_ADDR="${MASTER_ADDR:-localhost}" - - b. Number of nodes - - Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``): - - .. code-block:: bash - - export NNODES="${NNODES:-1}" - - c. Node ranks - - Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on) - Node ranks should be unique across all nodes in the cluster. - - .. code-block:: bash - - export NODE_RANK="${NODE_RANK:-0}" - - d. Network interface - - Update the network interface in the script to match your system's network interface. To - find your network interface, run the following (outside of any Docker container): - - .. 
code-block:: bash - - ip a - - Look for an active interface with an IP address in the same subnet as - your other nodes. Then, update the following variable in the script, for - example: - - .. code-block:: bash - - export NCCL_SOCKET_IFNAME=ens50f0np0 - - This variable specifies which network interface to use for inter-node communication. - Setting this variable to the incorrect interface can result in communication failures - or significantly reduced performance. - - e. RDMA interface - - Ensure the :ref:`required packages ` are installed on all nodes. - Then, set the RDMA interfaces to use for communication. - - .. code-block:: bash - - # If using Broadcom NIC - export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 - # If using Mellanox NIC - export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9 +See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your +environment for multi-node training. .. _amd-maxtext-get-started-v257: @@ -399,7 +325,7 @@ Further reading - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. - To learn more about system settings and management practices to configure your system for - AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. + AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization `_. - For a list of other ready-made Docker images for AI with ROCm, see `AMD Infinity Hub `_. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst index 5a2f610d4..ebd55be17 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst @@ -10,20 +10,20 @@ Training a model with Megatron-LM on ROCm .. caution:: - Primus with Megatron supersedes this ROCm Megatron-LM training workflow. + Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow. To learn how to migrate workloads from Megatron-LM to Primus with Megatron, see :doc:`previous-versions/megatron-lm-primus-migration-guide`. The `Megatron-LM framework for ROCm `_ is a specialized fork of the robust Megatron-LM, designed to enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD -Instinct™ MI300X series accelerators, Megatron-LM delivers enhanced +Instinct™ MI300X series GPUs, Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI workloads. It is purpose-built to support models like Llama, DeepSeek, and Mixtral, enabling developers to train next-generation AI models more efficiently. -AMD provides ready-to-use Docker images for MI300X series accelerators containing +AMD provides ready-to-use Docker images for MI300X series GPUs containing essential components, including PyTorch, ROCm libraries, and Megatron-LM utilities. It contains the following software components to accelerate training workloads: @@ -61,7 +61,7 @@ workloads: ================ The following models are supported for training performance benchmarking with Megatron-LM and ROCm - on AMD Instinct MI300X series accelerators. + on AMD Instinct MI300X series GPUs. Some instructions, commands, and training recommendations in this documentation might vary by model -- select one to get started. @@ -115,7 +115,7 @@ popular AI models. 
The performance data presented in `Performance results with AMD ROCm software `__ only reflects the latest version of this training benchmarking environment. - The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software. System validation ================= @@ -138,11 +138,11 @@ Environment setup ================= Use the following instructions to set up the environment, configure the script to train models, and -reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker +reproduce the benchmark results on MI300X series GPUs with the AMD Megatron-LM Docker image. .. _amd-megatron-lm-requirements: - + Download the Docker image ------------------------- @@ -152,7 +152,7 @@ Download the Docker image 1. Use the following command to pull the Docker image from Docker Hub. {% if dockers|length > 1 %} - .. tab-set:: + .. tab-set:: {% for docker in data.dockers %} .. tab-item:: {{ docker.doc_name }} @@ -281,25 +281,11 @@ Configuration See :ref:`Key options ` for more information on configuration options. -Network interface ------------------ +Multi-node configuration +------------------------ -Update the network interface in the script to match your system's network interface. To -find your network interface, run the following (outside of any Docker container): - -.. code-block:: bash - - ip a - -Look for an active interface that has an IP address in the same subnet as -your other nodes. Then, update the following variables in the script, for -example: - -.. code-block:: bash - - export NCCL_SOCKET_IFNAME=ens50f0np0 - - export GLOO_SOCKET_IFNAME=ens50f0np0 +Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node +training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands. .. _amd-megatron-lm-tokenizer: @@ -540,46 +526,6 @@ Download the dataset Ensure that the files are accessible inside the Docker container. -Multi-node configuration ------------------------- - -If you're running multi-node training, update the following environment variables. They can -also be passed as command line arguments. Refer to the following example configurations. - -* Change ``localhost`` to the master node's hostname: - - .. code-block:: shell - - MASTER_ADDR="${MASTER_ADDR:-localhost}" - -* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``): - - .. code-block:: shell - - NNODES="${NNODES:-1}" - -* Set the rank of each node (0 for master, 1 for the first worker node, and so on): - - .. code-block:: shell - - NODE_RANK="${NODE_RANK:-0}" - -* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an - NFS directory) for multi-node runs: - - .. code-block:: shell - - DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs - -* For multi-node runs, make sure the correct network drivers are installed on the nodes. If - inside a Docker container, either install the drivers inside the Docker container or pass the network - drivers from the host while creating the Docker container. - - .. code-block:: shell - - # Specify which RDMA interfaces to use for communication - export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 - .. 
_amd-megatron-lm-run-training: Run training @@ -587,7 +533,7 @@ Run training Use the following example commands to set up the environment, configure :ref:`key options `, and run training on -MI300X series accelerators with the AMD Megatron-LM environment. +MI300X series GPUs with the AMD Megatron-LM environment. Single node training -------------------- @@ -612,7 +558,7 @@ Single node training FSDP=1 \ MODEL_SIZE=70 \ TOTAL_ITERS=50 \ - bash examples/llama/train_llama3.sh + bash examples/llama/train_llama3.sh .. note:: @@ -770,7 +716,7 @@ Single node training .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy - To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, navigate to the Megatron-LM folder and use the following command. .. code-block:: shell @@ -925,6 +871,8 @@ Single node training RECOMPUTE_ACTIVATIONS=full \ CKPT_FORMAT=torch_dist +.. _amd-megatron-lm-multi-node-examples: + Multi-node training examples ---------------------------- diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst index 2c4e03e51..853d24395 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst @@ -8,16 +8,16 @@ Training a model with Primus and Megatron-LM `Primus `__ is a unified and flexible LLM training framework designed to streamline training. It streamlines LLM -training on AMD Instinct accelerators using a modular, reproducible configuration paradigm. +training on AMD Instinct GPUs using a modular, reproducible configuration paradigm. Primus is backend-agnostic and supports multiple training engines -- including Megatron. .. note:: - Primus with Megatron supersedes the :doc:`ROCm Megatron-LM training ` workflow. + Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM training ` workflow. To learn how to migrate workloads from Megatron-LM to Primus with Megatron, see :doc:`previous-versions/megatron-lm-primus-migration-guide`. -For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators +For ease of use, AMD provides a ready-to-use Docker image for MI300 series GPUs containing essential components for Primus and Megatron-LM. This Docker is powered by Primus Turbo optimizations for performance; this release adds support for Primus Turbo with optimized attention and grouped GEMM kernels. @@ -47,7 +47,7 @@ with optimized attention and grouped GEMM kernels. Supported models ================ -The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. +The following models are pre-optimized for performance on AMD Instinct MI300X series GPUs. Some instructions, commands, and training examples in this documentation might vary by model -- select one to get started. @@ -114,7 +114,7 @@ system's configuration. ================= Use the following instructions to set up the environment, configure the script to train models, and - reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image. + reproduce the benchmark results on MI300X series GPUs with the ``{{ docker.pull_tag }}`` image. .. 
_amd-primus-megatron-lm-requirements: @@ -229,7 +229,7 @@ Run training Use the following example commands to set up the environment, configure :ref:`key options `, and run training on -MI300X series accelerators with the AMD Megatron-LM environment. +MI300X series GPUs with the AMD Megatron-LM environment. Single node training -------------------- @@ -341,7 +341,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ - bash ./examples/run_pretrain.sh --train_iters 50 + bash ./examples/run_pretrain.sh --train_iters 50 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy @@ -349,7 +349,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the The following run commands are tailored to DeepSeek-V3. See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model. - To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, use the following command: .. code-block:: shell @@ -445,9 +445,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \ bash examples/run_pretrain.sh --train_iters 50 +.. _amd-primus-megatron-multi-node-examples: + Multi-node training examples ---------------------------- +Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node +training. + To run training on multiple nodes, you can use the `run_slurm_pretrain.sh `__ to launch the multi-node workload. Use the following steps to setup your environment: @@ -505,7 +510,7 @@ to launch the multi-node workload. Use the following steps to setup your environ .. code-block:: shell - # Adjust the training parameters. For e.g., `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case + # Adjust the training parameters. For e.g., `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ bash ./examples/run_slurm_pretrain.sh \ --global_batch_size 1024 \ @@ -540,7 +545,7 @@ to launch the multi-node workload. Use the following steps to setup your environ .. code-block:: shell - # Adjust the training parameters. For e.g., `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case + # Adjust the training parameters. For e.g., `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid .. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b @@ -639,7 +644,7 @@ Further reading Framework for Large Models on AMD GPUs `__. - To learn more about system settings and management practices to configure your system for - AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. + AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization `_. - For a list of other ready-made Docker images for AI with ROCm, see `AMD Infinity Hub `_. 
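Before launching the multi-node Primus runs described above, it can save time
to first validate inter-node RCCL connectivity from the same Slurm allocation.
The following is a minimal sketch, not part of the Primus workflow itself; it
assumes rccl-tests was built with MPI support in a local ``./build`` directory
and that each node exposes eight GPUs.

.. code-block:: shell

   # One task per GPU across two nodes; sweep message sizes from 8 B to 8 GB.
   srun --nodes=2 --ntasks-per-node=8 \
       ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1

If the reported bus bandwidth is far below the expected line rate of your
NICs, revisit the ``NCCL_SOCKET_IFNAME`` and ``NCCL_IB_HCA`` settings from the
multi-node setup page before starting a long training job.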
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst index 51f1ce57e..b2bd5fb87 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst @@ -8,12 +8,12 @@ Training a model with Primus and PyTorch `Primus `__ is a unified and flexible LLM training framework designed to streamline training. It streamlines LLM -training on AMD Instinct accelerators using a modular, reproducible configuration paradigm. +training on AMD Instinct GPUs using a modular, reproducible configuration paradigm. Primus now supports the PyTorch torchtitan backend. .. note:: - Primus with the PyTorch torchtitan backend is intended to supersede the :doc:`ROCm PyTorch training ` workflow. + Primus with the PyTorch torchtitan backend is designed to replace the :doc:`ROCm PyTorch training ` workflow. See :doc:`pytorch-training` to see steps to run workloads without Primus. .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml @@ -21,7 +21,7 @@ Primus now supports the PyTorch torchtitan backend. {% set dockers = data.dockers %} {% set docker = dockers[0] %} For ease of use, AMD provides a ready-to-use Docker image -- ``{{ - docker.pull_tag }}`` -- for MI300X series accelerators containing essential + docker.pull_tag }}`` -- for MI300X series GPUs containing essential components for Primus and PyTorch training with Primus Turbo optimizations. @@ -41,7 +41,7 @@ Primus now supports the PyTorch torchtitan backend. Supported models ================ -The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators. +The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs. Some instructions, commands, and training recommendations in this documentation might vary by model -- select one to get started. @@ -293,7 +293,7 @@ Further reading - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. - To learn more about system settings and management practices to configure your system for - AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. + AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization `_. - For a list of other ready-made Docker images for AI with ROCm, see `AMD Infinity Hub `_. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst index 59e86c4f9..88842418e 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst @@ -10,7 +10,7 @@ Training a model with PyTorch on ROCm .. note:: - Primus with the PyTorch torchtitan backend is intended to supersede the :doc:`ROCm PyTorch training ` workflow. + Primus with the PyTorch torchtitan backend is designed to replace :doc:`ROCm PyTorch training ` workflow. See :doc:`primus-pytorch` for details. PyTorch is an open-source machine learning framework that is widely used for @@ -22,7 +22,7 @@ model training with GPU-optimized components for transformer-based models. 
{% set docker = dockers[0] %} The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__ (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a - model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate + model on AMD Instinct MI325X and MI300X GPUs. It includes the following software components to accelerate training workloads: .. list-table:: @@ -41,7 +41,7 @@ model training with GPU-optimized components for transformer-based models. Supported models ================ -The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators. +The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs. Some instructions, commands, and training recommendations in this documentation might vary by model -- select one to get started. @@ -126,7 +126,7 @@ popular AI models. The performance data presented in `Performance results with AMD ROCm software `_ should not be interpreted as the peak performance achievable by AMD - Instinct MI325X and MI300X accelerators or ROCm software. + Instinct MI325X and MI300X GPUs or ROCm software. System validation ================= @@ -521,9 +521,14 @@ Run training For examples of benchmarking commands, see ``__. +.. _amd-pytorch-training-multinode-examples: + Multi-node training ------------------- +Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node +training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands. + Pre-training ~~~~~~~~~~~~ @@ -571,7 +576,7 @@ Further reading - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. - To learn more about system settings and management practices to configure your system for - AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_. + AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization `_. - For a list of other ready-made Docker images for AI with ROCm, see `AMD Infinity Hub `_. 
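The multi-node environment variables introduced on the multi-node setup page
(``MASTER_ADDR``, ``NNODES``, and ``NODE_RANK``) map directly onto a
``torchrun`` launch. The following is a minimal sketch of that mapping only;
``train.py`` is a placeholder for your training entry point, and the
rendezvous port is an arbitrary choice.

.. code-block:: shell

   # Run the same command on every node; only NODE_RANK differs per node.
   torchrun \
       --nnodes="$NNODES" \
       --node_rank="$NODE_RANK" \
       --nproc_per_node=8 \
       --master_addr="$MASTER_ADDR" \
       --master_port=29500 \
       train.py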
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index d6af88b74..92f0534f9 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -60,8 +60,15 @@ subtrees: - entries: - file: how-to/rocm-for-ai/install.rst title: Installation - - file: how-to/rocm-for-ai/system-health-check.rst - title: System health benchmarks + - file: how-to/rocm-for-ai/system-setup/index.rst + title: System setup + entries: + - file: how-to/rocm-for-ai/system-setup/prerequisite-system-validation.rst + title: System validation + - file: how-to/rocm-for-ai/system-setup/multi-node-setup.rst + title: Multi-node setup + - file: how-to/rocm-for-ai/system-setup/system-health-check.rst + title: System health benchmarks - file: how-to/rocm-for-ai/training/index.rst title: Training subtrees: From 0ea5216ace626eccee5c02dbe5b1880e5d11509d Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Wed, 1 Oct 2025 13:17:50 -0400 Subject: [PATCH 03/24] docs: update article_info in conf.py (#5454) --- docs/conf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 587bb46ba..78d50d502 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -114,7 +114,10 @@ article_pages = [ {"file": "how-to/rocm-for-ai/index", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/install", "os": ["linux"]}, - {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/system-setup/index", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/system-setup/multi-node-setup", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/system-setup/prerequisite-system-validation", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/system-setup/system-health-check", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]}, From 52979e2fdbdb02340bf4d8c70c82a96cac80e107 Mon Sep 17 00:00:00 2001 From: amd-hsivasun Date: Thu, 25 Sep 2025 22:01:37 +0000 Subject: [PATCH 04/24] [Ex CI] Updated testDir for rp-systems tests --- .azuredevops/components/rocprofiler-systems.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml index 0840da028..471ddec00 100644 --- a/.azuredevops/components/rocprofiler-systems.yml +++ b/.azuredevops/components/rocprofiler-systems.yml @@ -250,6 +250,7 @@ jobs: - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml parameters: componentName: ${{ parameters.componentName }} + testDir: $(Agent.BuildDirectory)/s/build - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml parameters: gpuTarget: ${{ job.target }} From b2e3bc8565eb2e7a48307ac492df3dbe56b20305 Mon Sep 17 00:00:00 2001 From: amd-hsivasun Date: Thu, 25 Sep 2025 22:52:31 +0000 Subject: [PATCH 05/24] [Ex CI] Updated rp-systems CMakeBuildDir --- .azuredevops/components/rocprofiler-systems.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml index 471ddec00..20f51020b 100644 --- a/.azuredevops/components/rocprofiler-systems.yml +++ b/.azuredevops/components/rocprofiler-systems.yml @@ -226,6 +226,8 @@ jobs: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin" - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml parameters: + cmakeBuildDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems/build + cmakeSourceDir: 
$(Agent.BuildDirectory)/s/projects/rocprofiler-systems # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html extraBuildFlags: >- -DROCPROFSYS_BUILD_TESTING=ON From 6f7f73ac0b9be50c8fc4883933137ead1143c658 Mon Sep 17 00:00:00 2001 From: amd-hsivasun Date: Fri, 26 Sep 2025 00:58:01 +0000 Subject: [PATCH 06/24] Update workingDirectories --- .azuredevops/components/rocprofiler-systems.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml index 20f51020b..7e1c2b24b 100644 --- a/.azuredevops/components/rocprofiler-systems.yml +++ b/.azuredevops/components/rocprofiler-systems.yml @@ -226,7 +226,7 @@ jobs: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin" - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml parameters: - cmakeBuildDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems/build + cmakeBuildDir: $(Agent.BuildDirectory)/s/build cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html extraBuildFlags: >- @@ -247,8 +247,8 @@ jobs: displayName: Set up rocprofiler-systems env inputs: targetType: inline - script: source share/rocprofiler-systems/setup-env.sh - workingDirectory: build + script: source $(Agent.BuildDirectory)/s/setup-env.sh + workingDirectory: $(Agent.BuildDirectory)/s/build - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml parameters: componentName: ${{ parameters.componentName }} From 58790154b24d1b98f586972b9c08c8b87f4a945c Mon Sep 17 00:00:00 2001 From: amd-hsivasun Date: Fri, 26 Sep 2025 01:45:34 +0000 Subject: [PATCH 07/24] Add a script to look for setup-env.sh --- .azuredevops/components/rocprofiler-systems.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml index 7e1c2b24b..caadb321b 100644 --- a/.azuredevops/components/rocprofiler-systems.yml +++ b/.azuredevops/components/rocprofiler-systems.yml @@ -243,6 +243,12 @@ jobs: -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg -DGPU_TARGETS=${{ job.target }} -GNinja + - task: Bash@3 + displayName: Find setup-env.sh location + inputs: + targetType: inline + script: find $(Agent.BuildDirectory)/s -name setup-env.sh + workingDirectory: $(Agent.BuildDirectory)/s - task: Bash@3 displayName: Set up rocprofiler-systems env inputs: From 41b52986599ba1496b99d2a3cd76b72425207ba3 Mon Sep 17 00:00:00 2001 From: amd-hsivasun Date: Fri, 26 Sep 2025 02:25:19 +0000 Subject: [PATCH 08/24] Added a list for all rp-systems files --- .azuredevops/components/rocprofiler-systems.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml index caadb321b..a784453ec 100644 --- a/.azuredevops/components/rocprofiler-systems.yml +++ b/.azuredevops/components/rocprofiler-systems.yml @@ -226,7 +226,7 @@ jobs: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin" - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml parameters: - cmakeBuildDir: $(Agent.BuildDirectory)/s/build + cmakeBuildDir: $(Agent.BuildDirectory)/s/rocm cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems # build flags reference: 
From 41b52986599ba1496b99d2a3cd76b72425207ba3 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 02:25:19 +0000
Subject: [PATCH 08/24] Added a list for all rp-systems files

---
 .azuredevops/components/rocprofiler-systems.yml | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index caadb321b..a784453ec 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -226,7 +226,7 @@ jobs:
             echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
     parameters:
-      cmakeBuildDir: $(Agent.BuildDirectory)/s/build
+      cmakeBuildDir: $(Agent.BuildDirectory)/s/rocm
       cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems
       # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
       extraBuildFlags: >-
@@ -249,12 +249,18 @@ jobs:
       targetType: inline
       script: find $(Agent.BuildDirectory)/s -name setup-env.sh
       workingDirectory: $(Agent.BuildDirectory)/s
+  - task: Bash@3
+    displayName: List all files under rocprofiler-systems
+    inputs:
+      targetType: inline
+      script: ls -lR $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems
+      workingDirectory: $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems
   - task: Bash@3
     displayName: Set up rocprofiler-systems env
     inputs:
       targetType: inline
-      script: source $(Agent.BuildDirectory)/s/setup-env.sh
-      workingDirectory: $(Agent.BuildDirectory)/s/build
+      script: source $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems/setup-env.sh
+      workingDirectory: $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}

From e31841312bf8110a85af3b8a42dba74412e2862d Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 02:26:11 +0000
Subject: [PATCH 09/24] Update testDir

---
 .azuredevops/components/rocprofiler-systems.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index a784453ec..4eda1f351 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -264,7 +264,7 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/s/build
+      testDir: $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}

From ee93101541a520f6b0f103a7aa092affb8083f5e Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 02:59:45 +0000
Subject: [PATCH 10/24] Change list files

---
 .azuredevops/components/rocprofiler-systems.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index 4eda1f351..bb152c54b 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -253,8 +253,8 @@ jobs:
     displayName: List all files under rocprofiler-systems
     inputs:
       targetType: inline
-      script: ls -lR $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems
-      workingDirectory: $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems
+      script: ls -lR $(Agent.BuildDirectory)/s
+      workingDirectory: $(Agent.BuildDirectory)/s
   - task: Bash@3
     displayName: Set up rocprofiler-systems env
     inputs:

From 945fb286f71c54de20939d2a20ed16b5e9db578b Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 03:01:37 +0000
Subject: [PATCH 11/24] Find tests Task

---
 .azuredevops/components/rocprofiler-systems.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index bb152c54b..4b81ff128 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -255,6 +255,12 @@ jobs:
       targetType: inline
       script: ls -lR $(Agent.BuildDirectory)/s
       workingDirectory: $(Agent.BuildDirectory)/s
+  - task: Bash@3
+    displayName: Find tests
+    inputs:
+      targetType: inline
+      script: find $(Agent.BuildDirectory)/s -name '*test*'
+      workingDirectory: $(Agent.BuildDirectory)/s
   - task: Bash@3
     displayName: Set up rocprofiler-systems env
     inputs:

From 024cb4db7643cacbe255c7d3953f61b489622a57 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 03:58:47 +0000
Subject: [PATCH 12/24] Added testDir

---
 .azuredevops/components/rocprofiler-systems.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index 4b81ff128..4080595aa 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -270,7 +270,7 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems
+      testDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems/tests
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}

From 774cb7a1b380c93f685e74c3eafb6b1e1de0f080 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 04:50:27 +0000
Subject: [PATCH 13/24] Changed testDir

---
 .azuredevops/components/rocprofiler-systems.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index 4080595aa..462f5f060 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -270,7 +270,7 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems/tests
+      testDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}
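Patches 08 through 13 iterate on two unknowns: where the install tree lands, and where CTest can find the tests. The individual ls and find tasks can be collapsed into a single local diagnostic; a sketch, with $BUILD_ROOT standing in for $(Agent.BuildDirectory) (assumption):

    BUILD_ROOT="$HOME/agent"                      # stand-in for $(Agent.BuildDirectory)

    find "$BUILD_ROOT/s" -name setup-env.sh       # env script emitted by the install step
    find "$BUILD_ROOT/s" -name '*test*' -type d   # candidate test directories
    ls -lR "$BUILD_ROOT/s" | head -n 200          # coarse layout overview, truncated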
From 189c269350d597fbeba9ceaf22460742d1e3ee06 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 16:06:20 +0000
Subject: [PATCH 14/24] Added Debug

---
 .azuredevops/components/rocprofiler-systems.yml | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index 462f5f060..adccc725d 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -226,10 +226,11 @@ jobs:
             echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
     parameters:
-      cmakeBuildDir: $(Agent.BuildDirectory)/s/rocm
       cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems
       # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
       extraBuildFlags: >-
+        -DCMAKE_INSTALL_PREFIX=$(Agent.BuildDirectory)/rocprofiler-systems
+        -DROCPROFSYS_USE_PYTHON=ON
         -DROCPROFSYS_BUILD_TESTING=ON
         -DROCPROFSYS_BUILD_DYNINST=ON
         -DROCPROFSYS_BUILD_LIBUNWIND=ON
@@ -253,24 +254,30 @@ jobs:
     displayName: List all files under rocprofiler-systems
     inputs:
       targetType: inline
-      script: ls -lR $(Agent.BuildDirectory)/s
-      workingDirectory: $(Agent.BuildDirectory)/s
+      script: ls -lR $(Agent.BuildDirectory)/rocprofiler-systems
+      workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems
   - task: Bash@3
     displayName: Find tests
     inputs:
       targetType: inline
       script: find $(Agent.BuildDirectory)/s -name '*test*'
       workingDirectory: $(Agent.BuildDirectory)/s
+  - task: Bash@3
+    displayName: Find tests under rocprofiler-systems
+    inputs:
+      targetType: inline
+      script: find $(Agent.BuildDirectory)/rocprofiler-systems -name '*test*'
+      workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems
   - task: Bash@3
     displayName: Set up rocprofiler-systems env
     inputs:
       targetType: inline
-      script: source $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems/setup-env.sh
-      workingDirectory: $(Agent.BuildDirectory)/s/rocm/share/rocprofiler-systems
+      script: source $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems/setup-env.sh
+      workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems
+      testDir: $(Agent.BuildDirectory)/rocprofiler-systems
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}

From d21ec9eea5ccf9ada0961d1521887be7219a8f46 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 16:58:26 +0000
Subject: [PATCH 15/24] Updated testDir

---
 .azuredevops/components/rocprofiler-systems.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index adccc725d..c1d8aa86b 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -277,7 +277,7 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/rocprofiler-systems
+      testDir: $(Agent.BuildDirectory)/s/rocm/tests
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}

From aca31170c45ba0802fa171d0e1c4402f6fd39ad7 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 17:53:52 +0000
Subject: [PATCH 16/24] Update setupenv

---
 .azuredevops/components/rocprofiler-systems.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index c1d8aa86b..73aa638e6 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -272,12 +272,12 @@ jobs:
     displayName: Set up rocprofiler-systems env
     inputs:
       targetType: inline
-      script: source $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems/setup-env.sh
-      workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems
+      script: source $(Agent.BuildDirectory)/build/share/rocprofiler-systems/setup-env.sh
+      workingDirectory: $(Agent.BuildDirectory)/build/share/rocprofiler-systems
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/s/rocm/tests
+      testDir: $(Agent.BuildDirectory)/rocprofiler-systems
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}

From 0894547f5a1cb3ad8223aa54050272162f409d59 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 19:14:09 +0000
Subject: [PATCH 17/24] Update setupenv

---
 .azuredevops/components/rocprofiler-systems.yml | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index 73aa638e6..9d58cd8bf 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -262,22 +262,16 @@ jobs:
       targetType: inline
       script: find $(Agent.BuildDirectory)/s -name '*test*'
       workingDirectory: $(Agent.BuildDirectory)/s
-  - task: Bash@3
-    displayName: Find tests under rocprofiler-systems
-    inputs:
-      targetType: inline
-      script: find $(Agent.BuildDirectory)/rocprofiler-systems -name '*test*'
-      workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems
   - task: Bash@3
     displayName: Set up rocprofiler-systems env
     inputs:
       targetType: inline
-      script: source $(Agent.BuildDirectory)/build/share/rocprofiler-systems/setup-env.sh
-      workingDirectory: $(Agent.BuildDirectory)/build/share/rocprofiler-systems
+      script: source $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems/setup-env.sh
+      workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/rocprofiler-systems
+      testDir: $(Agent.BuildDirectory)/rocprofiler-systems/bin
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}

From 38e659e5f01a2f0f6c4b1c45c3af9ef3bddfb145 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Fri, 26 Sep 2025 19:47:51 +0000
Subject: [PATCH 18/24] Update testDir

---
 .azuredevops/components/rocprofiler-systems.yml | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index 9d58cd8bf..d2e2dbe83 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -244,18 +244,12 @@ jobs:
         -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
         -DGPU_TARGETS=${{ job.target }}
         -GNinja
-  - task: Bash@3
-    displayName: Find setup-env.sh location
-    inputs:
-      targetType: inline
-      script: find $(Agent.BuildDirectory)/s -name setup-env.sh
-      workingDirectory: $(Agent.BuildDirectory)/s
   - task: Bash@3
     displayName: List all files under rocprofiler-systems
     inputs:
       targetType: inline
-      script: ls -lR $(Agent.BuildDirectory)/rocprofiler-systems
-      workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems
+      script: ls -lR $(Agent.BuildDirectory)/s
+      workingDirectory: $(Agent.BuildDirectory)/s
   - task: Bash@3
     displayName: Find tests
     inputs:
@@ -271,7 +265,7 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/rocprofiler-systems/bin
+      testDir: $(Agent.BuildDirectory)/rocprofiler-systems/
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}

From 6b8b359d03255da2be3473d9943df64f50537c97 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Tue, 30 Sep 2025 22:41:47 +0000
Subject: [PATCH 19/24] Updated test dir to s/build/tests

---
 .azuredevops/components/rocprofiler-systems.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index d2e2dbe83..c84bb6bf5 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -265,7 +265,8 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: ${{ parameters.componentName }}
-      testDir: $(Agent.BuildDirectory)/rocprofiler-systems/
+      testDir: $(Agent.BuildDirectory)/s/build/tests/
+      testParameters: '--output-on-failure'
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
     parameters:
       gpuTarget: ${{ job.target }}
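Patch 19 settles on $(Agent.BuildDirectory)/s/build/tests/ and forwards --output-on-failure, which is a CTest option. Assuming the shared test.yml template ends up invoking ctest from testDir with testParameters appended, the local equivalent is:

    cd build/tests                # the directory testDir now points at, relative to the source root
    ctest --output-on-failure     # print full output only for tests that fail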
From 5e6b66ca397434f1034e707f66907be4231070c9 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Wed, 1 Oct 2025 17:09:53 +0000
Subject: [PATCH 20/24] Remove tasks to locate test dir

---
 .azuredevops/components/rocprofiler-systems.yml | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-systems.yml b/.azuredevops/components/rocprofiler-systems.yml
index c84bb6bf5..12231af71 100644
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -244,18 +244,6 @@ jobs:
         -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
         -DGPU_TARGETS=${{ job.target }}
         -GNinja
-  - task: Bash@3
-    displayName: List all files under rocprofiler-systems
-    inputs:
-      targetType: inline
-      script: ls -lR $(Agent.BuildDirectory)/s
-      workingDirectory: $(Agent.BuildDirectory)/s
-  - task: Bash@3
-    displayName: Find tests
-    inputs:
-      targetType: inline
-      script: find $(Agent.BuildDirectory)/s -name '*test*'
-      workingDirectory: $(Agent.BuildDirectory)/s
   - task: Bash@3
     displayName: Set up rocprofiler-systems env
     inputs:

From f91c2b9b4a92303c3e3d02f2160cb82fd467d4a0 Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Wed, 1 Oct 2025 15:15:30 -0400
Subject: [PATCH 21/24] Update dependencies-rocm.yml

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index 359198dde..620751261 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -227,8 +227,8 @@ parameters:
     developBranch: develop
     hasGpuTarget: true
   rocprofiler-systems:
-    pipelineId: 255
-    developBranch: amd-staging
+    pipelineId: 345
+    developBranch: develop
    hasGpuTarget: true
   rocPyDecode:
     pipelineId: 239

From 93c6d17922f32985df7f77eeb543a1c1e458b3e4 Mon Sep 17 00:00:00 2001
From: anisha-amd
Date: Thu, 2 Oct 2025 13:51:36 -0400
Subject: [PATCH 22/24] Docs: frameworks 25.09 - compatibility - FlashInfer and llama.cpp (#5462)

---
 .wordlist.txt                                 |   3 +
 .../compatibility-matrix-historical-6.0.csv   |   5 +-
 docs/compatibility/compatibility-matrix.rst   |   5 +-
 .../flashinfer-compatibility.rst              | 107 +++++++++++++
 .../llama-cpp-compatibility.rst               | 143 ++++++++++++++++--
 .../megablocks-compatibility.rst              |   2 +-
 .../stanford-megatron-lm-compatibility.rst    |   2 +-
 docs/conf.py                                  |   1 +
 docs/how-to/deep-learning-rocm.rst            |  12 ++
 docs/sphinx/_toc.yml.in                       |   2 +
 10 files changed, 265 insertions(+), 17 deletions(-)
 create mode 100644 docs/compatibility/ml-compatibility/flashinfer-compatibility.rst

diff --git a/.wordlist.txt b/.wordlist.txt
index 70cdba47a..6d0e2d49e 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -147,6 +147,8 @@ Filesystem
 FindDb
 Flang
 FlashAttention
+FlashInfer’s
+FlashInfer
 FluxBenchmark
 Fortran
 Fuyu
@@ -481,6 +483,7 @@ TCI
 TCIU
 TCP
 TCR
+TVM
 THREADGROUPS
 threadgroups
 TensorRT

diff --git a/docs/compatibility/compatibility-matrix-historical-6.0.csv b/docs/compatibility/compatibility-matrix-historical-6.0.csv
index 696ae3b6d..5c2462234 100644
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -38,8 +38,9 @@ ROCm Version,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6
 :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,N/A,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,b6356,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 `ONNX Runtime `_,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,,
 ,,,,,,,,,,,,,,,,,,,

diff --git a/docs/compatibility/compatibility-matrix.rst b/docs/compatibility/compatibility-matrix.rst
index abcf6e05e..ff4c90a1d 100644
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -60,6 +60,7 @@ compatibility and system requirements.
     :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.4.35,0.4.31
     :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,85f95ae
     :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,0.7.0
+    :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,b6356,b6356,N/A
     `ONNX Runtime `_,1.22.0,1.20.0,1.17.3
     ,,,
     THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
@@ -175,6 +176,7 @@ compatibility and system requirements.
 .. [#7700XT-OS] **Prior ROCm 7.0.0** - Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
 .. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
 .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
+.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 7.0.0 and 6.4.x.
 .. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
 .. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix `_.
 .. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
@@ -282,7 +284,8 @@ Expand for full historical view of:
    .. [#megablocks_compat-past-60] Megablocks is only supported on ROCm 6.3.0.
    .. [#taichi_compat-past-60] Taichi is only supported on ROCm 6.3.2.
    .. [#ray_compat-past-60] Ray is only supported on ROCm 6.4.1.
-   .. [#llama-cpp_compat-past-60] llama.cpp is only supported on ROCm 6.4.0.
+   .. [#llama-cpp_compat-past-60] llama.cpp is only supported on ROCm 7.0.0 and 6.4.x.
+   .. [#flashinfer_compat-past-60] FlashInfer is only supported on ROCm 6.4.1.
    .. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
    .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix `_.
    .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.

diff --git a/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
new file mode 100644
index 000000000..45ecc6a75
--- /dev/null
+++ b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
@@ -0,0 +1,107 @@
+:orphan:
+
+.. meta::
+   :description: FlashInfer deep learning framework compatibility
+   :keywords: GPU, LLM, FlashInfer, compatibility
+
+.. version-set:: rocm_version latest
+
+********************************************************************************
+FlashInfer compatibility
+********************************************************************************
+
+`FlashInfer `__ is a library and kernel generator
+for Large Language Models (LLMs) that provides high-performance implementations of
+graphics processing unit (GPU) kernels. FlashInfer focuses on LLM serving and inference,
+delivering high performance across diverse scenarios.
+
+FlashInfer features highly efficient attention kernels, load-balanced scheduling, and memory-optimized
+techniques, while supporting customized attention variants. It is compatible with ``torch.compile`` and
+offers high-performance LLM-specific operators, with easy integration through PyTorch and C++ APIs.
+
+.. note::
+
+   The ROCm port of FlashInfer is under active development, and some features are not yet available.
+   For the latest feature compatibility matrix, refer to the ``README`` of the
+   `https://github.com/ROCm/flashinfer `__ repository.
+
+Support for the ROCm port of FlashInfer is available as follows:
+
+- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer
+  `__ repository. This location differs from the
+  `https://github.com/flashinfer-ai/flashinfer `_
+  upstream repository.
+
+- To install FlashInfer, use the prebuilt :ref:`Docker image `,
+  which includes ROCm, FlashInfer, and all required dependencies.
+
+  - See the :doc:`ROCm FlashInfer installation guide `
+    to install and get started.
+
+  - See the `Installation guide `__
+    in the upstream FlashInfer documentation.
+
+.. note::
+
+   FlashInfer is supported on ROCm 6.4.1.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X
+
+
+.. _flashinfer-recommendations:
+
+Use cases and recommendations
+================================================================================
+
+This release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
+In the decode phase, tokens are generated sequentially, with the model predicting each new
+token based on the previously generated tokens and the input context.
+
+FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense
+attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
+
+Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm
+also implements cascade attention from upstream to reduce memory usage.
+
+For currently supported use cases and recommendations, refer to the `AMD ROCm blog `__,
+where you can search for examples and best practices to optimize your workloads on AMD GPUs.
+
+.. _flashinfer-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+
+AMD validates and publishes `ROCm FlashInfer images `__
+with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
+inventories represent the FlashInfer version from the official Docker Hub.
+The Docker images have been validated for `ROCm 6.4.1 `__.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. list-table::
+   :header-rows: 1
+   :class: docker-image-compatibility
+
+   * - Docker image
+     - ROCm
+     - FlashInfer
+     - PyTorch
+     - Ubuntu
+     - Python
+
+   * - .. raw:: html
+
+          rocm/flashinfer
+     - `6.4.1 `__
+     - `v0.2.5 `__
+     - `2.7.1 `__
+     - 24.04
+     - `3.12 `__
+
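The new FlashInfer page routes installation through the prebuilt rocm/flashinfer image. A pull-and-smoke-test sketch, using the device mappings that are standard for Instinct containers; the tag is a placeholder (take the validated one from the Docker Hub page the table links to), and the final command assumes the image exposes the flashinfer Python package:

    IMAGE=rocm/flashinfer:<tag-from-docker-hub>   # placeholder tag, not a real release tag

    docker pull "$IMAGE"
    docker run -it --rm \
      --device=/dev/kfd --device=/dev/dri \
      --group-add video --ipc=host \
      --security-opt seccomp=unconfined \
      "$IMAGE" python3 -c 'import flashinfer; print(flashinfer.__version__)'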
Supported devices ================================================================================ -**Officially Supported**: AMD Instinct™ MI300X, MI210 +**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210 Use cases and recommendations @@ -70,7 +70,7 @@ llama.cpp is also used in a range of real-world applications, including: For more use cases and recommendations, refer to the `AMD ROCm blog `__, where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs. -- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration `__, +- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration `__ blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for AMD Instinct GPUs within the ROCm ecosystem. @@ -84,9 +84,9 @@ Docker image compatibility -AMD validates and publishes `ROCm llama.cpp Docker images `__ +AMD validates and publishes `ROCm llama.cpp Docker images `__ with ROCm backends on Docker Hub. The following Docker image tags and associated -inventories were tested on `ROCm 6.4.0 `__. +inventories represent the available llama.cpp versions from the official Docker Hub. Click |docker-icon| to view the image on Docker Hub. .. important:: @@ -105,8 +105,115 @@ Click |docker-icon| to view the image on Docker Hub. - Server Docker - Light Docker - llama.cpp + - ROCm - Ubuntu + * - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - `b6356 `__ + - `7.0.0 `__ + - 24.04 + + * - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - `b6356 `__ + - `7.0.0 `__ + - 22.04 + + * - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - `b6356 `__ + - `6.4.3 `__ + - 24.04 + + * - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - `b6356 `__ + - `6.4.3 `__ + - 22.04 + + + * - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - `b6356 `__ + - `6.4.2 `__ + - 24.04 + + * - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - `b6356 `__ + - `6.4.2 `__ + - 22.04 + + + * - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - `b6356 `__ + - `6.4.1 `__ + - 24.04 + + * - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - .. raw:: html + + rocm/llama.cpp + - `b6356 `__ + - `6.4.1 `__ + - 22.04 + * - .. raw:: html rocm/llama.cpp @@ -117,40 +224,52 @@ Click |docker-icon| to view the image on Docker Hub. rocm/llama.cpp - `b5997 `__ + - `6.4.0 `__ - 24.04 + Key ROCm libraries for llama.cpp ================================================================================ llama.cpp functionality on ROCm is determined by its underlying library dependencies. These ROCm components affect the capabilities, performance, and -feature set available to developers. +feature set available to developers. Ensure you have the required libraries for +your corresponding ROCm version. .. 
list-table:: :header-rows: 1 * - ROCm library - - Version + - ROCm 7.0.0 version + - ROCm 6.4.x version - Purpose - Usage * - `hipBLAS `__ - - :version-ref:`hipBLAS rocm_version` + - 3.0.0 + - 2.4.0 - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for matrix and vector operations. - Supports operations such as matrix multiplication, matrix-vector products, and tensor contractions. Utilized in both dense and batched linear algebra operations. * - `hipBLASLt `__ - - :version-ref:`hipBLASLt rocm_version` + - 1.0.0 + - 0.12.0 - hipBLASLt is an extension of the hipBLAS library, providing additional features like epilogues fused into the matrix multiplication kernel or use of integer tensor cores. - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipblasLt kernels where possible. * - `rocWMMA `__ - - :version-ref:`rocWMMA rocm_version` + - 2.0.0 + - 1.7.0 - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix multiplication (GEMM) and accumulation operations with mixed precision support. - Can be used to enhance the flash attention performance on AMD compute, by enabling - the flag during compile time. \ No newline at end of file + the flag during compile time. + +Previous versions +=============================================================================== +See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/llama-cpp-history` to find documentation for previous releases +of the ``ROCm/llama.cpp`` Docker image. \ No newline at end of file diff --git a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst index 234dc82fc..50c2c3821 100644 --- a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst +++ b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst @@ -28,7 +28,7 @@ Supported devices ================================================================================ - **Officially Supported**: AMD Instinct MI300X -- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210X +- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210 Supported models and features ================================================================================ diff --git a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst index e8f1b4195..1550a82d1 100644 --- a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst +++ b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst @@ -27,7 +27,7 @@ Supported Devices ================================================================================ - **Officially Supported**: AMD Instinct MI300X -- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210X +- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210 Supported models and features diff --git a/docs/conf.py b/docs/conf.py index 78d50d502..760e3326c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -110,6 +110,7 @@ article_pages = [ {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]}, {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]}, {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]}, + {"file": "compatibility/ml-compatibility/flashinfer-compatibility", 
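The table above names the two llama.cpp performance switches on ROCm: ``ROCBLAS_USE_HIPBLASLT`` at run time and the rocWMMA flash-attention flag at build time. An end-to-end sketch for a bare-metal build; the CMake option names (GGML_HIP, GGML_HIP_ROCWMMA_FATTN, AMDGPU_TARGETS) follow recent upstream llama.cpp and should be verified against the b6356 tree:

    # Build with the HIP backend, rocWMMA flash attention, and MI300X (gfx942) as the target.
    git clone https://github.com/ggml-org/llama.cpp
    cmake -S llama.cpp -B llama.cpp/build \
        -DGGML_HIP=ON \
        -DGGML_HIP_ROCWMMA_FATTN=ON \
        -DAMDGPU_TARGETS=gfx942
    cmake --build llama.cpp/build -j

    # Dispatch rocBLAS GEMMs through hipBLASLt where possible, per the table.
    export ROCBLAS_USE_HIPBLASLT=1

    # Quantize a GGUF model, then run it fully offloaded to the GPU.
    llama.cpp/build/bin/llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M
    llama.cpp/build/bin/llama-cli -m model-q4_k_m.gguf -ngl 99 -p "Hello"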
"os": ["linux"]}, {"file": "how-to/deep-learning-rocm", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/index", "os": ["linux"]}, diff --git a/docs/how-to/deep-learning-rocm.rst b/docs/how-to/deep-learning-rocm.rst index accb2e546..fb21328f8 100644 --- a/docs/how-to/deep-learning-rocm.rst +++ b/docs/how-to/deep-learning-rocm.rst @@ -128,10 +128,22 @@ The table below summarizes information about ROCm-enabled deep learning framewor - - `Docker image `__ + - `ROCm Base Docker image `__ - .. raw:: html + * - `FlashInfer `__ + - .. raw:: html + + + - + - `Docker image `__ + - `ROCm Base Docker image `__ + - .. raw:: html + + + Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization through the following guides. diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 92f0534f9..bfaef7ffe 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -49,6 +49,8 @@ subtrees: title: Ray compatibility - file: compatibility/ml-compatibility/llama-cpp-compatibility.rst title: llama.cpp compatibility + - file: compatibility/ml-compatibility/flashinfer-compatibility.rst + title: FlashInfer compatibility - file: how-to/build-rocm.rst title: Build ROCm from source From eeea0d2180fabeb0da01ee27db9fab8cda2824f5 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Fri, 3 Oct 2025 13:33:14 -0400 Subject: [PATCH 23/24] Fix heading levels in pages using embedded templates (#5468) --- .../inference/benchmark-docker/vllm.rst | 16 ++++++++----- .../training/benchmark-docker/jax-maxtext.rst | 24 ++++++++++++++++++- .../benchmark-docker/primus-megatron.rst | 15 ++++++++---- .../benchmark-docker/primus-pytorch.rst | 13 ++++++---- 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst index 70121c67f..38a5f8200 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst @@ -138,13 +138,12 @@ To test for optimal performance, consult the recommended :ref:`System health ben `. This suite of tests will help you verify and fine-tune your system's configuration. +Pull the Docker image +===================== + .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml {% set docker = data.dockers[0] %} - {% set model_groups = data.model_groups %} - - Pull the Docker image - ===================== Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_. Use the following command to pull the Docker image from Docker Hub. @@ -153,8 +152,13 @@ system's configuration. docker pull {{ docker.pull_tag }} - Benchmarking - ============ +Benchmarking +============ + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml + + {% set docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} Once the setup is complete, choose between two options to reproduce the benchmark results: diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst index eb56f4dce..eec785b7b 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst @@ -25,7 +25,7 @@ It includes the following software components: {% for docker in dockers %} {% set jax_version = docker.components["JAX"] %} - .. 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
index eb56f4dce..eec785b7b 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -25,7 +25,7 @@ It includes the following software components:
    {% for docker in dockers %}
    {% set jax_version = docker.components["JAX"] %}
 
-   .. tab-item:: JAX {{ jax_version }}
+   .. tab-item:: ``{{ docker.pull_tag }}``
       :sync: {{ docker.pull_tag }}
 
       .. list-table::
@@ -132,6 +132,28 @@ This Docker image is optimized for specific model configurations outlined
 as follows. Performance can vary for other training workloads, as AMD doesn’t
 validate configurations and run conditions outside those described.
 
+Pull the Docker image
+---------------------
+
+Use the following command to pull the Docker image from Docker Hub.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   .. tab-set::
+
+      {% for docker in dockers %}
+      {% set jax_version = docker.components["JAX"] %}
+
+      .. tab-item:: JAX {{ jax_version }}
+         :sync: {{ docker.pull_tag }}
+
+         .. code-block:: shell
+
+            docker pull {{ docker.pull_tag }}
+
+   {% endfor %}
+
 .. _amd-maxtext-multi-node-setup-v257:
 
 Multi-node configuration

diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
index 853d24395..65ac5e50c 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
@@ -105,21 +105,26 @@ system's configuration.
 
 .. _mi300x-amd-primus-megatron-lm-training:
 
+Environment setup
+=================
+
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
 
    {% set dockers = data.dockers %}
    {% set docker = dockers[0] %}
 
-   Environment setup
-   =================
-
    Use the following instructions to set up the environment, configure the script to train
   models, and reproduce the benchmark results on MI300X series GPUs with the
   ``{{ docker.pull_tag }}`` image.
 
 .. _amd-primus-megatron-lm-requirements:
 
-   Download the Docker image
-   -------------------------
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
 
    1. Use the following command to pull the Docker image from Docker Hub.

diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
index b2bd5fb87..5c99776ee 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
@@ -104,22 +104,25 @@ This Docker image is optimized for specific model configurations outlined
 below. Performance can vary for other training workloads, as AMD doesn’t
 test configurations and run conditions outside those described.
 
+Pull the Docker image
+=====================
+
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
 
    {% set unified_docker = data.dockers[0] %}
 
-   Pull the Docker image
-   =====================
-
   Use the following command to pull the `Docker image <{{ unified_docker.docker_hub_url }}>`_
   from Docker Hub.
 
   .. code-block:: shell
 
      docker pull {{ unified_docker.pull_tag }}
 
-   Run training
-   ============
+Run training
+============
 
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+   {% set unified_docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}
 
   Once the setup is complete, choose between the following two workflows to start benchmarking training.
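All four guides in this patch converge on the same pull-then-run flow. After any of the templated docker pull commands above, a typical interactive launch uses the device flags these training guides rely on elsewhere; the image name is a placeholder for whichever pull_tag the template renders:

    docker run -it --rm \
      --device=/dev/kfd --device=/dev/dri \    # GPU compute and render nodes
      --network=host --ipc=host \
      --group-add video --cap-add=SYS_PTRACE \
      --security-opt seccomp=unconfined \
      --shm-size=128G \
      -v "$HOME/models:/models" \
      <training-image:pull-tag> bash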
From f20edab8fc7e30cee00694b3727d050e753daf3c Mon Sep 17 00:00:00 2001
From: amd-hsivasun
Date: Tue, 7 Oct 2025 18:44:21 +0000
Subject: [PATCH 24/24] [Ex CI] Update CMake Flags for hipTensor

---
 .azuredevops/components/hipTensor.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.azuredevops/components/hipTensor.yml b/.azuredevops/components/hipTensor.yml
index 68fe794b4..30db323a2 100644
--- a/.azuredevops/components/hipTensor.yml
+++ b/.azuredevops/components/hipTensor.yml
@@ -77,6 +77,7 @@ jobs:
       extraBuildFlags: >-
         -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/llvm
         -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+        -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
        -DROCM_PATH=$(Agent.BuildDirectory)/rocm
        -DCMAKE_BUILD_TYPE=Release
        -DHIPTENSOR_BUILD_TESTS=ON
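Patch 24 pins the C compiler next to the existing C++ entry so both halves of the toolchain resolve to ROCm's LLVM instead of the system compiler. Outside the pipeline, the equivalent configure step looks roughly like this, assuming a ROCm install under /opt/rocm rather than the agent's staged tree:

    cmake -S hipTensor -B hipTensor/build \
        -DCMAKE_PREFIX_PATH="/opt/rocm;/opt/rocm/llvm" \
        -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/amdclang++ \
        -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/amdclang \
        -DROCM_PATH=/opt/rocm \
        -DCMAKE_BUILD_TYPE=Release \
        -DHIPTENSOR_BUILD_TESTS=ON
    cmake --build hipTensor/build -j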