Fix compatibility list

2026-01-13 08:38:04 -05:00 · 2025-05-13 16:02:55 +02:00
38 changed files with 94 additions and 284 deletions
--- a/.azuredevops/components/HIP.yml
+++ b/.azuredevops/components/HIP.yml
@@ -77,8 +77,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: clr
-      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
+      cmakeBuildDir: 'clr/build'
      extraBuildFlags: >-
        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
        -DHIP_PLATFORM=amd
@@ -139,8 +138,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: clr
-      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
+      cmakeBuildDir: 'clr/build'
      extraBuildFlags: >-
        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
        -DHIP_PLATFORM=nvidia
--- a/.azuredevops/components/HIPIFY.yml
+++ b/.azuredevops/components/HIPIFY.yml
@@ -73,7 +73,6 @@ jobs:
    parameters:
      componentName: upstream-llvm
      cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
-      cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
      installDir: $(Pipeline.Workspace)/llvm
      extraBuildFlags: >-
        -DCMAKE_BUILD_TYPE=Release
--- a/.azuredevops/components/aomp.yml
+++ b/.azuredevops/components/aomp.yml
@@ -118,7 +118,6 @@ jobs:
    parameters:
      componentName: extras
      cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
@@ -130,7 +129,6 @@ jobs:
    parameters:
      componentName: openmp
      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
@@ -157,7 +155,6 @@ jobs:
    parameters:
      componentName: offload
      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
--- a/.azuredevops/components/hipSOLVER.yml
+++ b/.azuredevops/components/hipSOLVER.yml
@@ -92,8 +92,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: external
-        cmakeBuildDir: '$(Build.SourcesDirectory)/deps/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/deps'
+        cmakeBuildDir: 'deps/build'
        installDir: '$(Pipeline.Workspace)/deps-install'
        extraBuildFlags: >-
          -DBUILD_BOOST=OFF
--- a/.azuredevops/components/llvm-project.yml
+++ b/.azuredevops/components/llvm-project.yml
@@ -83,8 +83,7 @@ jobs:
        -DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
        -DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
        -GNinja
-      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
+      cmakeBuildDir: 'llvm/build'
      installDir: '$(Build.BinariesDirectory)/llvm'
 # use llvm-lit to run unit tests for llvm, clang, and lld
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
@@ -122,8 +121,7 @@ jobs:
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
        -DCMAKE_BUILD_TYPE=Release
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
+      cmakeBuildDir: 'amd/device-libs/build'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: comgr
@@ -131,8 +129,7 @@ jobs:
        -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
        -DCOMGR_DISABLE_SPIRV=1
        -DCMAKE_BUILD_TYPE=Release
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
+      cmakeBuildDir: 'amd/comgr/build'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: comgr
@@ -145,8 +142,7 @@ jobs:
      extraBuildFlags: >-
        -DCMAKE_BUILD_TYPE=Release
        -DHIPCC_BACKWARD_COMPATIBILITY=OFF
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
+      cmakeBuildDir: 'amd/hipcc/build'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
--- a/.azuredevops/components/rdc.yml
+++ b/.azuredevops/components/rdc.yml
@@ -105,7 +105,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
-        cmakeSourceDir: $(Build.SourcesDirectory)/grpc
        installDir: $(Build.SourcesDirectory)/bin
        extraBuildFlags: >-
          -DgRPC_INSTALL=ON
--- a/.azuredevops/components/rocAL.yml
+++ b/.azuredevops/components/rocAL.yml
@@ -125,7 +125,6 @@ jobs:
      parameters:
        componentName: PyBind11
        cmakeBuildDir: '$(Build.SourcesDirectory)/pybind11/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/pybind11'
        customInstallPath: false
        installEnabled: false
        extraBuildFlags: >-
@@ -142,7 +141,6 @@ jobs:
      parameters:
        componentName: RapidJSON
        cmakeBuildDir: '$(Build.SourcesDirectory)/rapidjson/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/rapidjson'
        customInstallPath: false
        installEnabled: false
        extraBuildFlags: >-
@@ -202,6 +200,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm/include/rocal
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -108,6 +108,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocJPEG.yml
+++ b/.azuredevops/components/rocJPEG.yml
@@ -114,6 +114,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocPRIM.yml
+++ b/.azuredevops/components/rocPRIM.yml
@@ -5,12 +5,6 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
- name: sparseCheckout
-  type: boolean
-  default: false
- name: sparseCheckoutDir
-  type: string
-  default: ''
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -72,8 +66,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckout: ${{ parameters.sparseCheckout }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -168,6 +168,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocSOLVER.yml
+++ b/.azuredevops/components/rocSOLVER.yml
@@ -105,7 +105,6 @@ jobs:
          -DLAPACKE=OFF
          -GNinja
        cmakeBuildDir: '$(Build.SourcesDirectory)/lapack/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/lapack'
        installDir: '$(Pipeline.Workspace)/deps-install'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -167,6 +167,7 @@ jobs:
      value: $(Agent.BuildDirectory)/rocm
    pool:
      name: ${{ job.target }}_test_pool
+      demands: firstRenderDeviceAccess
    workspace:
      clean: all
    steps:
--- a/.azuredevops/dependencies/grpc.yml
+++ b/.azuredevops/dependencies/grpc.yml
@@ -38,7 +38,6 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      cmakeBuildDir: $(Agent.BuildDirectory)/grpc/build
-      cmakeSourceDir: $(Agent.BuildDirectory)/grpc
      extraBuildFlags: >-
        -DgRPC_INSTALL=ON
        -DgRPC_BUILD_TESTS=OFF
--- a/.azuredevops/dependencies/gtest.yml
+++ b/.azuredevops/dependencies/gtest.yml
@@ -38,7 +38,6 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
-      cmakeSourceDir: $(Agent.BuildDirectory)/googletest
      extraBuildFlags: >-
        -DGTEST_FORCE_SHARED_CRT=ON
        -DCMAKE_DEBUG_POSTFIX=d
--- a/.azuredevops/templates/steps/build-cmake.yml
+++ b/.azuredevops/templates/steps/build-cmake.yml
@@ -10,10 +10,10 @@ parameters:
  default: ''
 - name: cmakeBuildDir
  type: string
-  default: $(Agent.BuildDirectory)/s/build
+  default: 'build'
 - name: cmakeSourceDir
  type: string
-  default: $(Agent.BuildDirectory)/s
+  default: '..'
 - name: customBuildTarget
  type: string
  default: ''
@@ -46,7 +46,7 @@ steps:
    ${{ if eq(parameters.customInstallPath, true) }}:
      cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
    ${{ else }}:
-      cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
+      cmakeArgs: ${{ parameters.extraBuildFlags }} ..
 - ${{ if parameters.printDiskSpace }}:
  - script: df -h
    displayName: Disk space before build
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -4,12 +4,6 @@ parameters:
 - name: checkoutRepo
  type: string
  default: 'self'
- name: sparseCheckout
-  type: boolean
-  default: false
- name: sparseCheckoutDir
-  type: string
-  default: ''
 # submodule download behaviour
 # change to 'recursive' for repos with submodules
 - name: submoduleBehaviour
@@ -21,13 +15,3 @@ steps:
    clean: true
    submodules: ${{ parameters.submoduleBehaviour }}
    retryCountOnTaskFailure: 3
-    fetchFilter: blob:none
-    ${{ if eq(parameters.sparseCheckout, true) }}:
-      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
-      path: sparse
-  - ${{ if eq(parameters.sparseCheckout, true) }}:
-    - task: Bash@3
-      displayName: Symlink sparse checkout
-      inputs:
-        targetType: inline
-        script: ln -s $(Agent.BuildDirectory)/sparse/${{ parameters.sparseCheckoutDir }} $(Agent.BuildDirectory)/s
--- a/.azuredevops/templates/steps/docker-container.yml
+++ b/.azuredevops/templates/steps/docker-container.yml
@@ -106,7 +106,6 @@ parameters:
  type: object
  default:
    - gfx90a
-    - gfx942

 steps:
 # these steps should only be run if there was a failure or warning
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -34,7 +34,6 @@ Autocast
 BARs
 BLAS
 BMC
-BabelStream
 Blit
 Blockwise
 Bluefield
@@ -139,7 +138,6 @@ GDR
 GDS
 GEMM
 GEMMs
-GFLOPS
 GFortran
 GFXIP
 Gemma
@@ -643,7 +641,6 @@ hipSPARSELt
 hipTensor
 hipamd
 hipblas
-hipcc
 hipcub
 hipfft
 hipfort
--- a/README.md
+++ b/README.md
@@ -127,7 +127,6 @@ bash install-prerequisites.sh
 export GPU_ARCHS="gfx942"               # Example
 export GPU_ARCHS="gfx940;gfx941;gfx942" # Example

-cd ~/WORKSPACE/
 # Pick and run build commands in the docker container:
 # Build rocm-dev packages
 make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -496,7 +496,7 @@ Modules for JAX extensions.
      - 5.5.0

 Unsupported JAX features
-===============================================================================
+--------------------------------------------------------------------------------

 The following GPU-accelerated JAX features are not supported by ROCm for
 the listed supported JAX versions.
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -51,8 +51,6 @@ article_pages = [
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
@@ -69,6 +67,7 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/install", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
--- a/docs/how-to/rocm-for-ai/inference/install.rst
+++ b/docs/how-to/rocm-for-ai/inference/install.rst
@@ -30,7 +30,7 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install

 * :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`

-* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`
+* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`.

 .. grid:: 1

@@ -59,8 +59,4 @@ images with the framework pre-installed.

 * :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`

-Next steps
-==========
-
-After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks
-to test the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started.
+The sections that follow in :doc:`Training a model <../training/train-a-model>` are geared toward a ROCm installation with PyTorch.
--- a/docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
+++ b/docs/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.rst
@@ -62,52 +62,47 @@ PyTorch inference performance testing
      {% endfor %}
   {% endfor %}

-   System validation
-   =================
+   Getting started
+   ===============

-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+   Use the following procedures to reproduce the benchmark results on an
+   MI300X series accelerator with the prebuilt PyTorch Docker image.

-   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
-   might hang until the periodic balancing is finalized. For more information,
-   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+   .. _pytorch-benchmark-get-started:

-   .. code-block:: shell
+   1. Disable NUMA auto-balancing.

-      # disable automatic NUMA balancing
-      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
-      # check if NUMA balancing is disabled (returns 0 if disabled)
-      cat /proc/sys/kernel/numa_balancing
-      0
+      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+      might hang until the periodic balancing is finalized. For more information,
+      see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+      .. code-block:: shell

-   Pull the Docker image
-   =====================
+         # disable automatic NUMA balancing
+         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+         # check if NUMA balancing is disabled (returns 0 if disabled)
+         cat /proc/sys/kernel/numa_balancing
+         0

   .. container:: model-doc pyt_chai1_inference

-      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
+      2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.

-      .. code-block:: shell
+         .. code-block:: shell

-         docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue
+            docker pull rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue

-      .. note::
+         .. note::

-         The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
+            The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.

   .. container:: model-doc pyt_clip_inference

-      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
+      2. Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.

-      .. code-block:: shell
+         .. code-block:: shell

-         docker pull rocm/pytorch:latest
-
-   .. _pytorch-benchmark-get-started:
+            docker pull rocm/pytorch:latest

   Benchmarking
   ============
--- a/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst
+++ b/docs/how-to/rocm-for-ai/inference/vllm-benchmark.rst
@@ -111,37 +111,35 @@ vLLM inference performance testing
   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
   see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.

-   System validation
-   =================
+   Getting started
+   ===============

-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+   Use the following procedures to reproduce the benchmark results on an
+   MI300X accelerator with the prebuilt vLLM Docker image.

-   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
-   might hang until the periodic balancing is finalized. For more information,
-   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+   .. _vllm-benchmark-get-started:

-   .. code-block:: shell
+   1. Disable NUMA auto-balancing.

-      # disable automatic NUMA balancing
-      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
-      # check if NUMA balancing is disabled (returns 0 if disabled)
-      cat /proc/sys/kernel/numa_balancing
-      0
+      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+      might hang until the periodic balancing is finalized. For more information,
+      see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+      .. code-block:: shell

-   Pull the Docker image
-   =====================
+         # disable automatic NUMA balancing
+         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+         # check if NUMA balancing is disabled (returns 0 if disabled)
+         cat /proc/sys/kernel/numa_balancing
+         0

-   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
-   Use the following command to pull the Docker image from Docker Hub.
+   2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.

-   .. code-block:: shell
+      Use the following command to pull the Docker image from Docker Hub.

-      docker pull {{ unified_docker.pull_tag }}
+      .. code-block:: shell
+
+         docker pull {{ unified_docker.pull_tag }}

   Benchmarking
   ============
--- a/docs/how-to/rocm-for-ai/system-health-check.rst
+++ b/docs/how-to/rocm-for-ai/system-health-check.rst
@@ -1,104 +0,0 @@
-.. meta::
-   :description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
-   :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference
-
-.. _rocm-for-ai-system-health-bench:
-
-************************
-System health benchmarks
-************************
-
-Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).
-
-ROCm Validation Suite (RVS) tests
-=================================
-
-RVS provides a collection of tests, benchmarks, and qualification tools, each
-targeting a specific subsystem of the system under test. It includes tests for
-GPU stress and memory bandwidth.
-
-.. _healthcheck-install-rvs:
-
-Install ROCm Validation Suite
-----------------------------
-
-To get started, install RVS. For example, on an Ubuntu system with ROCm already
-installed, run the following command:
-
-.. code-block:: shell
-
-   sudo apt update
-   sudo apt install rocm-validation-suite
-
-See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
-and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
-in the Instinct documentation for more detailed instructions.
-
-Benchmark, stress, and qualification tests
------------------------------------------
-
-The GPU stress test runs various GEMM computations as workloads to stress the GPU FLOPS performance and check whether it
-meets the configured target GFLOPS.
-
-Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
-<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
-section of the Instinct documentation for usage instructions.
-
-BabelStream test
----------------
-
-BabelStream is a synthetic GPU benchmark based on the STREAM benchmark for
-CPUs, measuring memory transfer rates to and from global device memory.
-BabelStream tests are included with the RVS package as part of the `BABEL module
-<https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.
-
-For more information, see `Performance benchmarking
-<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
-in the Instinct documentation.
-
-RCCL tests
-==========
-
-The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
-communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
-the performance and verifies the correctness of these collective operations.
-This helps ensure optimal scaling for multi-accelerator tasks.
-
-1. To get started, build RCCL-tests using the official instructions in the README at
-   `<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
-   following commands:
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/rccl-tests.git
-      cd rccl-tests
-      make
-
-2. Run the suggested RCCL tests -- see `RCCL benchmarking
-   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
-   in the Instinct performance benchmarking documentation for instructions.
-
-TransferBench test
-==================
-
-TransferBench is a standalone utility for benchmarking simultaneous data
-transfer performance between various devices in the system, including
-CPU-to-GPU and GPU-to-GPU (peer-to-peer). This helps identify potential
-bottlenecks in data movement between the host system and the GPUs, or between
-GPUs, which can impact end-to-end latency.
-
-.. _healthcheck-install-transferbench:
-
-1. To get started, use the instructions in the `TransferBench documentation
-   <https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
-   or use the following commands:
-
-   .. code:: shell
-
-      git clone https://github.com/ROCm/TransferBench.git
-      cd TransferBench
-      CC=hipcc make
-
-2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
-   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
-   in the Instinct performance benchmarking documentation for instructions.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -79,18 +79,11 @@ across different input sequences. Support for packed input format is planned for
 System validation
 =================

-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
 before starting training.

-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
 Environment setup
 =================

@@ -182,8 +175,8 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.

 .. _amd-maxtext-download-docker:

-Pull the Docker image
---------------------
+Download the Docker image
+-------------------------

 1. Use the following command to pull the Docker image from Docker Hub.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -103,18 +103,11 @@ popular AI models.
 System validation
 =================

-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
 before starting training.

-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
 .. _mi300x-amd-megatron-lm-training:

 Environment setup
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
@@ -34,18 +34,11 @@ for MPT-30B with access to detailed logs and performance metrics.
 System validation
 =================

-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
 before starting training.

-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
 Getting started
 ===============

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -77,18 +77,11 @@ popular AI models.
 System validation
 =================

-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+If you have already validated your system settings, including NUMA
+auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
+and optimization steps <train-a-model-system-validation>` to set up your system
 before starting training.

-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
 This Docker image is optimized for specific model configurations outlined
 below. Performance can vary for other training workloads, as AMD 
 doesn’t validate configurations and run conditions outside those described.
--- a/docs/how-to/rocm-for-ai/training/index.rst
+++ b/docs/how-to/rocm-for-ai/training/index.rst
@@ -21,12 +21,8 @@ In this guide, you'll learn about:

 - Training a model

-  - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
+  - :doc:`Train a model with Megatron-LM <benchmark-docker/megatron-lm>`

-  - :doc:`With PyTorch <benchmark-docker/pytorch-training>`
-
-  - :doc:`With JAX MaxText <benchmark-docker/jax-maxtext>`
-
-  - :doc:`With LLM Foundry <benchmark-docker/mpt-llm-foundry>`
+  - :doc:`Train a model with PyTorch <benchmark-docker/pytorch-training>`

 - :doc:`Scaling model training <scale-model-training>`
--- a/docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst
+++ b/docs/how-to/rocm-for-ai/training/prerequisite-system-validation.rst
@@ -5,13 +5,12 @@
   :keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax

 .. _train-a-model-system-validation:
-.. _rocm-for-ai-system-optimization:

-**********************************************************
-Prerequisite system validation before running AI workloads
-**********************************************************
+**********************************************
+Prerequisite system validation before training
+**********************************************

-Complete the following system validation and optimization steps to set up your system before starting training and inference.
+Complete the following system validation and optimization steps to set up your system before starting training.

 Disable NUMA auto-balancing
 ---------------------------
@@ -27,8 +26,7 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.

   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

-See `Disable NUMA auto-balancing <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_
-in the Instinct documentation for more information.
+See :ref:`mi300x-disable-numa` for more information.

 Hardware verification with ROCm
 -------------------------------
@@ -44,8 +42,7 @@ Run the command:

   rocm-smi --setperfdeterminism 1900

-See `Hardware verfication for ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
-in the Instinct documentation for more information.
+See :ref:`mi300x-hardware-verification-with-rocm` for more information.

 RCCL Bandwidth Test for multi-node setups
 -----------------------------------------
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -36,10 +36,6 @@ subtrees:
    title: Use ROCm for AI
    subtrees:
    - entries:
-      - file: how-to/rocm-for-ai/install.rst
-        title: Installation
-      - file: how-to/rocm-for-ai/system-health-check.rst
-        title: System health benchmarks
      - file: how-to/rocm-for-ai/training/index.rst
        title: Training
        subtrees:
@@ -74,6 +70,8 @@ subtrees:
        title: Inference
        subtrees:
        - entries:
+          - file: how-to/rocm-for-ai/inference/install.rst
+            title: Installation
          - file: how-to/rocm-for-ai/inference/hugging-face-models.rst
            title: Run models from Hugging Face
          - file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
--- a/tools/rocm-build/ROCm.mk
+++ b/tools/rocm-build/ROCm.mk
@@ -115,7 +115,7 @@ $(call adddep,roctracer,${ASAN_DEP} rocr hip_on_rocclr)


 # rocm-dev points to all possible last finish components of Stage1 build.
-rocm-dev-components :=amd_smi_lib aqlprofile comgr dbgapi devicelibs hip_on_rocclr hipcc hipify_clang \
+rocm-dev-components :=amd_smi_lib aqlprofile aqlprofiletest comgr dbgapi devicelibs hip_on_rocclr hipcc hipify_clang \
 	lightning rocprofiler-compute opencl_on_rocclr openmp_extras rocm_bandwidth_test rocm_smi_lib \
 	rocm-cmake rocm-core rocm-gdb rocminfo rocprofiler-register rocprofiler-sdk rocprofiler-systems \
 	rocprofiler rocr rocr_debug_agent rocrsamples roctracer
--- a/tools/rocm-build/docker/ubuntu22/install-prerequisities.sh
+++ b/tools/rocm-build/docker/ubuntu22/install-prerequisities.sh
--- a/tools/rocm-build/docker/ubuntu22/packages
+++ b/tools/rocm-build/docker/ubuntu22/packages
@@ -60,6 +60,7 @@ libfile-find-rule-perl
 libgflags-dev
 libglew-dev
 libgmp-dev
+libgoogle-glog-dev
 libgtk2.0-dev
 libhdf5-serial-dev
 libjpeg-dev
@@ -89,6 +90,7 @@ libsuitesparse-dev
 libsystemd-dev
 libtinfo-dev
 libtool
+libunwind-dev
 liburi-encode-perl
 libva-dev
 libvirt-clients
@@ -96,6 +98,7 @@ libvirt-daemon-system
 libyaml-cpp-dev
 libzstd-dev
 llvm
+llvm-6.0-dev
 llvm-dev
 llvm-runtime
 mesa-common-dev
@@ -109,7 +112,8 @@ pigz
 pkg-config
 protobuf-compiler
 python-is-python3
-python3-pip-whl
+python-pip-whl
+python-yaml
 python3-dev
 python3-pip
 python3-venv
--- a/tools/rocm-build/docker/ubuntu24/install-prerequisites.sh
+++ b/tools/rocm-build/docker/ubuntu24/install-prerequisites.sh
@@ -17,7 +17,7 @@ git --version

 # venv for python to be able to run pip3 without --break-system-packages
 python3 -m venv /opt/venv
-source /opt/venv/bin/activate
+
 pip3 install CppHeaderParser argparse lxml recommonmark jinja2==3.0.0 \
    websockets matplotlib numpy scipy minimal msgpack pytest sphinx joblib PyYAML rocm-docs-core cmake==3.25.2 pandas \
    myst-parser setuptools lit
--- a/tools/rocm-build/envsetup.sh
+++ b/tools/rocm-build/envsetup.sh
@@ -217,7 +217,7 @@ export RCCL_ROOT=$WORK_ROOT/rccl
 export ROCM_DBGAPI_ROOT=$WORK_ROOT/ROCdbgapi
 export ROCM_GDB_ROOT=$WORK_ROOT/ROCgdb
 # export ROCclr_ROOT=$WORK_ROOT/vdi
-export HIP_ON_ROCclr_ROOT=$WORK_ROOT/hip
+export HIP_ON_ROCclr_ROOT=$WORK_ROOT/HIP
 export HIPAMD_ROOT=$WORK_ROOT/hipamd
 export HIP_CATCH_TESTS_ROOT=$WORK_ROOT/hip-tests
 # export OPENCL_ON_ROCclr_ROOT=$WORK_ROOT/opencl-on-vdi