Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-12 08:08:03 -05:00)

Compare commits: docs_versi… → docs/6.3.3 (85 commits)
Commits (SHA1):

7d57bc520f, f8d4957944, 306d511aac, 177f5c4461, b92b122236, a479555bd3, 8430bd089b, 86d8bac331, 214bc23817, 6634ef8f62,
928a972ca2, f1bbe0cef5, 51050661af, 00fa68cf1a, 51be44e941, bf0fcd370c, c364eb3b53, 5ce76d3b68, 76796e56e5, 571578a8bd,
2415f36078, 7df0804c06, 820db2c544, e7484f25e7, 061ae36a84, 7fad99073b, c6ceff11f0, acc7a23265, 4a14260695, f747943d36,
3d59247e7a, d5b1fd4389, a05d9e2fa0, 7ddb10a0fc, 63f9bc30bd, b174ab767e, f75ef9e2c1, e5bf76ead1, 5393e90a8e, fbc2815223,
2b96a37b08, 1e5ad14d86, f9d6bd4db8, 23e78c8d55, 0edd31bde6, 4af488e27d, 7ae7046301, 358092386e, e071738908, cd79403931,
275ef1d511, 065fe8b138, be36c1808e, 64c362a961, d392eca232, 1b58c08394, 73ab81fbaf, ddfb5bda12, ae7f47a0a2, 5e5f7d6bb7,
da1125e228, e55b9f2a33, 761a524d03, c895ee483c, e049d952d4, ce41922bb5, 2b53b40caa, 9250e1ba28, 3c055ab65b, 44aaf1b57c,
822e789998, 243ac78609, c2f483332f, b35267b6bd, deb4895b11, 8c036531e8, 484cbefc2e, 721b60d52f, 8ebe7be283, 7e8947fdb4,
66cac5301f, 9f3a1de117, 0915fb17e8, 0d3eb1d774, 7a258cdba9
@@ -84,8 +84,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
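Dozens of hunks in this comparison repeat the same edit: the gfx90a leg of a job's build matrix is deleted, leaving gfx942 as the only GPU target. For orientation, here is a minimal sketch of the job shape being edited; the job name, pool, and echo step are illustrative, not taken from the diff.

```yaml
# Skeleton of the matrix jobs these hunks trim (illustrative names).
jobs:
- job: example_component
  pool: example_pool
  strategy:
    matrix:                 # one job instance per matrix entry
      gfx942:
        JOB_GPU_TARGET: gfx942
      # The repeated hunks delete this second entry:
      # gfx90a:
      #   JOB_GPU_TARGET: gfx90a
  steps:
  - script: echo "building for $(JOB_GPU_TARGET)"
    displayName: build (per GPU target)
```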
@@ -67,8 +67,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -77,8 +77,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -67,8 +67,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
@@ -87,6 +87,7 @@ jobs:
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
       pipModules: ${{ parameters.pipModules }}
+      gpuTarget: $(JOB_GPU_TARGET)

 - job: Tensile_testing
   timeoutInMinutes: 90
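This hunk starts passing the matrix variable through to a step template as a gpuTarget parameter. A minimal sketch of how such a template can declare and consume that parameter follows; the template below is illustrative, not the real dependencies-other.yml.

```yaml
# hypothetical steps template: steps/gpu-deps-example.yml
parameters:
- name: gpuTarget
  type: string
  default: ''

steps:
- script: echo "resolving dependencies for GPU target '${{ parameters.gpuTarget }}'"
  displayName: dependencies (gpuTarget-aware)
```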
@@ -42,8 +42,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -48,8 +48,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -52,8 +52,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -63,8 +63,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -72,8 +72,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
@@ -158,7 +156,6 @@ jobs:
   - deps

 - job: hipBLASLt_testing
   timeoutInMinutes: 120
   dependsOn: hipBLASLt
   condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
   variables:
@@ -43,8 +43,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -54,8 +54,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -45,8 +45,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -57,8 +57,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -52,8 +52,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
@@ -105,7 +105,6 @@ jobs:
       -DCMAKE_BUILD_TYPE=Release
       -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
       -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
       -DCMAKE_Fortran_COMPILER=f95
       -DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
       -DTensile_LOGIC=
       -DTensile_CPU_THREADS=
@@ -42,8 +42,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -51,8 +51,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
@@ -29,7 +29,7 @@ jobs:
       value: '$(Build.BinariesDirectory)/amdgcn/bitcode'
     - name: HIP_PATH
       value: '$(Agent.BuildDirectory)/rocm'
-  pool: ${{ variables.ULTRA_BUILD_POOL }}
+  pool: ${{ variables.MEDIUM_BUILD_POOL }}
   workspace:
     clean: all
   steps:

@@ -51,7 +51,7 @@ jobs:
       extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Build.BinariesDirectory)/llvm;$(Build.BinariesDirectory)"
        -DCMAKE_BUILD_TYPE=Release
-       -DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir;flang
+       -DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir
        -DLLVM_ENABLE_RUNTIMES=compiler-rt;libunwind;libcxx;libcxxabi
        -DCLANG_ENABLE_AMDCLANG=ON
        -DLLVM_TARGETS_TO_BUILD=AMDGPU;X86

@@ -85,7 +85,7 @@ jobs:
       componentName: check-llvm
       testDir: 'llvm/build'
       testExecutable: './bin/llvm-lit'
-      testParameters: '-q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test'
+      testParameters: '-q --xunit-xml-output=llvm_test_output.xml ./test'
       testOutputFile: llvm_test_output.xml
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
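The last hunk above drops lit's `--filter-out` flag from the check-llvm invocation, so the suite runs without skipping the `live-debug-values-spill-tracking` tests. Assuming the test step ultimately runs llvm-lit from the build tree, the two invocations compare roughly as follows; the paths are illustrative.

```yaml
steps:
- script: |
    cd llvm/build
    # Old parameters: --filter-out skips tests whose names match the regex.
    # ./bin/llvm-lit -q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test
    # New parameters: run the full suite.
    ./bin/llvm-lit -q --xunit-xml-output=llvm_test_output.xml ./test
  displayName: check-llvm (sketch)
```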
.azuredevops/components/omnitrace.yml (new file, 140 lines)

@@ -0,0 +1,140 @@
+# largely referenced from: https://github.com/ROCm/omnitrace/blob/main/.github/workflows/ubuntu-jammy.yml
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - autoconf
+    - autotools-dev
+    - bison
+    - build-essential
+    - bzip2
+    - clang
+    - cmake
+    - environment-modules
+    - g++-12
+    - libdrm-dev
+    - libfabric-dev
+    - libiberty-dev
+    - libpapi-dev
+    - libpfm4-dev
+    - libtool
+    - libopenmpi-dev
+    - m4
+    - openmpi-bin
+    - software-properties-common
+    - python3-pip
+    - texinfo
+    - zlib1g-dev
+- name: pipModules
+  type: object
+  default:
+    - numpy
+    - perfetto
+    - dataclasses
+- name: rocmDependencies
+  type: object
+  default:
+    - aomp
+    - clr
+    - llvm-project
+    - rccl
+    - rocm-core
+    - rocm_smi_lib
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler
+    - rocprofiler-register
+    - roctracer
+
+jobs:
+- job: omnitrace
+  variables:
+  - group: common
+  - template: /.azuredevops/variables-global.yml
+  pool: ${{ variables.MEDIUM_BUILD_POOL }}
+  workspace:
+    clean: all
+  strategy:
+    matrix:
+      gfx942:
+        JOB_GPU_TARGET: gfx942
+  steps:
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    parameters:
+      aptPackages: ${{ parameters.aptPackages }}
+      pipModules: ${{ parameters.pipModules }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    parameters:
+      checkoutRepo: ${{ parameters.checkoutRepo }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+    parameters:
+      checkoutRef: ${{ parameters.checkoutRef }}
+      dependencyList: ${{ parameters.rocmDependencies }}
+      gpuTarget: $(JOB_GPU_TARGET)
+  - task: Bash@3
+    displayName: ROCm symbolic link
+    inputs:
+      targetType: inline
+      script: |
+        sudo rm -rf /opt/rocm
+        sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
+  - task: Bash@3
+    displayName: Add ROCm binaries to PATH
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+  - task: Bash@3
+    displayName: Add ROCm compilers to PATH
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    parameters:
+      # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
+      extraBuildFlags: >-
+        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+        -DOMNITRACE_BUILD_TESTING=ON
+        -DOMNITRACE_BUILD_DYNINST=ON
+        -DOMNITRACE_BUILD_LIBUNWIND=ON
+        -DDYNINST_BUILD_TBB=ON
+        -DDYNINST_BUILD_ELFUTILS=ON
+        -DDYNINST_BUILD_LIBIBERTY=ON
+        -DDYNINST_BUILD_BOOST=ON
+        -DOMNITRACE_USE_PAPI=ON
+        -DOMNITRACE_USE_MPI=ON
+        -DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
+      multithreadFlag: -- -j32
+  - task: Bash@3
+    displayName: Set up omnitrace env
+    inputs:
+      targetType: inline
+      script: source share/omnitrace/setup-env.sh
+      workingDirectory: build
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+    parameters:
+      componentName: omnitrace
+  - task: Bash@3
+    displayName: Remove ROCm binaries from PATH
+    condition: always()
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.setvariable variable=PATH]$(echo $PATH | sed -e 's;:$(Agent.BuildDirectory)/rocm/bin;;' -e 's;^/;;' -e 's;/$;;')"
+  - task: Bash@3
+    displayName: Remove ROCm compilers from PATH
+    condition: always()
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.setvariable variable=PATH]$(echo $PATH | sed -e 's;:$(Agent.BuildDirectory)/rocm/llvm/bin;;' -e 's;^/;;' -e 's;/$;;')"
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+    parameters:
+      gpuTarget: $(JOB_GPU_TARGET)
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+    parameters:
+      gpuTarget: $(JOB_GPU_TARGET)
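The new omnitrace.yml drives PATH entirely through Azure DevOps logging commands: `##vso[task.prependpath]` pushes a directory onto PATH for all subsequent steps, and `##vso[task.setvariable variable=PATH]` overwrites PATH (here via sed) to undo that at the end. A minimal standalone sketch of the pattern, with /opt/example/bin as an illustrative directory:

```yaml
steps:
- script: echo "##vso[task.prependpath]/opt/example/bin"
  displayName: prepend a directory to PATH (visible to later steps)
- script: echo "PATH is now $PATH"
  displayName: use the modified PATH
- script: echo "##vso[task.setvariable variable=PATH]$(echo "$PATH" | sed -e 's;/opt/example/bin:;;')"
  displayName: strip the directory again
```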
@@ -64,8 +64,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -65,8 +65,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -73,8 +73,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - task: Bash@3
     displayName: 'Register libjpeg-turbo packages'
@@ -60,8 +60,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -75,8 +75,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -55,8 +55,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -47,8 +47,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -42,8 +42,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -48,8 +48,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -45,8 +45,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -58,8 +58,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -56,8 +56,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -47,8 +47,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -57,8 +57,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
@@ -8,6 +8,7 @@ parameters:
 - name: aptPackages
   type: object
   default:
     - cmake
     - doxygen
     - doxygen-doc
     - ninja-build
@@ -17,9 +18,7 @@ parameters:
   type: object
   default:
     - cget
-    - cmake==3.20.5
-    - ninja
     - rocm-docs-core

 jobs:
 - job: rocm_cmake
@@ -34,29 +33,21 @@ jobs:
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
       pipModules: ${{ parameters.pipModules }}
-  - task: Bash@3
-    displayName: Add CMake to PATH
-    inputs:
-      targetType: inline
-      script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
     parameters:
       checkoutRepo: ${{ parameters.checkoutRepo }}
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  - task: Bash@3
-    displayName: CTest setup
-    inputs:
-      targetType: inline
-      script: |
-        python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
-        python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
-        git config --global user.email "you@example.com"
-        git config --global user.name "Your Name"
+  # extra steps for ctest suite
+  - script: |
+      python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
+      python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
+      git config --global user.email "you@example.com"
+      git config --global user.name "Your Name"
+    displayName: "ctest setup"
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
     parameters:
       componentName: rocm-cmake
       testParameters: '-E "pass-version-parent" -VV --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

@@ -65,3 +56,4 @@ jobs:
       aptPackages: ${{ parameters.aptPackages }}
       pipModules: ${{ parameters.pipModules }}
       environment: combined
+      gpuTarget: $(JOB_GPU_TARGET)
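The large rocm_cmake hunk above replaces an inline Bash@3 task with the `script` step shortcut. On Linux agents the two forms below run the same inline command; the shortcut simply drops the explicit task/inputs wrapper. The pip command is illustrative.

```yaml
steps:
# Long form: explicit Bash task with an inline script.
- task: Bash@3
  displayName: ctest setup (long form)
  inputs:
    targetType: inline
    script: python -m pip install -r docs/requirements.txt
# Shortcut: same effect on a Linux agent.
- script: python -m pip install -r docs/requirements.txt
  displayName: ctest setup (shortcut)
```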
@@ -75,8 +75,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
@@ -10,7 +10,6 @@ parameters:
   default:
     - cmake
     - libdrm-dev
     - pkg-config
     - python3-pip

 jobs:

@@ -40,6 +39,7 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
+      gpuTarget: $(JOB_GPU_TARGET)

 - job: rocm_smi_lib_testing
   dependsOn: rocm_smi_lib
@@ -59,8 +59,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -57,8 +57,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -72,8 +72,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -57,8 +57,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:

@@ -69,6 +69,7 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
+      gpuTarget: $(JOB_GPU_TARGET)

 - job: rocr_debug_agent_testing
   dependsOn: rocr_debug_agent
@@ -11,7 +11,6 @@ parameters:
   - cmake
   - doxygen
   - graphviz
-  - libdrm-amdgpu-dev
   - ninja-build
   - python3-pip
 - name: pipModules
@@ -50,14 +49,11 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
       pipModules: ${{ parameters.pipModules }}
-      registerROCmPackages: true
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
     parameters:

@@ -89,7 +85,6 @@ jobs:
       aptPackages: ${{ parameters.aptPackages }}
       pipModules: ${{ parameters.pipModules }}
       gpuTarget: $(JOB_GPU_TARGET)
-      registerROCmPackages: true

 - job: roctracer_testing
   dependsOn: roctracer

@@ -109,8 +104,6 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
       aptPackages: ${{ parameters.aptPackages }}
       pipModules: ${{ parameters.pipModules }}
-      registerROCmPackages: true
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
     parameters:

@@ -135,4 +128,3 @@ jobs:
       pipModules: ${{ parameters.pipModules }}
       environment: test
       gpuTarget: $(JOB_GPU_TARGET)
-      registerROCmPackages: true
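Several roctracer hunks above delete `registerROCmPackages: true` from calls into dependencies-other.yml. A parameter like this is typically a boolean that gates a repository-registration step at template-expansion time; the sketch below shows that shape, with a hypothetical template body rather than the real one.

```yaml
# hypothetical steps template showing a boolean gate
parameters:
- name: registerROCmPackages
  type: boolean
  default: false

steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
  - script: echo "add the repo.radeon.com apt repository here"
    displayName: register ROCm packages
- script: echo "install remaining dependencies"
  displayName: install dependencies
```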
@@ -57,8 +57,6 @@ jobs:
     matrix:
       gfx942:
         JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
   steps:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
     parameters:
@@ -1,29 +0,0 @@
-variables:
-- group: common
-- template: /.azuredevops/variables-global.yml
-
-parameters:
-- name: checkoutRef
-  type: string
-  default: refs/tags/$(LATEST_RELEASE_TAG)
-
-resources:
-  repositories:
-  - repository: pipelines_repo
-    type: github
-    endpoint: ROCm
-    name: ROCm/ROCm
-  - repository: release_repo
-    type: github
-    endpoint: ROCm
-    name: ROCm/TransferBench
-    ref: ${{ parameters.checkoutRef }}
-
-trigger: none
-pr: none
-
-jobs:
-- template: ${{ variables.CI_COMPONENT_PATH }}/TransferBench.yml
-  parameters:
-    checkoutRepo: release_repo
-    checkoutRef: ${{ parameters.checkoutRef }}
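The deleted file above is a standalone release pipeline: it pins ROCm/TransferBench to refs/tags/$(LATEST_RELEASE_TAG) through a repository resource and forwards that ref into the component template. The resource-pinning pattern in isolation, with an illustrative tag literal in place of the variable:

```yaml
resources:
  repositories:
  - repository: release_repo        # alias used by checkout
    type: github
    endpoint: ROCm                  # service connection name (from the deleted file)
    name: ROCm/TransferBench
    ref: refs/tags/rocm-6.3.2       # illustrative tag literal

steps:
- checkout: release_repo            # fetches the repo at the pinned tag
```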
@@ -222,13 +222,13 @@ parameters:
     hasGpuTarget: false
   rocm-examples:
     pipelineId: $(ROCM_EXAMPLES_PIPELINE_ID)
-    stagingBranch: amd-staging
-    mainlineBranch: amd-mainline
+    stagingBranch: develop
+    mainlineBranch: develop
     hasGpuTarget: true
   rocminfo:
     pipelineId: $(ROCMINFO_PIPELINE_ID)
     stagingBranch: amd-staging
-    mainlineBranch: amd-mainline
+    mainlineBranch: amd-master
     hasGpuTarget: false
   rocMLIR:
     pipelineId: $(ROCMLIR_PIPELINE_ID)

@@ -262,7 +262,7 @@ parameters:
     hasGpuTarget: true
   rocprofiler-compute:
     pipelineId: $(ROCPROFILER_COMPUTE_PIPELINE_ID)
-    stagingBranch: develop
+    stagingBranch: amd-staging
     mainlineBranch: amd-mainline
     hasGpuTarget: true
   rocprofiler-register:
@@ -33,6 +33,7 @@ parameters:
   - aomp
   - HIPIFY
   - MIVisionX
   - rocm-cmake
   - rocm_smi_lib
   - rocprofiler-sdk
   - roctracer
@@ -28,13 +28,13 @@ variables:
 - name: GFX942_TEST_POOL
   value: gfx942_test_pool
 - name: LATEST_RELEASE_VERSION
-  value: 6.3.3
+  value: 6.3.2
 - name: REPO_RADEON_VERSION
-  value: 6.3.3
+  value: 6.3.2
 - name: NEXT_RELEASE_VERSION
   value: 6.4.0
 - name: LATEST_RELEASE_TAG
-  value: rocm-6.3.3
+  value: rocm-6.3.2
 - name: AMDMIGRAPHX_GFX942_TEST_PIPELINE_ID
   value: 197
 - name: AMDMIGRAPHX_PIPELINE_ID
@@ -156,6 +156,7 @@ HCA
 HGX
 HIPCC
+HIPExtension
 HIPification
 HIPIFY
 HIPification
 HIPify
@@ -241,6 +242,7 @@ Makefile
 Makefiles
 Matplotlib
 Matrox
+MaxText
 Megatrends
 Megatron
 Mellanox
@@ -380,6 +382,7 @@ SIMDs
 SKU
 SKUs
 SLES
+SLURM
 SMEM
 SMI
 SMT
@@ -439,6 +442,7 @@ UMC
 USM
 UTCL
 UTIL
+UltraChat
 Uncached
 Unittests
 Unhandled
@@ -572,6 +576,7 @@ distro
 distros
 dkms
+dtype
 eb
 el
 embeddings
 enablement
@@ -625,6 +630,7 @@ hipify
 hipsolver
 hipsparse
 hlist
 hostname
 hotspotting
 hpc
 hpp
@@ -66,10 +66,11 @@ project-specific steps. Refer to each repository's PR process for any additional
   during our release cycle, as coordinated by the maintainer
 * We'll inform you once your change is committed

-> [!IMPORTANT]
-> By creating a PR, you agree to allow your contribution to be licensed under the
-> terms of the LICENSE.txt file in the corresponding repository. Different repositories may use different
-> licenses.
+:::{important}
+By creating a PR, you agree to allow your contribution to be licensed under the
+terms of the LICENSE.txt file in the corresponding repository. Different repositories may use different
+licenses.
+:::

 You can look up each license on the [ROCm licensing](https://rocm.docs.amd.com/en/latest/about/license.html) page.
@@ -116,7 +116,7 @@ bash install-prerequisites.sh
 # For ubuntu22.04 system
 cd ROCm/tools/rocm-build/docker/ubuntu22
 cp * /tmp && cd /tmp
-bash install-prerequisites.sh
+bash install-prerequisities.sh
 # For ubuntu24.04 system
 cd ROCm/tools/rocm-build/docker/ubuntu24
 cp * /tmp && cd /tmp
@@ -406,15 +406,15 @@ issues related to individual components, review the [Detailed component changes]

 ### Zero value is displayed in ROCTx aggregated statistics

-The ROCTx markers are standalone markers within the ROCProfiler-SDK library. Each marker reports only a single timestamp, which is recorded as the `start_timestamp` and `end_timestamp`. As a result, the value for aggregated statistics presented in `TotalDurationNs`, `maxNs`, and `minNs`, is zero. The zero value indicates that the actual execution time is not associated with the markers, which is an expected behavior. See [GitHub issue #4396](https://github.com/ROCm/ROCm/issues/4396).
+The ROCTx markers are standalone markers within the ROCprofiler-SDK library. Each marker reports only a single timestamp, which is recorded as the `start_timestamp` and `end_timestamp`. As a result, the value for aggregated statistics presented in `TotalDurationNs`, `maxNs`, and `minNs`, is zero. The zero value indicates that the actual execution time is not associated with the markers, which is an expected behavior. See [GitHub issue #4396](https://github.com/ROCm/ROCm/issues/4396).

 ## ROCm upcoming changes

 The following changes to the ROCm software stack are anticipated for future releases.

-### ROCTracer and ROCProfiler (rocprof and rocprofv2) deprecation
+### ROCTracer, ROCProfiler, rocprof, and rocprofv2 deprecation

-Development and support for ROCTracer and ROCProfiler (`rocprof` and `rocprofv2`) will phase out in favor of ROCprofiler-SDK (`rocprofv3`) in upcoming ROCm releases. Going forward, only critical defect fixes will be addressed for older versions of profiling tools and libraries. Upgrade to the latest version of ROCprofiler-SDK (`rocprofv3`) library to ensure continued support and access to new features.
+Development and support for ROCTracer, ROCProfiler, `rocprof`, and `rocprofv2` will phase out in favor of ROCprofiler-SDK (`rocprofv3`) in upcoming ROCm releases. Going forward, only critical defect fixes will be addressed for older versions of profiling tools and libraries. Upgrade to the latest version of ROCprofiler-SDK library and `rocprofv3` tool to ensure continued support and access to new features.

 ### AMDGPU wavefront size compiler macro deprecation
@@ -25,7 +25,7 @@ additional licenses. Please review individual repositories for more information.
 <!-- spellcheck-disable -->
 | Component | License |
 |:---------------------|:-------------------------|
-| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/LICENCE) |
+| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/LICENSE.txt) |
 | [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
 | [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
 | [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |

@@ -77,7 +77,7 @@ additional licenses. Please review individual repositories for more information.
 | [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
 | [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE) |
 | [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
-| [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE) |
+| [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
 | [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
 | [ROCr Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
 | [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/amd-staging/LICENSE.txt) |
@@ -1,120 +1,120 @@
|
||||
ROCm Version,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
|
||||
,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,
|
||||
,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
|
||||
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
Thrust,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,
|
||||
KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
Tested user space versions,"6.3.x, 6.2.x, 6.1.x","6.3.x, 6.2.x, 6.1.x","6.3.x, 6.2.x, 6.1.x","6.3.x, 6.2.x, 6.1.x","6.3.x, 6.2.x, 6.1.x, 6.0.x","6.3.x, 6.2.x, 6.1.x, 6.0.x","6.3.x, 6.2.x, 6.1.x, 6.0.x","6.3.x, 6.2.x, 6.1.x, 6.0.x","6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
,,,,,,,,,,,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocAL <rocal:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`rocDecode <rocdecode:index>`,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`rocJPEG <rocjpeg:index>`,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipfort <hipfort:index>`,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
ROCm Version,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
|
||||
,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
|
||||
,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
|
||||
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
|
||||
Thrust,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,
|
||||
KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
|
||||
Tested user space versions,"6.3.x, 6.2.x, 6.1.x","6.3.x, 6.2.x, 6.1.x","6.3.x, 6.2.x, 6.1.x","6.3.x, 6.2.x, 6.1.x","6.3.x, 6.2.x, 6.1.x, 6.0.x","6.3.x, 6.2.x, 6.1.x, 6.0.x","6.3.x, 6.2.x, 6.1.x, 6.0.x","6.3.x, 6.2.x, 6.1.x, 6.0.x","6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
,,,,,,,,,,,,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
:doc:`rocAL <rocal:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`rocDecode <rocdecode:index>`,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
:doc:`rocJPEG <rocjpeg:index>`,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`rocPyDecode <rocpydecode:index>`,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`RPP <rpp:index>`,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
,,,,,,,,,,,,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
:doc:`RCCL <rccl:index>`,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
,,,,,,,,,,,,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
:doc:`hipfort <hipfort:index>`,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
:doc:`rocBLAS <rocblas:index>`,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
:doc:`rocRAND <rocrand:index>`,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
:doc:`Tensile <tensile:src/index>`,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
,,,,,,,,,,,,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
:doc:`hipCUB <hipcub:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
,,,,,,,,,,,,,,
SUPPORT LIBS,,,,,,,,,,,,,,
`hipother <https://github.com/ROCm/hipother>`_,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
,,,,,,,,,,,,,,
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
:doc:`AMD SMI <amdsmi:index>`,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
,,,,,,,,,,,,,,
PERFORMANCE TOOLS,,,,,,,,,,,,,,
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCTracer <roctracer:index>`,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
,,,,,,,,,,,,,,
DEVELOPMENT TOOLS,,,,,,,,,,,,,,
:doc:`HIPIFY <hipify:index>`,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
,,,,,,,,,,,,,,
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
`Flang <https://github.com/ROCm/flang>`_,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`llvm-project <llvm-project:index>`,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
,,,,,,,,,,,,,,
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,
:doc:`AMD CLR <hip:understand/amd_clr>`,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
:doc:`HIP <hip:index>`,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
:doc:`ROCr Runtime <rocr-runtime:index>`,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
@@ -152,7 +152,7 @@ compatibility and system requirements.
.. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
.. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
.. [#kfd_support] ROCm provides forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#kfd_support] ROCm provides forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.

.. _OS-kernel-versions:
@@ -228,5 +228,5 @@ Expand for full historical view of:
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#kfd_support-past-60] ROCm provides forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#kfd_support-past-60] ROCm provides forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
@@ -811,7 +811,7 @@ authoring, fine-tuning and experimenting with LLMs.
torchserve
--------------------------------------------------------------------------------

The `torchserve <https://pytorch.org/torchserve/>`_ is a PyTorch domain library
The `torchserve <https://pytorch.org/serve/>`_ is a PyTorch domain library
for serving and scaling PyTorch models in production.

@@ -72,7 +72,7 @@ the |docker-icon| icon to view the image on Docker Hub.

<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.17-dev/images/sha256-fd2653f436880366cc874aa24264ca9dabd892d76ccb63fb807debba459bcaaf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.17.0-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
@@ -81,7 +81,7 @@ the |docker-icon| icon to view the image on Docker Hub.

<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.17-dev/images/sha256-8a5eb7443798935dd269575e2abae847b702e1dfb06766ab84f081a6314d8b95"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- `tensorflow-rocm 2.17.0 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.17.0-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- dev
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
@@ -90,7 +90,7 @@ the |docker-icon| icon to view the image on Docker Hub.

<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.12-tf2.16-dev/images/sha256-8fc939b10cdd6d2b11407474880d4c8ab2b52ab6e2d1743c921fc2adbfd0422f"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
@@ -99,7 +99,7 @@ the |docker-icon| icon to view the image on Docker Hub.

<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.16-dev/images/sha256-a4cc6ab23d59fdf5459ceac1f0a603e6c16ae7f885d30e42c0c2b3ac60c2ad10"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- dev
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
@@ -108,7 +108,7 @@ the |docker-icon| icon to view the image on Docker Hub.

<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.3.3-py3.10-tf2.15-dev/images/sha256-60887c488421184adcb60b9ed4f72a8bd7bdb64d238e50943ca7cbde38e4aa48"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
- `tensorflow-rocm 2.15.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3.3/tensorflow_rocm-2.15.1-cp310-cp310-manylinux_2_28_x86_64.whl>`_
- dev
- `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `TensorBoard 2.15.2 <https://github.com/tensorflow/tensorboard/tree/2.15.2>`_

916
docs/compatibility/pytorch-compatibility.rst
Normal file
@@ -0,0 +1,916 @@

.. meta::
   :description: PyTorch compatibility
   :keywords: GPU, PyTorch compatibility

********************************************************************************
PyTorch compatibility
********************************************************************************

`PyTorch <https://pytorch.org/>`_ is an open-source tensor library designed for
deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
using `MIOpen <https://github.com/ROCm/MIOpen>`_ and
`RCCL <https://github.com/ROCm/rccl>`_ libraries.

ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due to independent
compatibility considerations, this results in two distinct release cycles for PyTorch on ROCm:

- ROCm PyTorch release:

  - Provides the latest version of ROCm but doesn't immediately support the latest stable PyTorch
    version.

  - Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
    pre-installed.

  - ROCm PyTorch repository: `<https://github.com/rocm/pytorch>`__

  - See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>` to get started.

- Official PyTorch release:

  - Provides the latest stable version of PyTorch but doesn't immediately support the latest ROCm version.

  - Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__

  - See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`_
    or `Previous versions <https://pytorch.org/get-started/previous-versions/>`_ to get started.

Upstream PyTorch includes an automatic HIPification solution that generates HIP
source code from the CUDA backend. This approach allows PyTorch to support ROCm
without requiring manual code modifications.

ROCm's development is aligned with the stable release of PyTorch, while upstream PyTorch testing uses
the stable release of ROCm to maintain consistency.

.. _pytorch-docker-compat:

Docker image compatibility
================================================================================

AMD validates and publishes ready-made `PyTorch <https://hub.docker.com/r/rocm/pytorch>`_
images with ROCm backends on Docker Hub. The following Docker image tags and
associated inventories are validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.

.. list-table:: PyTorch Docker image components
   :header-rows: 1
   :class: docker-image-compatibility

   * - Docker
     - PyTorch
     - Ubuntu
     - Python
     - Apex
     - torchvision
     - TensorBoard
     - MAGMA
     - UCX
     - OMPI
     - OFED

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0/images/sha256-98ddf20333bd01ff749b8092b1190ee369a75d3b8c71c2fac80ffdcb1a98d529?context=explore"><i class="fab fa-docker fa-lg"></i></a>

     - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
     - 24.04
     - `3.12 <https://www.python.org/downloads/release/python-3128/>`_
     - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
     - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
     - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
     - `master <https://bitbucket.org/icl/magma/src/master/>`_
     - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
     - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
     - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.4.0/images/sha256-402c9b4f1a6b5a81c634a1932b56cbe01abb699cfcc7463d226276997c6cf8ea?context=explore"><i class="fab fa-docker fa-lg"></i></a>

     - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
     - 22.04
     - `3.10 <https://www.python.org/downloads/release/python-31016/>`_
     - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
     - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
     - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
     - `master <https://bitbucket.org/icl/magma/src/master/>`_
     - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
     - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
     - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_2.4.0/images/sha256-e0608b55d408c3bfe5c19fdd57a4ced3e0eb3a495b74c309980b60b156c526dd?context=explore"><i class="fab fa-docker fa-lg"></i></a>

     - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
     - 22.04
     - `3.9 <https://www.python.org/downloads/release/python-3918/>`_
     - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
     - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
     - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
     - `master <https://bitbucket.org/icl/magma/src/master/>`_
     - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
     - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
     - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-652cf25263d05b1de548222970aeb76e60b12de101de66751264709c0d0ff9d8?context=explore"><i class="fab fa-docker fa-lg"></i></a>

     - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
     - 22.04
     - `3.10 <https://www.python.org/downloads/release/python-31016/>`_
     - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
     - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
     - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
     - `master <https://bitbucket.org/icl/magma/src/master/>`_
     - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
     - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
     - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.10_pytorch_release_2.2.1/images/sha256-051976f26beab8f9aa65d999e3ad546c027b39240a0cc3ee81b114a9024f2912?context=explore"><i class="fab fa-docker fa-lg"></i></a>

     - `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
     - 22.04
     - `3.10 <https://www.python.org/downloads/release/python-31016/>`_
     - `1.2.0 <https://github.com/ROCm/apex/tree/release/1.2.0>`_
     - `0.17.1 <https://github.com/pytorch/vision/tree/v0.17.1>`_
     - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
     - `master <https://bitbucket.org/icl/magma/src/master/>`_
     - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
     - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
     - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_2.2.1/images/sha256-88c839a364d109d3748c100385bfa100d28090d25118cc723fd0406390ab2f7e?context=explore"><i class="fab fa-docker fa-lg"></i></a>

     - `2.2.1 <https://github.com/ROCm/pytorch/tree/release/2.2>`_
     - 20.04
     - `3.9 <https://www.python.org/downloads/release/python-3921/>`_
     - `1.2.0 <https://github.com/ROCm/apex/tree/release/1.2.0>`_
     - `0.17.1 <https://github.com/pytorch/vision/tree/v0.17.1>`_
     - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
     - `master <https://bitbucket.org/icl/magma/src/master/>`_
     - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
     - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
     - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu22.04_py3.9_pytorch_release_1.13.1/images/sha256-994424ed07a63113f79dd9aa72159124c00f5fbfe18127151e6658f7d0b6f821?context=explore"><i class="fab fa-docker fa-lg"></i></a>

     - `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
     - 22.04
     - `3.9 <https://www.python.org/downloads/release/python-3921/>`_
     - `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
     - `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
     - `master <https://bitbucket.org/icl/magma/src/master/>`_
     - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
     - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
     - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.3_ubuntu20.04_py3.9_pytorch_release_1.13.1/images/sha256-7b8139fe40a9aeb4bca3aecd15c22c1fa96e867d93479fa3a24fdeeeeafa1219?context=explore"><i class="fab fa-docker fa-lg"></i></a>

     - `1.13.1 <https://github.com/ROCm/pytorch/tree/release/1.13>`_
     - 20.04
     - `3.9 <https://www.python.org/downloads/release/python-3921/>`_
     - `1.0.0 <https://github.com/ROCm/apex/tree/release/1.0.0>`_
     - `0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18>`_
     - `master <https://bitbucket.org/icl/magma/src/master/>`_
     - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
     - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
     - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_

Critical ROCm libraries for PyTorch
================================================================================

The functionality of PyTorch with ROCm is shaped by its underlying library
dependencies. These critical ROCm components affect the capabilities,
performance, and feature set available to developers.

.. list-table::
   :header-rows: 1

   * - ROCm library
     - Version
     - Purpose
     - Used in
   * - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
     - 1.1.0
     - Enables faster execution of core operations like matrix multiplication
       (GEMM), convolutions and transformations.
     - Speeds up ``torch.permute``, ``torch.view``, ``torch.matmul``,
       ``torch.mm``, ``torch.bmm``, ``torch.nn.Conv2d``, ``torch.nn.Conv3d``
       and ``torch.nn.MultiheadAttention``.
   * - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
     - 2.3.0
     - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
       matrix and vector operations.
     - Supports operations like matrix multiplication, matrix-vector products,
       and tensor contractions. Utilized in both dense and batched linear
       algebra operations.
   * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
     - 0.10.0
     - hipBLASLt is an extension of the hipBLAS library, providing additional
       features like epilogues fused into the matrix multiplication kernel or
       use of integer tensor cores.
     - It accelerates operations like ``torch.matmul``, ``torch.mm``, and the
       matrix multiplications used in convolutional and linear layers.
   * - `hipCUB <https://github.com/ROCm/hipCUB>`_
     - 3.3.0
     - Provides a C++ template library for parallel algorithms for reduction,
       scan, sort and select.
     - Supports operations like ``torch.sum``, ``torch.cumsum``, ``torch.sort``
       and ``torch.topk``. Operations on sparse tensors or tensors with
       irregular shapes often involve scanning, sorting, and filtering, which
       hipCUB handles efficiently.
   * - `hipFFT <https://github.com/ROCm/hipFFT>`_
     - 1.0.17
     - Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
     - Used in functions like the ``torch.fft`` module.
   * - `hipRAND <https://github.com/ROCm/hipRAND>`_
     - 2.11.0
     - Provides fast random number generation for GPUs.
     - The ``torch.rand``, ``torch.randn`` and stochastic layers like
       ``torch.nn.Dropout``.
   * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
     - 2.3.0
     - Provides GPU-accelerated solvers for linear systems, eigenvalues, and
       singular value decompositions (SVD).
     - Supports functions like ``torch.linalg.solve``,
       ``torch.linalg.eig``, and ``torch.linalg.svd``.
   * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
     - 3.1.2
     - Accelerates operations on sparse matrices, such as sparse matrix-vector
       or matrix-matrix products.
     - Sparse tensor operations ``torch.sparse``.
   * - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
     - 0.2.2
     - Accelerates operations on sparse matrices, such as sparse matrix-vector
       or matrix-matrix products.
     - Sparse tensor operations ``torch.sparse``.
   * - `hipTensor <https://github.com/ROCm/hipTensor>`_
     - 1.4.0
     - Optimizes for high-performance tensor operations, such as contractions.
     - Accelerates tensor algebra, especially in deep learning and scientific
       computing.
   * - `MIOpen <https://github.com/ROCm/MIOpen>`_
     - 3.3.0
     - Optimizes deep learning primitives such as convolutions, pooling,
       normalization, and activation functions.
     - Speeds up convolutional neural networks (CNNs), recurrent neural
       networks (RNNs), and other layers. Used in operations like
       ``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
   * - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
     - 2.11.0
     - Adds graph-level optimizations, ONNX model and mixed precision support,
       and enables Ahead-of-Time (AOT) compilation.
     - Speeds up inference models and executes ONNX models for
       compatibility with other frameworks.
   * - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
     - 3.1.0
     - Optimizes acceleration for computer vision and AI workloads like
       preprocessing, augmentation, and inferencing.
     - Faster data preprocessing and augmentation pipelines for datasets like
       ImageNet or COCO and easy to integrate into PyTorch's ``torch.utils.data``
       and ``torchvision`` workflows.
   * - `rocAL <https://github.com/ROCm/rocAL>`_
     - 2.1.0
     - Accelerates the data pipeline by offloading intensive preprocessing and
       augmentation tasks. rocAL is part of MIVisionX.
     - Easy to integrate into PyTorch's ``torch.utils.data`` and
       ``torchvision`` data load workloads.
   * - `RCCL <https://github.com/ROCm/rccl>`_
     - 2.21.5
     - Optimizes for multi-GPU communication for operations like AllReduce and
       Broadcast.
     - Distributed data parallel training (``torch.nn.parallel.DistributedDataParallel``).
       Handles communication in multi-GPU setups.
   * - `rocDecode <https://github.com/ROCm/rocDecode>`_
     - 0.8.0
     - Provides hardware-accelerated data decoding capabilities, particularly
       for image, video, and other dataset formats.
     - Can be integrated in ``torch.utils.data``, ``torchvision.transforms``
       and ``torch.distributed``.
   * - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
     - 0.6.0
     - Provides hardware-accelerated JPEG image decoding and encoding.
     - GPU accelerated ``torchvision.io.decode_jpeg`` and
       ``torchvision.io.encode_jpeg`` and can be integrated in
       ``torch.utils.data`` and ``torchvision``.
   * - `RPP <https://github.com/ROCm/RPP>`_
     - 1.9.1
     - Speeds up data augmentation, transformation, and other preprocessing steps.
     - Easy to integrate into PyTorch's ``torch.utils.data`` and
       ``torchvision`` data load workloads.
   * - `rocThrust <https://github.com/ROCm/rocThrust>`_
     - 3.3.0
     - Provides a C++ template library for parallel algorithms like sorting,
       reduction, and scanning.
     - Utilized in backend operations for tensor computations requiring
       parallel processing.
   * - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
     - 1.6.0
     - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
       multiplication (GEMM) and accumulation operations with mixed precision
       support.
     - Linear layers (``torch.nn.Linear``), convolutional layers
       (``torch.nn.Conv2d``), attention layers, general tensor operations that
       involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
       more.
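
Most of these libraries are exercised implicitly whenever tensors live on the
GPU. As an illustrative check (a minimal sketch assuming a working ROCm build
of PyTorch, not part of the validated matrix), the following touches several of
them through their usual PyTorch entry points:

.. code-block:: python

   import torch

   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

   x = torch.randn(8, 3, 224, 224, device=device)    # hipRAND backs randn

   conv = torch.nn.Conv2d(3, 16, kernel_size=3, padding=1).to(device)
   y = conv(x)                                       # MIOpen backs Conv2d

   a = torch.randn(128, 256, device=device)
   b = torch.randn(256, 64, device=device)
   c = torch.matmul(a, b)                            # hipBLAS/hipBLASLt back matmul

   print(y.shape, c.shape)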

Supported and unsupported features
================================================================================

The following section maps GPU-accelerated PyTorch features to their supported
ROCm and PyTorch versions.

torch
--------------------------------------------------------------------------------

`torch <https://pytorch.org/docs/stable/index.html>`_ is the central module of
PyTorch, providing data structures for multi-dimensional tensors and
implementing mathematical operations on them. It also includes utilities for
efficient serialization of tensors and arbitrary data types, along with various
other tools.

Tensor data types
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The data type of a tensor is specified using the ``dtype`` attribute or argument, and PyTorch supports a wide range of data types for different use cases.

The following table lists the single data types of `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`_:

.. list-table::
   :header-rows: 1

   * - Data type
     - Description
     - Since PyTorch
     - Since ROCm
   * - ``torch.float8_e4m3fn``
     - 8-bit floating point, e4m3
     - 2.3
     - 5.5
   * - ``torch.float8_e5m2``
     - 8-bit floating point, e5m2
     - 2.3
     - 5.5
   * - ``torch.float16`` or ``torch.half``
     - 16-bit floating point
     - 0.1.6
     - 2.0
   * - ``torch.bfloat16``
     - 16-bit floating point
     - 1.6
     - 2.6
   * - ``torch.float32`` or ``torch.float``
     - 32-bit floating point
     - 0.1.12_2
     - 2.0
   * - ``torch.float64`` or ``torch.double``
     - 64-bit floating point
     - 0.1.12_2
     - 2.0
   * - ``torch.complex32`` or ``torch.chalf``
     - PyTorch provides native support for 32-bit complex numbers
     - 1.6
     - 2.0
   * - ``torch.complex64`` or ``torch.cfloat``
     - PyTorch provides native support for 64-bit complex numbers
     - 1.6
     - 2.0
   * - ``torch.complex128`` or ``torch.cdouble``
     - PyTorch provides native support for 128-bit complex numbers
     - 1.6
     - 2.0
   * - ``torch.uint8``
     - 8-bit integer (unsigned)
     - 0.1.12_2
     - 2.0
   * - ``torch.uint16``
     - 16-bit integer (unsigned)
     - 2.3
     - Not natively supported
   * - ``torch.uint32``
     - 32-bit integer (unsigned)
     - 2.3
     - Not natively supported
   * - ``torch.uint64``
     - 64-bit integer (unsigned)
     - 2.3
     - Not natively supported
   * - ``torch.int8``
     - 8-bit integer (signed)
     - 1.12
     - 5.0
   * - ``torch.int16`` or ``torch.short``
     - 16-bit integer (signed)
     - 0.1.12_2
     - 2.0
   * - ``torch.int32`` or ``torch.int``
     - 32-bit integer (signed)
     - 0.1.12_2
     - 2.0
   * - ``torch.int64`` or ``torch.long``
     - 64-bit integer (signed)
     - 0.1.12_2
     - 2.0
   * - ``torch.bool``
     - Boolean
     - 1.2
     - 2.0
   * - ``torch.quint8``
     - Quantized 8-bit integer (unsigned)
     - 1.8
     - 5.0
   * - ``torch.qint8``
     - Quantized 8-bit integer (signed)
     - 1.8
     - 5.0
   * - ``torch.qint32``
     - Quantized 32-bit integer (signed)
     - 1.8
     - 5.0
   * - ``torch.quint4x2``
     - Quantized 4-bit integer (unsigned)
     - 1.8
     - 5.0

.. note::

   Unsigned types aside from ``uint8`` currently have only limited support in
   eager mode (they primarily exist to assist usage with ``torch.compile``).

The :doc:`ROCm precision support page <rocm:reference/precision-support>`
collects the native hardware support for the different data types.
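
As a brief illustration (a minimal sketch that runs on any device), the
``dtype`` argument selects a type at creation time, and ``Tensor.to`` converts
an existing tensor afterwards:

.. code-block:: python

   import torch

   a = torch.zeros(4, dtype=torch.bfloat16)   # choose the dtype at creation
   b = torch.arange(4, dtype=torch.int32)
   c = b.to(torch.float64)                    # convert an existing tensor
   print(a.dtype, b.dtype, c.dtype)           # torch.bfloat16 torch.int32 torch.float64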

torch.cuda
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``torch.cuda`` in PyTorch is a module that provides utilities and functions for
managing and utilizing AMD and NVIDIA GPUs. It enables GPU-accelerated
computations, memory management, and efficient execution of tensor operations,
leveraging ROCm and CUDA as the underlying frameworks.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - Since PyTorch
     - Since ROCm
   * - Device management
     - Utilities for managing and interacting with GPUs.
     - 0.4.0
     - 3.8
   * - Tensor operations on GPU
     - Perform tensor operations such as addition and matrix multiplications on
       the GPU.
     - 0.4.0
     - 3.8
   * - Streams and events
     - Streams allow overlapping computation and communication for optimized
       performance, events enable synchronization.
     - 1.6.0
     - 3.8
   * - Memory management
     - Functions to manage and inspect memory usage like
       ``torch.cuda.memory_allocated()``, ``torch.cuda.max_memory_allocated()``,
       ``torch.cuda.memory_reserved()`` and ``torch.cuda.empty_cache()``.
     - 0.3.0
     - 1.9.2
   * - Running process lists of memory management
     - Return a human-readable printout of the running processes and their GPU
       memory use for a given device with functions like
       ``torch.cuda.memory_stats()`` and ``torch.cuda.memory_summary()``.
     - 1.8.0
     - 4.0
   * - Communication collectives
     - A set of APIs that enable efficient communication between multiple GPUs,
       allowing for distributed computing and data parallelism.
     - 1.9.0
     - 5.0
   * - ``torch.cuda.CUDAGraph``
     - Graphs capture sequences of GPU operations to minimize kernel launch
       overhead and improve performance.
     - 1.10.0
     - 5.3
   * - TunableOp
     - A mechanism that allows certain operations to be more flexible and
       optimized for performance. It enables automatic tuning of kernel
       configurations and other settings to achieve the best possible
       performance based on the specific hardware (GPU) and workload.
     - 2.0
     - 5.4
   * - NVIDIA Tools Extension (NVTX)
     - Integration with NVTX for profiling and debugging GPU performance using
       NVIDIA's Nsight tools.
     - 1.8.0
     - ❌
   * - Lazy loading NVRTC
     - Delays JIT compilation with NVRTC until the code is explicitly needed.
     - 1.13.0
     - ❌
   * - Jiterator (beta)
     - Jiterator lets you author element-wise kernels as Python strings that
       are JIT-compiled at runtime.
     - 1.13.0
     - 5.2

.. Need to validate and extend.
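
Because ROCm builds of PyTorch retain the ``torch.cuda`` namespace, the same
calls drive AMD GPUs unchanged. A minimal sketch of the device-management and
memory-inspection utilities listed above, assuming at least one visible GPU:

.. code-block:: python

   import torch

   if torch.cuda.is_available():
       print(torch.cuda.device_count(), "GPU(s); current:",
             torch.cuda.get_device_name(0))

       x = torch.ones(1024, 1024, device="cuda")   # tensor op on the GPU
       y = x @ x

       torch.cuda.synchronize()                    # wait for queued work
       print("allocated bytes:", torch.cuda.memory_allocated())
       torch.cuda.empty_cache()                    # release cached blocks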

torch.backends.cuda
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``torch.backends.cuda`` is a PyTorch module that provides configuration options
and flags to control the behavior of CUDA or ROCm operations. It is part of the
PyTorch backend configuration system, which allows users to fine-tune how
PyTorch interacts with the CUDA or ROCm environment.

.. list-table::
   :header-rows: 1

   * - Option
     - Description
     - Since PyTorch
     - Since ROCm
   * - ``cufft_plan_cache``
     - Manages caching of GPU FFT plans to optimize repeated FFT computations.
     - 1.7.0
     - 5.0
   * - ``matmul.allow_tf32``
     - Enables or disables the use of TensorFloat-32 (TF32) precision for
       faster matrix multiplications on GPUs with Tensor Cores.
     - 1.10.0
     - ❌
   * - ``matmul.allow_fp16_reduced_precision_reduction``
     - Reduced precision reductions (e.g., with fp16 accumulation type) are
       allowed with fp16 GEMMs.
     - 2.0
     - ❌
   * - ``matmul.allow_bf16_reduced_precision_reduction``
     - Reduced precision reductions are allowed with bf16 GEMMs.
     - 2.0
     - ❌
   * - ``enable_cudnn_sdp``
     - Globally enables cuDNN SDPA's kernels within SDPA.
     - 2.0
     - ❌
   * - ``enable_flash_sdp``
     - Globally enables or disables FlashAttention for SDPA.
     - 2.1
     - ❌
   * - ``enable_mem_efficient_sdp``
     - Globally enables or disables Memory-Efficient Attention for SDPA.
     - 2.1
     - ❌
   * - ``enable_math_sdp``
     - Globally enables or disables the PyTorch C++ implementation within SDPA.
     - 2.1
     - ❌

.. Need to validate and extend.
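
These knobs are plain module-level attributes and functions. A short sketch of
the configuration pattern, assuming a visible GPU (the SDPA toggle is listed as
unsupported on ROCm above and is shown only to illustrate the call shape):

.. code-block:: python

   import torch

   # Inspect the per-device FFT plan cache (the FFT backend on ROCm is hipFFT)
   print(torch.backends.cuda.cufft_plan_cache[0].max_size)

   # SDPA backend switches follow the same pattern; per the table above,
   # these particular toggles are not supported on ROCm
   torch.backends.cuda.enable_math_sdp(True)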

torch.backends.cudnn
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Supported ``torch`` options:

.. list-table::
   :header-rows: 1

   * - Option
     - Description
     - Since PyTorch
     - Since ROCm
   * - ``allow_tf32``
     - TensorFloat-32 tensor cores may be used in cuDNN convolutions on NVIDIA
       Ampere or newer GPUs.
     - 1.12.0
     - ❌
   * - ``deterministic``
     - A bool that, if True, causes cuDNN to only use deterministic
       convolution algorithms.
     - 1.12.0
     - 6.0
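
The option is a plain boolean on the module; a minimal sketch (on ROCm the same
switch steers MIOpen's convolution algorithm selection):

.. code-block:: python

   import torch

   torch.backends.cudnn.deterministic = True   # restrict to deterministic conv algorithms
   print(torch.backends.cudnn.deterministic)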

Automatic mixed precision: torch.amp
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``torch.amp`` is the PyTorch feature that automates the process of using both
16-bit (half-precision, float16) and 32-bit (single-precision, float32)
floating-point types in model training and inference.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - Since PyTorch
     - Since ROCm
   * - Autocasting
     - Instances of autocast serve as context managers or decorators that allow
       regions of your script to run in mixed precision.
     - 1.9
     - 2.5
   * - Gradient scaling
     - To prevent underflow, “gradient scaling” multiplies the network’s
       loss(es) by a scale factor and invokes a backward pass on the scaled
       loss(es). Gradients flowing backward through the network are then
       scaled by the same factor. In other words, gradient values have a
       larger magnitude, so they don’t flush to zero.
     - 1.9
     - 2.5
   * - CUDA op-specific behavior
     - These ops always go through autocasting whether they are invoked as part
       of a ``torch.nn.Module``, as a function, or as a ``torch.Tensor`` method. If
       functions are exposed in multiple namespaces, they go through
       autocasting regardless of the namespace.
     - 1.9
     - 2.5
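
The two pieces are typically combined in the training loop: ``autocast`` picks
the precision per op and ``GradScaler`` guards the backward pass. A minimal
sketch, assuming a single GPU:

.. code-block:: python

   import torch

   model = torch.nn.Linear(512, 512).cuda()
   optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
   scaler = torch.cuda.amp.GradScaler()

   data = torch.randn(64, 512, device="cuda")
   target = torch.randn(64, 512, device="cuda")

   with torch.cuda.amp.autocast():            # run the region in mixed precision
       loss = torch.nn.functional.mse_loss(model(data), target)

   scaler.scale(loss).backward()              # scale the loss to avoid underflow
   scaler.step(optimizer)                     # unscales gradients, then steps
   scaler.update()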

Distributed library features
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The PyTorch distributed library includes a collection of parallelism modules, a
communications layer, and infrastructure for launching and debugging large
training jobs. See :ref:`rocm-for-ai-pytorch-distributed` for more information.

The Distributed Library feature in PyTorch provides tools and APIs for building
and running distributed machine learning workflows. It allows training models
across multiple processes, GPUs, or nodes in a cluster, enabling efficient use
of computational resources and scalability for large-scale tasks.

.. list-table::
   :header-rows: 1

   * - Features
     - Description
     - Since PyTorch
     - Since ROCm
   * - TensorPipe
     - TensorPipe is a point-to-point communication library integrated into
       PyTorch for distributed training. It is designed to handle tensor data
       transfers efficiently between different processes or devices, including
       those on separate machines.
     - 1.8
     - 5.4
   * - Gloo
     - Gloo is designed for multi-machine and multi-GPU setups, enabling
       efficient communication and synchronization between processes. Gloo is
       one of the default backends for PyTorch's Distributed Data Parallel
       (DDP) and RPC frameworks, alongside other backends like NCCL and MPI.
     - 1.0
     - 2.0
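
A minimal single-process sketch of bringing up a process group with the Gloo
backend (launchers such as ``torchrun`` normally set the rendezvous variables;
on ROCm, requesting the ``nccl`` backend maps to RCCL):

.. code-block:: python

   import os
   import torch.distributed as dist

   os.environ.setdefault("MASTER_ADDR", "127.0.0.1")   # rendezvous endpoint
   os.environ.setdefault("MASTER_PORT", "29500")

   dist.init_process_group(backend="gloo", rank=0, world_size=1)
   print("initialized:", dist.is_initialized())
   dist.destroy_process_group()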

torch.compiler
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. list-table::
   :header-rows: 1

   * - Features
     - Description
     - Since PyTorch
     - Since ROCm
   * - ``torch.compiler`` (AOT Autograd)
     - Autograd captures not only the user-level code, but also backpropagation,
       which results in capturing the backwards pass “ahead-of-time”. This
       enables acceleration of both forwards and backwards pass using
       ``TorchInductor``.
     - 2.0
     - 5.3
   * - ``torch.compiler`` (TorchInductor)
     - The default ``torch.compile`` deep learning compiler that generates fast
       code for multiple accelerators and backends. You need to use a backend
       compiler to make speedups through ``torch.compile`` possible. For AMD,
       NVIDIA, and Intel GPUs, it leverages OpenAI Triton as the key building block.
     - 2.0
     - 5.3
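
Compilation is a one-line wrapper around a function or module; a minimal
sketch, with TorchInductor picked as the default backend:

.. code-block:: python

   import torch

   def f(x):
       return torch.sin(x) ** 2 + torch.cos(x) ** 2

   compiled_f = torch.compile(f)   # TorchInductor is the default backend
   x = torch.randn(1024, device="cuda" if torch.cuda.is_available() else "cpu")
   print(compiled_f(x).sum())      # the first call triggers compilation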

torchaudio
--------------------------------------------------------------------------------

The `torchaudio <https://pytorch.org/audio/stable/index.html>`_ library provides
utilities for processing audio data in PyTorch, such as audio loading,
transformations, and feature extraction.

To ensure GPU-acceleration with ``torchaudio.transforms``, you need to move audio
data (waveform tensor) explicitly to GPU using ``.to('cuda')``.

The following ``torchaudio`` features are GPU-accelerated.

.. list-table::
   :header-rows: 1

   * - Features
     - Description
     - Since torchaudio version
     - Since ROCm
   * - ``torchaudio.transforms.Spectrogram``
     - Generates a spectrogram of an input waveform using STFT.
     - 0.6.0
     - 4.5
   * - ``torchaudio.transforms.MelSpectrogram``
     - Generates the mel-scale spectrogram of raw audio signals.
     - 0.9.0
     - 4.5
   * - ``torchaudio.transforms.MFCC``
     - Extracts MFCC features.
     - 0.9.0
     - 4.5
   * - ``torchaudio.transforms.Resample``
     - Resamples a signal from one frequency to another.
     - 0.9.0
     - 4.5
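
A minimal sketch of this pattern, on synthetic audio so it runs without a
dataset:

.. code-block:: python

   import torch
   import torchaudio.transforms as T

   device = "cuda" if torch.cuda.is_available() else "cpu"
   waveform = torch.randn(1, 16000)             # one second of noise at 16 kHz

   transform = T.MelSpectrogram(sample_rate=16000).to(device)
   mel = transform(waveform.to(device))         # the transform runs on the GPU
   print(mel.shape)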

torchvision
--------------------------------------------------------------------------------

The `torchvision <https://pytorch.org/vision/stable/index.html>`_ library
provides datasets, model architectures, and common image transformations for
computer vision.

The following ``torchvision`` features are GPU-accelerated.

.. list-table::
   :header-rows: 1

   * - Features
     - Description
     - Since torchvision version
     - Since ROCm
   * - ``torchvision.transforms.functional``
     - Provides GPU-compatible transformations for image preprocessing like
       resize, normalize, rotate and crop.
     - 0.2.0
     - 4.0
   * - ``torchvision.ops``
     - GPU-accelerated operations for object detection and segmentation tasks.
       ``torchvision.ops.roi_align``, ``torchvision.ops.nms`` and
       ``box_convert``.
     - 0.6.0
     - 3.3
   * - ``torchvision.models`` with ``.to('cuda')``
     - ``torchvision`` provides several pre-trained models (ResNet, Faster
       R-CNN, Mask R-CNN, ...) that can run on CUDA for faster inference and
       training.
     - 0.1.6
     - 2.x
   * - ``torchvision.io``
     - Video decoding and frame extraction using GPU acceleration with NVIDIA’s
       NVDEC and nvJPEG (rocJPEG) on CUDA-enabled GPUs.
     - 0.4.0
     - 6.3
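
A minimal inference sketch following the ``.to('cuda')`` pattern above
(``weights=None`` keeps the example offline; recent torchvision releases select
pre-trained weights through the ``weights`` argument):

.. code-block:: python

   import torch
   import torchvision

   device = "cuda" if torch.cuda.is_available() else "cpu"
   model = torchvision.models.resnet18(weights=None).to(device).eval()

   images = torch.randn(8, 3, 224, 224, device=device)
   with torch.no_grad():
       preds = model(images).argmax(dim=1)   # inference runs on the GPU
   print(preds.shape)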

torchtext
--------------------------------------------------------------------------------

The `torchtext <https://pytorch.org/text/stable/index.html>`_ library provides
utilities for processing and working with text data in PyTorch, including
tokenization, vocabulary management, and text embeddings. torchtext supports
preprocessing pipelines and integration with PyTorch models, simplifying the
implementation of natural language processing (NLP) tasks.

To leverage GPU acceleration in torchtext, you need to move tensors
explicitly to the GPU using ``.to('cuda')``.

* torchtext does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.

* Only the official release exists.

torchtune
--------------------------------------------------------------------------------

`torchtune <https://pytorch.org/torchtune/stable/index.html>`_ is a library for
authoring, fine-tuning and experimenting with LLMs.

* Usage: It works out-of-the-box, enabling developers to fine-tune ROCm PyTorch solutions.

* Only the official release exists.

torchserve
--------------------------------------------------------------------------------

`torchserve <https://pytorch.org/serve/>`_ is a tool for serving and scaling
PyTorch models in production.

* torchserve does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.

* Only the official release exists.

torchrec
--------------------------------------------------------------------------------

`torchrec <https://pytorch.org/torchrec/>`_ is a PyTorch domain library for
common sparsity and parallelism primitives needed for large-scale recommender
systems.

* torchrec does not implement its own kernels. ROCm support is enabled by linking against ROCm libraries.

* Only the official release exists.

Unsupported PyTorch features
--------------------------------------------------------------------------------

The following are GPU-accelerated PyTorch features not currently supported by ROCm.

.. list-table::
   :widths: 30, 60, 10
   :header-rows: 1

   * - Feature
     - Description
     - Since PyTorch
   * - APEX batch norm
     - Use APEX batch norm instead of PyTorch batch norm.
     - 1.6.0
   * - ``torch.backends.cuda`` / ``matmul.allow_tf32``
     - A bool that controls whether TensorFloat-32 tensor cores may be used in
       matrix multiplications.
     - 1.7
   * - ``torch.cuda`` / NVIDIA Tools Extension (NVTX)
     - Integration with NVTX for profiling and debugging GPU performance using
       NVIDIA's Nsight tools.
     - 1.7.0
   * - ``torch.cuda`` / Lazy loading NVRTC
     - Delays JIT compilation with NVRTC until the code is explicitly needed.
     - 1.8.0
   * - ``torch-tensorrt``
     - Integrates the TensorRT library for optimizing and deploying PyTorch models.
       ROCm does not have an equivalent library for TensorRT.
     - 1.9.0
   * - ``torch.backends`` / ``cudnn.allow_tf32``
     - TensorFloat-32 tensor cores may be used in cuDNN convolutions.
     - 1.10.0
   * - ``torch.backends.cuda`` / ``matmul.allow_fp16_reduced_precision_reduction``
     - Reduced precision reductions with fp16 accumulation type are
       allowed with fp16 GEMMs.
     - 2.0
   * - ``torch.backends.cuda`` / ``matmul.allow_bf16_reduced_precision_reduction``
     - Reduced precision reductions are allowed with bf16 GEMMs.
     - 2.0
   * - ``torch.nn.functional`` / ``scaled_dot_product_attention``
     - Flash attention backend for SDPA to accelerate attention computation in
       transformer-based models.
     - 2.0
   * - ``torch.backends.cuda`` / ``enable_cudnn_sdp``
     - Globally enables cuDNN SDPA's kernels within SDPA.
     - 2.0
   * - ``torch.backends.cuda`` / ``enable_flash_sdp``
     - Globally enables or disables FlashAttention for SDPA.
     - 2.1
   * - ``torch.backends.cuda`` / ``enable_mem_efficient_sdp``
     - Globally enables or disables Memory-Efficient Attention for SDPA.
     - 2.1
   * - ``torch.backends.cuda`` / ``enable_math_sdp``
     - Globally enables or disables the PyTorch C++ implementation within SDPA.
     - 2.1
   * - Dynamic parallelism
     - PyTorch itself does not directly expose dynamic parallelism as a core
       feature. Dynamic parallelism allows GPU threads to launch additional
       threads, which can be reached using custom operations via the
       ``torch.utils.cpp_extension`` module.
     - Not a core feature
   * - Unified memory support in PyTorch
     - Unified Memory is not directly exposed in PyTorch's core API, but it can
       be utilized effectively through custom CUDA extensions or advanced
       workflows.
     - Not a core feature

Use cases and recommendations
================================================================================

* :doc:`Using ROCm for AI: training a model </how-to/rocm-for-ai/train-a-model>` provides
  guidance on how to leverage the ROCm platform for training AI models. It covers the steps, tools, and best practices
  for optimizing training workflows on AMD GPUs using PyTorch features.

* :doc:`Single-GPU fine-tuning and inference </how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference>`
  describes and demonstrates how to use the ROCm platform for the fine-tuning and inference of
  machine learning models, particularly large language models (LLMs), on systems with a single AMD
  Instinct MI300X accelerator. This page provides a detailed guide for setting up, optimizing, and
  executing fine-tuning and inference workflows in such environments.

* :doc:`Multi-GPU fine-tuning and inference optimization </how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference>`
  describes and demonstrates the fine-tuning and inference of machine learning models on systems
  with multiple MI300X accelerators.

* The :doc:`Instinct MI300X workload optimization guide </how-to/tuning-guides/mi300x/workload>` provides detailed
  guidance on optimizing workloads for the AMD Instinct MI300X accelerator using ROCm. This guide is aimed at helping
  users achieve optimal performance for deep learning and other high-performance computing tasks on the MI300X
  accelerator.

* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
  describes how PyTorch integrates with ROCm for AI workloads. It outlines the use of PyTorch on the ROCm platform and
  focuses on how to efficiently leverage AMD GPU hardware for training and inference tasks in AI applications.

For more use cases and recommendations, see `ROCm PyTorch blog posts <https://rocm.blogs.amd.com/blog/tag/pytorch.html>`_.
@@ -1,63 +0,0 @@
|
||||
.. meta::
|
||||
:description: Input-Output Memory Management Unit (IOMMU)
|
||||
:keywords: IOMMU, DMA, PCIe, xGMI, AMD, ROCm
|
||||
|
||||
****************************************************************
|
||||
Input-Output Memory Management Unit (IOMMU)
|
||||
****************************************************************
|
||||
|
||||
The I/O Memory Management Unit (IOMMU) provides memory remapping services for I/O devices. It adds support for address translation and system memory access protection on direct memory access (DMA) transfers from peripheral devices.
|
||||
|
||||
The IOMMU's memory remapping services:
|
||||
|
||||
* provide private I/O space for devices used in a guest virtual machine.
|
||||
* prevent unauthorized DMA requests to system memory and to memory-mapped I/O (MMIO).
|
||||
* help in debugging memory access issues.
|
||||
* facilitate peer-to-peer DMA.
|
||||
|
||||
The IOMMU also provides interrupt remapping, which is used by devices that support multiple interrupts and for interrupt delivery on hardware platforms with a large number of cores.
|
||||
|
||||
.. note::
|
||||
|
||||
AMD Instinct accelerators are connected via XGMI links and don't use PCI/PCIe for peer-to-peer DMA. Because PCI/PCIe is not used for peer-to-peer DMA, there are no device physical addressing limitations or platform root port limitations. However, because non-GPU devices such as RDMA NICs use PCIe for peer-to-peer DMA, there might still be physical addressing and platform root port limitations when these non-GPU devices interact with other devices, including GPUs.
|
||||
|
||||
Linux supports IOMMU in both virtualized environments and bare metal.
|
||||
|
||||
The IOMMU is enabled by default but can be disabled or put into passthrough mode through the Linux kernel command line:

.. list-table::
   :widths: 15 15 70
   :header-rows: 1

   * - IOMMU mode
     - Kernel command
     - Description
   * - Enabled
     - Default setting
     - Recommended for AMD Radeon GPUs that need peer-to-peer DMA.

       The IOMMU is enabled in remapping mode. Each device gets its own I/O virtual address space. All devices on Linux register their DMA addressing capabilities, and the kernel ensures that any address space mapped for DMA is within the device's DMA addressing limits. Only address space explicitly mapped by the devices is mapped into virtual address space. Attempts to access an unmapped page generate an IOMMU page fault.
   * - Passthrough
     - ``iommu=pt``
     - Recommended for AMD Instinct accelerators and for AMD Radeon GPUs that don't need peer-to-peer DMA.

       Interrupt remapping is enabled but I/O remapping is disabled. The entire platform shares a common platform address space for system memory and MMIO spaces, ensuring compatibility with drivers from external vendors, while still supporting CPUs with a large number of cores.
   * - Disabled
     - ``iommu=off``
     - Not recommended.

       The IOMMU is disabled and the entire platform shares a common platform address space for system memory and MMIO spaces.

       This mode should only be used with older Linux distributions whose kernels are not configured to support peer-to-peer DMA with an IOMMU. In these cases, the IOMMU needs to be disabled to use peer-to-peer DMA.
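For example, a minimal sketch of enabling passthrough mode on a GRUB-based distribution (an assumption; adjust the paths and commands for your bootloader and distribution):

.. code-block:: shell

   # Append iommu=pt to the default kernel command line, then regenerate the GRUB config
   sudo sed -i 's/^GRUB_CMDLINE_LINUX_DEFAULT="/&iommu=pt /' /etc/default/grub
   sudo update-grub    # or: grub2-mkconfig -o /boot/grub2/grub.cfg on RPM-based systems
   sudo reboot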
The IOMMU also provides virtualized access to the MMIO portions of the platform address space for peer-to-peer DMA.

Because peer-to-peer DMA is not officially part of the PCI/PCIe specification, its behavior varies between hardware platforms.

AMD CPUs earlier than AMD Zen only supported peer-to-peer DMA for writes. On AMD Zen CPUs and later, peer-to-peer DMA is fully supported.

To use peer-to-peer DMA on Linux, enable the following options in your Linux kernel configuration:

* ``CONFIG_PCI_P2PDMA``
* ``CONFIG_DMABUF_MOVE_NOTIFY``
* ``CONFIG_HSA_AMD_P2P``
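A minimal sketch for verifying these options against the running kernel (the config file location is an assumption that holds on most distributions):

.. code-block:: shell

   # Each option should report =y or =m in the kernel config
   grep -E "CONFIG_PCI_P2PDMA|CONFIG_DMABUF_MOVE_NOTIFY|CONFIG_HSA_AMD_P2P" /boot/config-"$(uname -r)"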
@@ -1,34 +0,0 @@

.. meta::
   :description: Learn what causes oversubscription.
   :keywords: warning, log, gpu, performance penalty, help

*******************************************************************
Oversubscription of hardware resources in AMD Instinct accelerators
*******************************************************************

When an AMD Instinct™ MI series accelerator enters an oversubscribed state, the ``amdgpu`` driver outputs the following
message:

``amdgpu: Runlist is getting oversubscribed. Expect reduced ROCm performance.``

Oversubscription occurs when application demands exceed the available hardware resources. In an oversubscribed
state, the hardware scheduler tries to manage resource usage in a round-robin fashion. However,
this can result in reduced performance, as resources might be occupied by applications or queues not actively
submitting work. The granularity of hardware resources occupied by an inactive queue can be on the order of
milliseconds, during which the accelerator or GPU is effectively blocked and unable to process work submitted by other
queues.

What triggers oversubscription?
===============================

The system enters an oversubscribed state when one of the following conditions is met:

* **Hardware queue limit exceeded**: The number of user-mode compute queues requested by applications exceeds the
  hardware limit of 24 queues for current Instinct accelerators.

* **Virtual memory context slots exceeded**: The number of user processes exceeds the number of available virtual memory
  context slots, which is 11 for current Instinct accelerators.

* **Multiple processes using cooperative workgroups**: More than one process attempts to use the cooperative workgroup
  feature, leading to resource contention.
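To check whether a workload has triggered this state, search the kernel log for the warning quoted above (a minimal sketch):

.. code-block:: shell

   # Look for the amdgpu oversubscription warning in the kernel ring buffer
   sudo dmesg | grep -i "Runlist is getting oversubscribed"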
@@ -1,57 +0,0 @@

.. meta::
   :description: How ROCm uses PCIe atomics
   :keywords: PCIe, PCIe atomics, atomics, Atomic operations, AMD, ROCm

*****************************************************************************
How ROCm uses PCIe atomics
*****************************************************************************

AMD ROCm is an extension of the Heterogeneous System Architecture (HSA). To meet the requirements of an HSA-compliant system, ROCm supports queuing models, memory models, and signaling and synchronization protocols. ROCm can perform atomic Read-Modify-Write (RMW) transactions that extend inter-processor synchronization mechanisms to Input/Output (I/O) devices starting from Peripheral Component Interconnect Express 3.0 (PCIe™ 3.0). It supports the defined HSA capabilities for queuing and signaling memory operations. To learn more about the requirements of an HSA-compliant system, see the
`HSA Platform System Architecture Specification <http://hsafoundation.com/wp-content/uploads/2021/02/HSA-SysArch-1.2.pdf>`_.

ROCm uses platform atomics to perform memory operations like queuing, signaling, and synchronization across multiple CPU agents, GPU agents, and I/O devices. Platform atomics ensure that atomic operations run synchronously, without interruptions or conflicts, across multiple shared resources.

Platform atomics in ROCm
==============================

Platform atomics enable the set of atomic operations that perform RMW actions across multiple processors, devices, and memory locations so that they run synchronously without interruption. An atomic operation is a sequence of computing instructions run as a single, indivisible unit. These instructions are completed in their entirety without any interruptions. If the instructions can't be completed as a unit without interruption, none of the instructions are run. These operations support 32-bit and 64-bit address formats.

Some of the operations for which ROCm uses platform atomics are:

* Updating the HSA queue's ``read_dispatch_id``. The command processor on the GPU agent uses a 64-bit atomic add operation to update the packet ID it processed.
* Updating the HSA queue's ``write_dispatch_id``. The CPU and GPU agents use a 64-bit atomic add operation, which supports multi-writer queue insertions.
* Updating HSA signals. A 64-bit atomic operation is used for CPU and GPU synchronization.

PCIe for atomic operations
----------------------------

ROCm requires CPUs that support PCIe atomics. Similarly, all connected I/O devices should also support PCIe atomics for optimum compatibility. PCIe supports the ``CAS`` (Compare and Swap), ``FetchADD``, and ``SWAP`` atomic operations across multiple resources. These atomic operations are initiated by I/O devices and support 32-bit, 64-bit, and 128-bit operands. The target memory address where these atomic operations are performed must be aligned to the size of the operand. This alignment ensures that the operations are performed efficiently and correctly without failure.

When an atomic operation is successful, the requester receives a completion response along with the operation result. Any errors associated with the operation are signaled to the requester by updating the Completion Status field, which is part of the Completion Descriptor. Issues accessing the target location or running the atomic operation are common errors. Depending on the error, the Completion Status field is updated to Completer Abort (CA) or Unsupported Request (UR).

To learn more about the industry standards and specifications of PCIe, see the `PCI-SIG Specification <https://pcisig.com/specifications>`_.

To learn more about PCIe and its capabilities, consult the following white papers:

* `Atomic Read Modify Write Primitives by Intel <https://www.intel.es/content/dam/doc/white-paper/atomic-read-modify-write-primitives-i-o-devices-paper.pdf>`_
* `PCI Express 3 Accelerator White paper by Intel <https://www.intel.sg/content/dam/doc/white-paper/pci-express3-accelerator-white-paper.pdf>`_
* `PCIe Generation 4 Base Specification includes atomic operations <https://astralvx.com/storage/2020/11/PCI_Express_Base_4.0_Rev0.3_February19-2014.pdf>`_
* `Xilinx PCIe Ultrascale White paper <https://docs.xilinx.com/v/u/8OZSA2V1b1LLU2rRCDVGQw>`_

Working with PCIe 3.0 in ROCm
-------------------------------

Starting with PCIe 3.0, atomic operations can be requested, routed through, and completed by PCIe components. Routing and completion do not require software support. Component support for each can be identified by the Device Capabilities 2 (DevCap2) register. Upstream
bridges need to have atomic operations routing enabled. If not enabled, the atomic operations will fail even if the
PCIe endpoint and PCIe I/O devices can perform atomic operations.

If your system uses PCIe switches to connect and enable communication between multiple PCIe components, the switches must also support atomic operations routing.

To enable atomic operations routing between multiple root ports, each root port must support atomic operation routing. This capability is indicated by the atomic operations routing support bit in the DevCap2 register: if the bit has a value of 1, routing is supported. Atomic operation requests are permitted only if a component's ``DEVCTL2.ATOMICOP_REQUESTER_ENABLE``
field is set. These requests can only be serviced if the upstream components also support atomic operation completion or if the requests can be routed to a component that supports atomic operation completion.
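As a minimal sketch (the bus ID ``03:00.0`` is a placeholder for a device in your system), you can inspect a device's advertised AtomicOp capability and control bits with ``lspci``:

.. code-block:: shell

   # Print the AtomicOpsCap (DevCap2) and AtomicOpsCtl (DevCtl2) fields; requires root
   sudo lspci -s 03:00.0 -vvv | grep -i "atomicop"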
ROCm uses the PCIe-ID-based ordering technology for peer-to-peer (P2P) data transmission. PCIe-ID-based ordering technology is used when the GPU initiates multiple write operations to different memory locations.

For more information on changes implemented in PCIe 3.0, see `Overview of Changes to PCI Express 3.0 <https://www.mindshare.com/files/resources/PCIe%203-0.pdf>`_.

@@ -1,11 +1,12 @@
vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9
      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640
      rocm_version: 6.3.1
      vllm_version: 0.6.6
      pytorch_version: 2.7.0 (2.7.0a0+git3a58512)
      vllm_version: 0.7.3
      pytorch_version: 2.7.0 (dev nightly)
      hipblaslt_version: 0.13
  model_groups:
    - group: Llama
      tag: llama

@@ -40,6 +41,11 @@ vllm_benchmark:
      model_repo: meta-llama/Llama-2-70b-chat-hf
      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
      precision: float16
    - model: Llama 3.1 8B FP8
      mad_tag: pyt_vllm_llama-3.1-8b_fp8
      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
      precision: float8
    - model: Llama 3.1 70B FP8
      mad_tag: pyt_vllm_llama-3.1-70b_fp8
      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
@@ -23,7 +23,7 @@ There are two ways to handle this:

* Ensure that the high MMIO aperture is within the physical addressing limits of the devices in the system. For example, if the devices have a 44-bit physical addressing limit, set the ``MMIO High Base`` and ``MMIO High size`` options in the BIOS such that the aperture is within the 44-bit address range, and ensure that the ``Above 4G Decoding`` option is Enabled.

* Enable the Input-Output Memory Management Unit (IOMMU). When the IOMMU is enabled in non-passthrough mode, it will create a virtual I/O address space for each device on the system. It also ensures that all virtual addresses created in that space are within the physical addressing limits of the device. For more information on IOMMU, see :doc:`../conceptual/iommu`.
* Enable the Input-Output Memory Management Unit (IOMMU). When the IOMMU is enabled in non-passthrough mode, it will create a virtual I/O address space for each device on the system. It also ensures that all virtual addresses created in that space are within the physical addressing limits of the device. For more information on IOMMU, see `Input-Output Memory Management Unit (IOMMU) <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/conceptual/iommu.html>`_.

.. _bar-configuration:
27
docs/how-to/gpu-performance/mi300x.rst
Normal file
@@ -0,0 +1,27 @@

.. meta::
   :description: How to configure MI300X accelerators to fully leverage their capabilities and achieve optimal performance.
   :keywords: ROCm, AI, machine learning, MI300X, LLM, usage, tutorial, optimization, tuning

**************************************
AMD Instinct MI300X performance guides
**************************************

The following performance guides provide essential guidance on the necessary
steps to properly :doc:`configure your system for AMD Instinct™ MI300X
accelerators <../system-optimization/mi300x>`. They include detailed
instructions on system settings and application :doc:`workload tuning
<../rocm-for-ai/inference-optimization/workload>` to help you
leverage the maximum capabilities of these accelerators and achieve superior
performance.

* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
  covers essential system settings and system management practices to configure
  your AMD Instinct MI300X system for performance.

* :doc:`../rocm-for-ai/inference-optimization/workload` covers steps to
  optimize the performance of AMD Instinct MI300X series accelerators for HPC
  and deep learning operations.

* :doc:`../rocm-for-ai/inference/vllm-benchmark` introduces a preconfigured
  environment for LLM inference, designed to help you test performance with
  popular models on AMD Instinct MI300X series accelerators.
@@ -284,7 +284,7 @@ Installing FBGEMM_GPU consists of the following steps:

.. note::

   FBGEMM_GPU doesn't require the installation of FBGEMM. To optionally install
   FBGEMM, see the `FBGEMM install instructions <https://pytorch.org/FBGEMM/fbgemm-development/BuildInstructions.html>`_.
   FBGEMM, see the `FBGEMM install instructions <https://pytorch.org/FBGEMM/fbgemm/development/BuildInstructions.html>`_.

Set up the Miniconda environment
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -564,7 +564,7 @@ vLLM engine arguments
---------------------

The following are configuration suggestions to potentially improve performance with vLLM. See
`vLLM's engine arguments documentation <https://docs.vllm.ai/en/stable/models/engine_args.html>`_
`vLLM's engine arguments documentation <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_
for a full list of configurable engine arguments.

Configure the max-num-seqs parameter
@@ -945,9 +945,9 @@ for details.

.. code-block:: shell

   export HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r \
       f16_r --a_type f16_r --b_type f8_r --compute_type f32_f16_r \
       --initialization trig_float --cold_iters 100 -i 1000 --rotating 256
   HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
       --a_type f16_r --b_type f8_r --compute_type f32_f16_r \
       --initialization trig_float --cold_iters 100 --iters 1000 --rotating 256

* Example 2: Benchmark forward epilogues and backward epilogues
@@ -1705,12 +1705,12 @@ efficiency and throughput of various computational kernels.

   Occupancy related to VGPRs usage on an Instinct MI300X accelerator

   For example, according to the table, the available VGPR is 512 per Execution
   Unit (EU), and VGPR is allocated in units of 16. If the current VGPR usage
   is 170, the actual requested VGPR will be 176, so the occupancy is only 2
   waves per EU since :math:`176 \times 3 > 512`. So, if you set
   ``waves_per_eu`` to 3, the LLVM backend tries to bring VGPR usage down so
   that it might fit 3 waves per EU.
   For example, according to the table, each Execution Unit (EU) has 512 available
   VGPRs, which are allocated in blocks of 16. If the current VGPR usage is 170,
   it will be rounded up to 176 due to the allocation granularity. In this case,
   the occupancy is limited to 2 waves per EU because :math:`176 \times 3 > 512`.
   So, if you set ``waves_per_eu`` to 3, the LLVM backend will attempt to reduce
   VGPR usage so that it might fit 3 waves per EU.
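   A worked restatement of the rounding described above:

   .. math::

      \lceil 170 / 16 \rceil \times 16 = 176, \qquad \lfloor 512 / 176 \rfloor = 2 \text{ waves per EU}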
``BLOCK_M``, ``BLOCK_N``, ``BLOCK_K``
   Tile sizes to be tuned to balance the memory-to-computation ratio. The goal
@@ -47,7 +47,7 @@ Validating vLLM performance

ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
on the MI300X accelerator. The Docker image includes ROCm, vLLM, PyTorch, and tuning files in the CSV
format. For more information, see the guide to
`LLM inference performance validation with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_
`LLM inference performance testing with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_
on the ROCm GitHub repository.

.. _rocm-for-ai-serve-hugging-face-tgi:
@@ -20,6 +20,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram

- :doc:`LLM inference frameworks <llm-inference-frameworks>`

- :doc:`Performance validation <vllm-benchmark>`
- :doc:`Performance testing <vllm-benchmark>`

- :doc:`Deploying your model <deploy-your-model>`
@@ -28,7 +28,7 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install

* :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`

* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/install-methods/amdgpu-installer-index>`

* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`.
@@ -140,8 +140,8 @@ Installing vLLM

See :ref:`mi300x-vllm-optimization` for performance optimization tips.

ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
on the MI300X accelerator. The Docker image includes ROCm, vLLM, PyTorch, and tuning files in CSV
format. For more information, see :doc:`vllm-benchmark`.
on the MI300X accelerator. The Docker image includes ROCm, vLLM, and PyTorch.
For more information, see :doc:`vllm-benchmark`.

.. _fine-tuning-llms-tgi:
@@ -3,9 +3,9 @@

   ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

***********************************************************
LLM inference performance validation on AMD Instinct MI300X
***********************************************************
********************************************************
LLM inference performance testing on AMD Instinct MI300X
********************************************************

.. _vllm-benchmark-unified-docker:

@@ -16,9 +16,9 @@ LLM inference performance validation on AMD Instinct MI300X

The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
accelerator and includes the following components:
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:

* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
@@ -26,9 +26,11 @@ LLM inference performance validation on AMD Instinct MI300X

* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_

With this Docker image, you can quickly validate the expected inference
performance numbers for the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models.
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.

.. _vllm-benchmark-available-models:
@@ -79,7 +81,6 @@ LLM inference performance validation on AMD Instinct MI300X

{% endfor %}
{% endfor %}

.. note::

   vLLM is a toolkit and library for LLM inference and serving. AMD implements

@@ -87,6 +88,29 @@ LLM inference performance validation on AMD Instinct MI300X

   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.

.. _vllm-benchmark-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

Advanced features and known issues
==================================

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/8ad1c446b31c9a944d57215ec081ea781b18a0e5/docs/dev-docker>`__.

Getting started
===============

@@ -162,13 +186,13 @@ LLM inference performance validation on AMD Instinct MI300X

   .. tab-item:: Standalone benchmarking

      Run the vLLM benchmark tool independently by starting the
      `Docker container <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
      `Docker container <{{ unified_docker.docker_hub_url }}>`_
      as shown in the following snippet.

      .. code-block::

         docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
         docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
         docker pull {{ unified_docker.pull_tag }}
         docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}

      In the Docker container, clone the ROCm MAD repository and navigate to the
      benchmark scripts directory at ``~/MAD/scripts/vllm``.
@@ -280,7 +304,7 @@ Further reading

  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see :doc:`../../system-optimization/mi300x`.
  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Running models from Hugging Face <hugging-face-models>`.
@@ -290,3 +314,40 @@ Further reading

- To learn how to fine-tune LLMs, see
  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.

Previous versions
=================

This table lists previous versions of the ROCm vLLM inference Docker image for
inference performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation.

.. list-table::
   :header-rows: 1
   :stub-columns: 1

   * - ROCm version
     - vLLM version
     - PyTorch version
     - Resources
   * - 6.3.1
     - 0.6.6
     - 2.7.0
     -
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
   * - 6.2.1
     - 0.6.4
     - 2.5.0
     -
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
   * - 6.2.0
     - 0.4.3
     - 2.4.0
     -
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
@@ -0,0 +1,345 @@

.. meta::
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker

**************************************
Training a model with MaxText for ROCm
**************************************

MaxText is a high-performance, open-source framework built on the Google JAX
machine learning library to train LLMs at scale. The MaxText framework for
ROCm is an optimized fork of the upstream `<https://github.com/AI-Hypercomputer/maxtext>`__
repository, enabling efficient AI workloads on AMD MI300X series accelerators.

The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.4``) image
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
It includes the following software components:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
| JAX                      | 0.4.31                         |
+--------------------------+--------------------------------+
| Python                   | 3.10                           |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.12.0.dev0+f81a3eb            |
+--------------------------+--------------------------------+
| hipBLASLt                | git78ec8622                    |
+--------------------------+--------------------------------+
Supported features and models
=============================

MaxText provides the following key features to train large language models efficiently:

- Transformer Engine (TE)

- Flash Attention (FA) 3

- GEMM tuning

- Multi-node support

.. _amd-maxtext-model-support:

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

* Llama 3.1 8B

* Llama 3.1 70B

* Llama 3 8B

* Llama 3 70B

* Llama 2 7B

* Llama 2 70B

* DeepSeek-V2-Lite

.. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

Unsupported features
--------------------

Currently, MaxText's default packed input format is not supported. Using this format
with the current Docker image results in incorrect attention calculations
across different input sequences. Support for packed input format is planned for a future release.
System validation
=================

If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.

Environment setup
=================

This Docker image is optimized for the specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
.. _amd-maxtext-multi-node-setup:

Multi-node setup
----------------

For multi-node environments, ensure you have all the necessary packages for
your network device, such as RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.

1. Install the following packages to build and install the RDMA driver.

   .. code-block:: shell

      sudo apt install iproute2 -y
      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev

   Refer to your NIC manufacturer's documentation for further steps on
   compiling and installing the RoCE driver. For example, for Broadcom,
   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
2. Set the following environment variables.

   a. Master address

      Change ``localhost`` to the master node's resolvable hostname or IP address:

      .. code-block:: bash

         export MASTER_ADDR="${MASTER_ADDR:-localhost}"

   b. Number of nodes

      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):

      .. code-block:: bash

         export NNODES="${NNODES:-1}"

   c. Node ranks

      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
      Node ranks should be unique across all nodes in the cluster.

      .. code-block:: bash

         export NODE_RANK="${NODE_RANK:-0}"

   d. Network interface

      Update the network interface in the script to match your system's network interface. To
      find your network interface, run the following (outside of any Docker container):

      .. code-block:: bash

         ip a

      Look for an active interface with an IP address in the same subnet as
      your other nodes. Then, update the following variable in the script, for
      example:

      .. code-block:: bash

         export NCCL_SOCKET_IFNAME=ens50f0np0

      This variable specifies which network interface to use for inter-node communication.
      Setting this variable to the incorrect interface can result in communication failures
      or significantly reduced performance.

   e. RDMA interface

      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
      Then, set the RDMA interfaces to use for communication.

      .. code-block:: bash

         # If using a Broadcom NIC
         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
         # If using a Mellanox NIC
         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
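To confirm that the RDMA devices named above are visible on each node, a quick check (``ibv_devices`` is provided by the ``ibverbs-utils`` package installed earlier):

.. code-block:: bash

   # List the RDMA devices visible to the verbs stack on this node
   ibv_devices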
.. _amd-maxtext-download-docker:

Download the Docker image
-------------------------

1. Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/jax-training:maxtext-v25.4

2. Run the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.4

.. _amd-maxtext-get-started:

Getting started
===============

The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.

.. important::

   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.

Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.
Single node training benchmarking examples
------------------------------------------

* Example 1: Single node training with Llama 2 7B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_7b.sh

* Example 2: Single node training with Llama 2 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_70b.sh

* Example 3: Single node training with Llama 3 8B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_8b.sh

* Example 4: Single node training with Llama 3 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_70b.sh

* Example 5: Single node training with DeepSeek V2 16B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./deepseek_v2_16b.sh

.. note::

   The TFLOP/s reported by MaxText for DeepSeek is not accurate. Use
   tokens/s as the performance indicator.
Multi-node training benchmarking examples
-----------------------------------------

The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
own cluster setup.

* Example 1: Multi-node training with Llama 2 7B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama2_7b_multinode.sh

* Example 2: Multi-node training with Llama 2 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama2_70b_multinode.sh

* Example 3: Multi-node training with the Llama 3 8B model

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama3_8b_multinode.sh

* Example 4: Multi-node training with the Llama 3 70B model

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama3_70b_multinode.sh
@@ -1,5 +1,3 @@
:orphan:

.. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch

@@ -16,7 +14,7 @@ It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
DeepSeek, enabling developers to train next-generation AI models more
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

AMD provides a ready-to-use Docker image for MI300X accelerators containing
AMD provides a ready-to-use Docker image for MI300X series accelerators containing
essential components, including PyTorch, ROCm libraries, and Megatron-LM
utilities. It contains the following software components to accelerate training
workloads:
@@ -64,60 +62,62 @@ Megatron-LM provides the following key features to train large language models e

.. _amd-megatron-lm-model-support:

The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

* Llama 2 7B

* Llama 2 70B

* Llama 3 8B

* Llama 3 70B
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

* Llama 3.1 8B

* Llama 3.1 70B

* Llama 3 8B

* Llama 3 70B

* Llama 2 7B

* Llama 2 70B

* DeepSeek-V2-Lite

.. note::

   Some models, such as Llama 3, require an external license agreement through
   Some models, such as Llama, require an external license agreement through
   a third party (for example, Meta).

.. _amd-megatron-lm-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================

If you have already validated your system settings, skip this step. Otherwise,
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
to set up your system before starting training.

Disable NUMA auto-balancing
---------------------------

Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.

Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.

.. code-block:: shell

   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

See :ref:`mi300x-disable-numa` for more information.
If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.

.. _mi300x-amd-megatron-lm-training:

Environment setup
=================

The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
The prebuilt ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.

Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
image.

.. _amd-megatron-lm-requirements:
@@ -129,13 +129,13 @@ Download the Docker image

   .. code-block:: shell

      docker pull rocm/megatron-lm:v25.3
      docker pull rocm/megatron-lm:v25.4

2. Launch the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3
      docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.4

3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.

@@ -144,7 +144,8 @@ Download the Docker image

      docker start megatron_training_env
      docker exec -it megatron_training_env bash

The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
The Docker container includes a pre-installed, verified version of the ROCm Megatron-LM development branch `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__
(commit `fd6f01 <https://github.com/ROCm/Megatron-LM/tree/fd6f0d11d7f9480ace32f22eb7e4dab5314fa350>`_).

.. _amd-megatron-lm-environment-setup:
@@ -158,8 +159,8 @@ Configuration scripts

      If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
      script in the ``examples/llama`` directory of
      `<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
      Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
      `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__.
      Likewise, if you're working with Llama 3 or Llama 3.1, use ``train_llama3.sh`` and update
      the configuration script accordingly.

   .. tab-item:: DeepSeek V2

@@ -167,7 +168,7 @@ Configuration scripts

      Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
      directory of
      `<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
      `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__
      and update the configuration script accordingly.
Network interface
^^^^^^^^^^^^^^^^^

@@ -178,23 +179,22 @@ Network interface

   .. tab-item:: Llama
      :sync: llama

      To avoid connectivity issues in multi-node deployments, ensure the correct network interface
      is set in your training scripts.
      Update the network interface in the script to match your system's network interface. To
      find your network interface, run the following (outside of any Docker container):

      1. Run the following command (outside the container) to find the active network interface on your system.

      .. code-block:: bash

      .. code-block:: shell

         ip a

         ip a

      Look for an active interface that has an IP address in the same subnet as
      your other nodes. Then, update the following variables in the script, for
      example:

      2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system's network interface. For
         example:

      .. code-block:: bash

      .. code-block:: shell

         export NCCL_SOCKET_IFNAME=ens50f0np0

         export NCCL_SOCKET_IFNAME=ens50f0np0

         export GLOO_SOCKET_IFNAME=ens50f0np0
         export GLOO_SOCKET_IFNAME=ens50f0np0
Dataset options
^^^^^^^^^^^^^^^

@@ -219,10 +219,18 @@ Dataset options

   MOCK_DATA=0

   DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"} # Change to where your dataset is stored
   DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored

Ensure that the files are accessible inside the Docker container.

To download the dataset, set the ``DATASET`` variable to the dataset you'd like to use. Two datasets are supported: ``DATASET=wiki`` and ``DATASET=bookcorpus``.
Use the following command to download the dataset.

.. code-block:: shell

   DATASET=wiki bash examples/llama/prepare_dataset.sh        # For wiki-en dataset
   DATASET=bookcorpus bash examples/llama/prepare_dataset.sh  # For bookcorpus dataset

.. tab-item:: DeepSeek V2
   :sync: deepseek
@@ -267,15 +275,20 @@ a fixed vocabulary. The tokenizer is trained along with the model on a large cor
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.

You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` variable as shown in the following examples.
If the tokenizer is not found, it'll be downloaded to the default tokenizer model path: ``${DATA_DIR}/tokenizer_llama3``
or ``${DATA_DIR}/tokenizer_llama2``.

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.

      To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
      Set the Hugging Face model path in the ``TOKENIZER_MODEL`` variable.

      For example, if you're using the Llama 3.1 8B model:
@@ -283,6 +296,20 @@ handle a variety of input sequences, including unseen words or domain-specific t

         TOKENIZER_MODEL=meta-llama/Llama-3.1-8B

      .. note::

         If you don't already have the Llama 3.1 tokenizer locally, set your
         personal Hugging Face access token ``HF_TOKEN`` to download the
         tokenizer. If you encounter the following error, set ``HF_TOKEN`` to
         your access-authorized Hugging Face token.

         .. code-block:: shell

            OSError: You are trying to access a gated repo.

            # pass your HF_TOKEN
            export HF_TOKEN=$your_personal_hf_token

   .. tab-item:: DeepSeek V2
      :sync: deepseek
@@ -325,9 +352,14 @@ Multi-node training

      DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs

* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
  inside a Docker, either install the drivers inside the Docker container or pass the network
  inside a Docker container, either install the drivers inside the Docker container or pass the network
  drivers from the host while creating the Docker container.

  .. code-block:: shell

     # Specify which RDMA interfaces to use for communication
     export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7

Start training on AMD Instinct accelerators
===========================================
@@ -352,12 +384,51 @@ accelerators with the AMD Megatron-LM Docker image.

   .. tab-item:: Single node training
      :sync: single-node

      To run training on a single node, navigate to the Megatron-LM folder and use the
      following command:
      To run training on a single node, navigate to the Megatron-LM folder and use one of the
      following commands.

      .. code-block:: shell

      - For Llama 3.1 8B FP8:

         TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh

        .. code-block:: shell

           TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

      - For Llama 3.1 8B BF16:

        .. code-block:: shell

           TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

      - For Llama 2 7B FP8:

        .. code-block:: shell

           TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

      - For Llama 2 7B BF16:

        .. code-block:: shell

           TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

      To run training with FSDP2 enabled, add the ``FSDP=1`` argument. For example:

      - For Llama 3 70B BF16:

        .. code-block:: shell

           TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

      - For Llama 2 70B BF16:

        .. code-block:: shell

           TEE_OUTPUT=1 MBS=3 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

      .. note::

         It's suggested to use ``TP=1`` when FSDP is enabled for higher throughput. FSDP2 is not supported with pipeline parallelism,
         expert parallelism, MCore's distributed optimizer, gradient accumulation fusion, and ``FP16`` precision.

   .. tab-item:: Multi-node training
      :sync: multi-node
@@ -385,7 +456,7 @@ accelerators with the AMD Megatron-LM Docker image.

   .. code-block:: shell

      cd /workspace/Megatron-LM
      GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh
      GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh

Key options
-----------
@@ -403,7 +474,7 @@ The benchmark tests support the following sets of variables:

   ``1`` to enable training logs or ``0`` to disable.

``TE_FP8``
   ``0`` for BF16 (default) or ``1`` for FP8 GEMMs.
   ``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.

``GEMM_TUNING``
   ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.

@@ -411,6 +482,10 @@ The benchmark tests support the following sets of variables:

``USE_FLASH_ATTN``
   ``1`` to enable Flash Attention.

``FSDP``
   ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
   ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.

``ENABLE_PROFILING``
   ``1`` to enable PyTorch profiling for performance analysis.
@@ -424,7 +499,7 @@ The benchmark tests support the following sets of variables:

``TOTAL_ITERS``
   The total number of iterations -- ``10`` by default.

``MOCK_DATA``
   ``1`` to use mock data or ``0`` to use real data you provide.

``MBS``
   Micro batch size.

@@ -433,7 +508,7 @@ The benchmark tests support the following sets of variables:

``GBS``
   Global batch size.

``TP``
   Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on.

``SEQ_LENGTH``
   Input sequence length.

@@ -447,11 +522,11 @@ The benchmark tests support the following sets of variables:

``GEMM_TUNING``
   ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.

``TRAIN_ITERS``
   The total number of iterations.

``MOCK_DATA``
   ``1`` to use mock data or ``0`` to use real data you provide.

``MBS``
   Micro batch size.

@@ -459,6 +534,12 @@ The benchmark tests support the following sets of variables:

``GBS``
   Global batch size.

``SEQ_LEN``
   Input sequence length.

``AC``
   Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.
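
As an illustration of combining these variables, a short DeepSeek-V2 smoke run
might look like the following. The specific values are arbitrary examples built
from the variables listed above, not tuned recommendations:

.. code-block:: shell

   GEMM_TUNING=1 PR=bf16 MBS=4 GBS=256 SEQ_LEN=4096 PAD_LEN=4096 MOCK_DATA=1 TRAIN_ITERS=50 AC=sel bash examples/deepseek_v2/train_deepseekv2.sh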

Benchmarking examples
---------------------

@@ -527,20 +608,27 @@ Previous versions
=================

This table lists previous versions of the ROCm Megatron-LM Docker image for training
performance testing. For detailed information about available models for
benchmarking, see the version-specific documentation.

.. list-table::
   :header-rows: 1
   :stub-columns: 1

   * - Image version
     - ROCm version
     - PyTorch version
     - Resources

   * - 25.3
     - 6.3.0
     - 2.7.0a0+git637433
     -
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`_

   * - 24.12-dev
     - 6.1.0
     - 2.4.0
     -
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/rocm-for-ai/train-a-model.html>`_

@@ -1,5 +1,3 @@

.. meta::
   :description: How to train a model using PyTorch for ROCm.
   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

@@ -11,7 +9,7 @@ Training a model with PyTorch for ROCm

PyTorch is an open-source machine learning framework that is widely used for
model training, with GPU-optimized components for transformer-based models.

The PyTorch for ROCm training Docker image (``rocm/pytorch-training:v25.4``)
provides a prebuilt, optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:

@@ -39,12 +37,14 @@ software components to accelerate training workloads:

Supported models
================

The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.

* Llama 3.1 8B

* Llama 3.1 70B

* Llama 2 70B

* FLUX.1-dev

.. note::

@@ -54,28 +54,30 @@ The following models are pre-optimized for performance on the AMD Instinct MI300

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.

Environment setup
=================

@@ -91,13 +93,13 @@ Download the Docker image

   .. code-block:: shell

      docker pull rocm/pytorch-training:v25.4

2. Run the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.4

3. Use these commands if you exit the ``training_env`` container and need to return to it.

@@ -106,20 +108,26 @@ Download the Docker image

   .. code-block:: shell

      docker start training_env
      docker exec -it training_env bash

4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
   repository and navigate to the benchmark scripts directory
   ``/workspace/MAD/scripts/pytorch_train``.

   .. code-block:: shell

      git clone https://github.com/ROCm/MAD
      cd MAD/scripts/pytorch_train

Prepare training datasets and dependencies
------------------------------------------

The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.

.. code-block:: shell

   export HF_TOKEN=$your_personal_hugging_face_access_token

Run the setup script to install libraries and datasets needed for benchmarking.

.. code-block:: shell

@@ -229,10 +237,12 @@ Along with the following datasets:

* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_

* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_

* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

Getting started
===============

The prebuilt PyTorch with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior

@@ -242,7 +252,7 @@ can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.

Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI325X and MI300X
accelerators with the AMD PyTorch training Docker image.

Once your environment is set up, use the following commands and examples to start benchmarking.

@@ -279,32 +289,59 @@ Options and available models

     - ``finetune_lora``
     - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)

   * -
     - ``HF_finetune_lora``
     - Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)

   * - ``$datatype``
     - ``FP8`` or ``BF16``
     - Only Llama 3.1 8B supports FP8 precision.

   * - ``$model_repo``
     - ``Llama-3.1-8B``
     - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_

   * -
     - ``Llama-3.1-70B``
     - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

   * -
     - ``Llama-2-70B``
     - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_

   * -
     - ``Flux``
     - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

   * - ``$sequence_length``
     - Sequence length for the language model.
     - Between 2048 and 8192. 8192 by default.

.. note::

   Occasionally, downloading the Flux dataset might fail. In the event of this
   error, manually download it from Hugging Face at
   `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
   and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
   the required dataset.
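
If the automatic download fails, one way to fetch the repository manually is
with the Hugging Face CLI. The exact invocation below is a suggestion, not
part of the original guide, and assumes the ``huggingface_hub`` CLI is
available in the container and ``HF_TOKEN`` is set:

.. code-block:: shell

   huggingface-cli download black-forest-labs/FLUX.1-dev --local-dir /workspace/FluxBenchmark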

Fine-tuning
-----------

To start the fine-tuning benchmark, use the following command. It runs the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.

.. code-block:: shell

   ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B

Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.

.. code-block:: shell

   ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

Benchmarking examples
---------------------

@@ -339,3 +376,32 @@ Here are some examples of how to use the command.

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B

* Example 6: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

Previous versions
=================

This table lists previous versions of the ROCm PyTorch training Docker image for training
performance validation. For detailed information about available models for
benchmarking, see the version-specific documentation.

.. list-table::
   :header-rows: 1
   :stub-columns: 1

   * - Image version
     - ROCm version
     - PyTorch version
     - Resources

   * - v25.3
     - 6.3.0
     - 2.7.0a0+git637433
     -
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`_

@@ -8,108 +8,21 @@ System optimization
*******************

This guide outlines system setup and tuning suggestions for AMD hardware to
optimize performance for specific types of workloads or use-cases. The contents
are structured according to the hardware:

.. grid:: 2

   .. grid-item-card:: AMD RDNA

      * :doc:`AMD RDNA2 system optimization <w6000-v620>`

   .. grid-item-card:: AMD Instinct

      * `AMD Instinct MI300X <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
      * `AMD Instinct MI300A <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300a.html>`_
      * `AMD Instinct MI200 <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi200.html>`_
      * `AMD Instinct MI100 <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi100.html>`_

High-performance computing workloads
====================================

High-performance computing (HPC) workloads have unique requirements. The default
hardware and BIOS configurations for OEM platforms may not provide optimal
performance for HPC workloads. To enable optimal HPC settings on a per-platform
and per-workload level, this chapter describes:

* BIOS settings that can impact performance
* Hardware configuration best practices
* Supported versions of operating systems
* Workload-specific recommendations for optimal BIOS and operating system
  settings

There is also a discussion on the AMD Instinct™ software development
environment, including information on how to install and run the DGEMM, STREAM,
HPCG, and HPL benchmarks. This guide provides a good starting point but is
not tested exhaustively across all compilers.

Knowledge prerequisites to better understand this document and to perform tuning
for HPC applications include:

* Experience in configuring servers
* Administrative access to the server's Management Interface (BMC)
* Administrative access to the operating system
* Familiarity with the OEM server's BMC (strongly recommended)
* Familiarity with the OS-specific tools for configuration, monitoring, and
  troubleshooting (strongly recommended)

This document provides guidance on tuning systems with various AMD Instinct
accelerators for HPC workloads. The following sections don't comprise an
all-inclusive guide, and some items referred to may have similar, but different,
names in various OEM systems (for example, OEM-specific BIOS settings). The
following sections also provide suggestions on items that should be the initial
focus of additional, application-specific tuning.

While this guide is a good starting point, developers are encouraged to perform
their own performance testing for additional tuning.

.. list-table::
   :header-rows: 1
   :stub-columns: 1

   * - System optimization guide
     - Architecture reference
     - White papers

   * - :doc:`AMD Instinct MI300X <mi300x>`
     - `AMD Instinct MI300 instruction set architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf>`_
     - `CDNA 3 architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf>`_

   * - :doc:`AMD Instinct MI300A <mi300a>`
     - `AMD Instinct MI300 instruction set architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf>`_
     - `CDNA 3 architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf>`_

   * - :doc:`AMD Instinct MI200 <mi200>`
     - `AMD Instinct MI200 instruction set architecture <https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf>`_
     - `CDNA 2 architecture <https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf>`_

   * - :doc:`AMD Instinct MI100 <mi100>`
     - `AMD Instinct MI100 instruction set architecture <https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf>`_
     - `CDNA architecture <https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf>`_

Workstation workloads
=====================

Workstation workloads, much like those for HPC, have a unique set of
requirements: a blend of both graphics and compute, certification, stability, and
others.

The document covers specific software requirements and processes needed to use
these GPUs for Single Root I/O Virtualization (SR-IOV) and machine learning
tasks.

The main purpose of this document is to help users utilize the RDNA™ 2 GPUs to
their full potential.

.. list-table::
   :header-rows: 1
   :stub-columns: 1

   * - System optimization guide
     - Architecture reference
     - White papers

   * - :doc:`AMD Radeon PRO W6000 and V620 <w6000-v620>`
     - `AMD RDNA 2 instruction set architecture <https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf>`_
     - `RDNA 2 architecture <https://www.amd.com/system/files/documents/rdna2-explained-radeon-pro-W6000.pdf>`_
@@ -1,475 +0,0 @@

---
myst:
  html_meta:
    "description": "AMD Instinct MI100 system settings optimization guide."
    "keywords": "Instinct, MI100, microarchitecture, AMD, ROCm"
---

# AMD Instinct MI100 system optimization

## System settings

This chapter reviews system settings that are required to configure the system
for AMD Instinct™ MI100 accelerators and that can improve the performance of the
GPUs. It is advised to configure the system for the best possible host configuration
according to the high-performance computing tuning guides for AMD EPYC™
7002 Series and EPYC™ 7003 Series processors, depending on the processor generation of the
system.

In addition to the BIOS settings listed below ({ref}`mi100-bios-settings`), the
following settings will also have to be enacted via the command line (see
{ref}`mi100-os-settings`):

* Core C states
* AMD-PCI-UTIL (on AMD EPYC™ 7002 series processors)
* IOMMU (if needed)

(mi100-bios-settings)=

### System BIOS settings

For maximum MI100 GPU performance on systems with AMD EPYC™ 7002 series
processors (codename "Rome") and AMI System BIOS, the following configuration of
System BIOS settings has been validated. These settings must be used for the
qualification process and should be set as default values for the system BIOS.
Analogous settings for other non-AMI System BIOS providers could be set
similarly. For systems with Intel processors, some settings may not apply or be
available as listed in the following table.

```{list-table} Recommended settings for the system BIOS in a GIGABYTE platform.
:header-rows: 1
:name: mi100-bios

*
  - BIOS Setting Location
  - Parameter
  - Value
  - Comments
*
  - Advanced / PCI Subsystem Settings
  - Above 4G Decoding
  - Enabled
  - GPU Large BAR Support
*
  - AMD CBS / CPU Common Options
  - Global C-state Control
  - Auto
  - Global C-States
*
  - AMD CBS / CPU Common Options
  - CCD/Core/Thread Enablement
  - Accept
  - Global C-States
*
  - AMD CBS / CPU Common Options / Performance
  - SMT Control
  - Disable
  - Global C-States
*
  - AMD CBS / DF Common Options / Memory Addressing
  - NUMA nodes per socket
  - NPS 1,2,4
  - NUMA Nodes (NPS)
*
  - AMD CBS / DF Common Options / Memory Addressing
  - Memory interleaving
  - Auto
  - NUMA Nodes (NPS)
*
  - AMD CBS / DF Common Options / Link
  - 4-link xGMI max speed
  - 18 Gbps
  - Set AMD CPU xGMI speed to highest rate supported
*
  - AMD CBS / DF Common Options / Link
  - 3-link xGMI max speed
  - 18 Gbps
  - Set AMD CPU xGMI speed to highest rate supported
*
  - AMD CBS / NBIO Common Options
  - IOMMU
  - Disable
  -
*
  - AMD CBS / NBIO Common Options
  - PCIe Ten Bit Tag Support
  - Enable
  -
*
  - AMD CBS / NBIO Common Options
  - Preferred IO
  - Manual
  -
*
  - AMD CBS / NBIO Common Options
  - Preferred IO Bus
  - "Use lspci to find pci device id"
  -
*
  - AMD CBS / NBIO Common Options
  - Enhanced Preferred IO Mode
  - Enable
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Determinism Control
  - Manual
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Determinism Slider
  - Power
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - cTDP Control
  - Manual
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - cTDP
  - 240
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Package Power Limit Control
  - Manual
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Package Power Limit
  - 240
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - xGMI Link Width Control
  - Manual
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - xGMI Force Link Width
  - 2
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - xGMI Force Link Width Control
  - Force
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - APBDIS
  - 1
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - DF C-states
  - Auto
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Fixed SOC P-state
  - P0
  -
*
  - AMD CBS / UMC Common Options / DDR4 Common Options
  - Enforce POR
  - Accept
  -
*
  - AMD CBS / UMC Common Options / DDR4 Common Options / Enforce POR
  - Overclock
  - Enabled
  -
*
  - AMD CBS / UMC Common Options / DDR4 Common Options / Enforce POR
  - Memory Clock Speed
  - 1600 MHz
  - Set to max Memory Speed, if using 3200 MHz DIMMs
*
  - AMD CBS / UMC Common Options / DDR4 Common Options / DRAM Controller
    Configuration / DRAM Power Options
  - Power Down Enable
  - Disabled
  - RAM Power Down
*
  - AMD CBS / Security
  - TSME
  - Disabled
  - Memory Encryption
```
#### NBIO link clock frequency

The NBIOs (4x per AMD EPYC™ processor) are the serializers/deserializers (also
known as "SerDes") that convert and prepare the I/O signals for the processor's
128 external I/O interface lanes (32 per NBIO).

LCLK (short for link clock frequency) controls the link speed of the internal
bus that connects the NBIO silicon with the data fabric. All data between the
processor and its PCIe lanes flow to the data fabric based on these LCLK
frequency settings. The link clock frequency of the NBIO components needs to be
forced to the maximum frequency for optimal PCIe performance.

For AMD EPYC™ 7002 series processors, this setting cannot be modified via
configuration options in the server BIOS alone. Instead, the AMD-IOPM-UTIL (see
Section 3.2.3) must be run at every server boot to disable Dynamic Power
Management for all PCIe Root Complexes and NBIOs within the system and to lock
the logic into the highest performance operational mode.

For AMD EPYC™ 7003 series processors, configuring all NBIOs to be in "Enhanced
Preferred I/O" mode is sufficient to enable the highest link clock frequency for the
NBIO components.

#### Memory configuration

For the memory addressing modes, especially the
number of NUMA nodes per socket/processor (NPS), the recommended setting is
to follow the guidance of the high-performance computing tuning guides
for AMD EPYC™ 7002 Series and AMD EPYC™ 7003 Series processors to provide the optimal
configuration for host-side computation.

If the system is set to one NUMA domain per socket/processor (NPS1),
bidirectional copy bandwidth between host memory and GPU memory may be
slightly higher (up to about 16% more) than with four NUMA domains per
socket/processor (NPS4). For memory bandwidth sensitive applications using MPI, NPS4
is recommended. For applications that are not optimized for NUMA locality,
NPS1 is the recommended setting.

(mi100-os-settings)=

### Operating system settings

#### CPU core states - C-states

There are several core states (C-states) that an AMD EPYC CPU can idle within:

* C0: active. This is the active state while running an application.
* C1: idle
* C2: idle and power gated. This is a deeper sleep state and will have a
  greater latency when moving back to the C0 state, compared to when the
  CPU is coming out of C1.

Disabling C2 is important for running with a high performance, low-latency
network. To disable power-gating on all cores, run the following on Linux
systems:

```shell
cpupower idle-set -d 2
```

Note that the `cpupower` tool must be installed, as it is not part of the base
packages of most Linux® distributions. The package needed varies with the
respective Linux distribution.

::::{tab-set}
:::{tab-item} Ubuntu
:sync: ubuntu

```shell
sudo apt install linux-tools-common
```

:::

:::{tab-item} Red Hat Enterprise Linux
:sync: RHEL

```shell
sudo yum install cpupowerutils
```

:::

:::{tab-item} SUSE Linux Enterprise Server
:sync: SLES

```shell
sudo zypper install cpupower
```

:::
::::
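
To confirm that the C2 state stays disabled after running `cpupower idle-set`,
the per-state details can be inspected with `cpupower idle-info`. This
verification step is an addition to the original guide; the exact output format
varies by `cpupower` version:

```shell
# Lists the available idle states and their current status per CPU
cpupower idle-info | grep -A1 C2
```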

#### AMD-IOPM-UTIL

This section applies to AMD EPYC™ 7002 processors to optimize advanced
Dynamic Power Management (DPM) in the I/O logic (see NBIO description above)
for performance. Certain I/O workloads may benefit from disabling this power
management. This utility disables DPM for all PCI-e root complexes in the
system and locks the logic into the highest performance operational mode.

Disabling I/O DPM will reduce the latency and/or improve the throughput of
low-bandwidth messages for PCI-e InfiniBand NICs and GPUs. Other workloads
with low-bandwidth bursty PCI-e I/O characteristics may benefit as well if
multiple such PCI-e devices are installed in the system.

The actions of the utility do not persist across reboots. There is no need to
change any existing firmware settings when using this utility. The "Preferred
I/O" and "Enhanced Preferred I/O" settings should remain unchanged at enabled.

```{tip}
The recommended method to use the utility is either to create a system
start-up script, for example, a one-shot `systemd` service unit, or run the
utility when starting up a job scheduler on the system. The installer
packages (see
[Power Management Utility](https://developer.amd.com/iopm-utility/)) will
create and enable a `systemd` service unit for you. This service unit is
configured to run in one-shot mode. This means that even when the service
unit runs as expected, the status of the service unit will show inactive.
This is the expected behavior when the utility runs normally. If the service
unit shows failed, the utility did not run as expected. The output in either
case can be shown with the `systemctl status` command.

Stopping the service unit has no effect since the utility does not leave
anything running. To undo the effects of the utility, disable the service
unit with the `systemctl disable` command and reboot the system.

The utility does not have any command-line options, and it must be run with
super-user permissions.
```

#### Systems with 256 CPU threads - IOMMU configuration

For systems that have 256 logical CPU cores or more (e.g., 64-core AMD EPYC™
7763 in a dual-socket configuration and SMT enabled), setting the input-output
memory management unit (IOMMU) configuration to "disabled" can limit the number
of available logical cores to 255. The reason is that the Linux® kernel disables
X2APIC in this case and falls back to the Advanced Programmable Interrupt Controller
(APIC), which can only enumerate a maximum of 255 (logical) cores.

If SMT is enabled by setting "CCD/Core/Thread Enablement > SMT Control" to
"enable", the following steps can be applied to the system to enable all
(logical) cores of the system:

* In the server BIOS, set IOMMU to "Enabled".
* When configuring the Grub boot loader, add the following argument for the
  Linux kernel: `iommu=pt`
* Update Grub to use the modified configuration:

  ```shell
  sudo grub2-mkconfig -o /boot/grub2/grub.cfg
  ```

* Reboot the system.
* Verify IOMMU passthrough mode by inspecting the kernel log via `dmesg`:

  ```none
  [...]
  [    0.000000] Kernel command line: [...] iommu=pt
  [...]
  ```
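
On most distributions, the kernel argument is added by appending it to
`GRUB_CMDLINE_LINUX` in `/etc/default/grub` before regenerating the
configuration. The snippet below is illustrative, not from the original text;
existing arguments on the line are elided:

```shell
# /etc/default/grub -- append iommu=pt to the existing arguments
GRUB_CMDLINE_LINUX="... iommu=pt"
```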

Once the system is properly configured, ROCm software can be
installed.

## System management

For a complete guide on how to install/manage/uninstall ROCm on Linux, refer to
{doc}`Quick-start (Linux)<rocm-install-on-linux:install/quick-start>`. To verify that the installation was
successful, refer to the
{doc}`post-install instructions<rocm-install-on-linux:install/post-install>` and
[system tools](../../reference/rocm-tools.md). Should verification
fail, consult the [System Debugging Guide](../system-debugging.md).

(mi100-hw-verification)=

### Hardware verification with ROCm

The AMD ROCm™ platform ships with tools to query the system structure. To query
the GPU hardware, the `rocm-smi` command is available. It can show available
GPUs in the system with their device ID and their respective firmware (or VBIOS)
versions:

![rocm-smi --showhw output on an 8*MI100 system](../../data/how-to/tuning-guides/tuning001.png "'rocm-smi --showhw' output on an 8*MI100 system")

Another important query is to show the system structure, the localization of the
GPUs in the system, and the fabric connections between the system components:

![rocm-smi --showtopo output on an 8*MI100 system](../../data/how-to/tuning-guides/tuning002.png "'rocm-smi --showtopo' output on an 8*MI100 system")

The previous command shows the system structure in four blocks:

* The first block of the output shows the distance between the GPUs, similar to
  what the `numactl` command outputs for the NUMA domains of a system. The
  weight is a qualitative measure for the "distance" data must travel to reach
  one GPU from another. While the values do not carry a special (physical)
  meaning, the higher the value, the more hops are needed to reach the
  destination from the source GPU.
* The second block has a matrix for the number of hops required to send data
  from one GPU to another. For the GPUs in the local hive, this number is one,
  while for the others it is three (one hop to leave the hive, one hop across
  the processors, and one hop within the destination hive).
* The third block outputs the link types between the GPUs. This can either be
  "XGMI" for AMD Infinity Fabric™ links or "PCIE" for PCIe Gen4 links.
* The fourth block reveals the localization of a GPU with respect to the NUMA
  organization of the shared memory of the AMD EPYC™ processors.

To query the compute capabilities of the GPU devices, the `rocminfo` command is
available with the AMD ROCm™ platform. It lists specific details about the GPU
devices, including but not limited to the number of compute units, width of the
SIMD pipelines, memory information, and Instruction Set Architecture:

![rocminfo output fragment on an 8*MI100 system](../../data/how-to/tuning-guides/tuning003.png "'rocminfo' output fragment on an 8*MI100 system")

For a complete list of architecture (LLVM target) names, refer to
{doc}`Linux<rocm-install-on-linux:reference/system-requirements>` and
{doc}`Windows<rocm-install-on-windows:reference/system-requirements>` support.

### Testing inter-device bandwidth

{ref}`mi100-hw-verification` introduced the `rocm-smi --showtopo` command for
viewing the system structure and how the GPUs are located and connected within
it. For more details, the `rocm-bandwidth-test` can run benchmarks to
show the effective link bandwidth between the components of the system.

The ROCm Bandwidth Test program can be installed with the following
package-manager commands:

::::{tab-set}
:::{tab-item} Ubuntu
:sync: ubuntu

```shell
sudo apt install rocm-bandwidth-test
```

:::

:::{tab-item} Red Hat Enterprise Linux
:sync: RHEL

```shell
sudo yum install rocm-bandwidth-test
```

:::

:::{tab-item} SUSE Linux Enterprise Server
:sync: SLES

```shell
sudo zypper install rocm-bandwidth-test
```

:::
::::

Alternatively, the source code can be downloaded and built from
[source](https://github.com/ROCm/rocm_bandwidth_test).
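
Once installed, the measurements shown in the following screenshots are
produced by running the tool. The flagless invocation below is the only usage
assumed here; check your version's help output for additional options:

```shell
rocm-bandwidth-test
```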

The output will list the available compute devices (CPUs and GPUs):

![rocm-bandwidth-test output fragment on an 8*MI100 system listing devices](../../data/how-to/tuning-guides/tuning004.png "'rocm-bandwidth-test' output fragment on an 8*MI100 system listing devices")

The output will also show a matrix that contains a "1" if a device can
communicate to another device (CPU and GPU) of the system, and it will show the
NUMA distance (similar to `rocm-smi`):

![rocm-bandwidth-test output fragment on an 8*MI100 system showing inter-device access matrix](../../data/how-to/tuning-guides/tuning005.png "'rocm-bandwidth-test' output fragment on an 8*MI100 system showing inter-device access matrix")

![rocm-bandwidth-test output fragment on an 8*MI100 system showing inter-device NUMA distance](../../data/how-to/tuning-guides/tuning006.png "'rocm-bandwidth-test' output fragment on an 8*MI100 system showing inter-device NUMA distance")

The output also contains the measured bandwidth for unidirectional and
bidirectional transfers between the devices (CPU and GPU):

![rocm-bandwidth-test output fragment on an 8*MI100 system showing measured bandwidths](../../data/how-to/tuning-guides/tuning007.png "'rocm-bandwidth-test' output fragment on an 8*MI100 system showing measured bandwidths")
@@ -1,459 +0,0 @@

---
myst:
  html_meta:
    "description": "Learn about AMD Instinct MI200 system settings and performance tuning."
    "keywords": "Instinct, MI200, microarchitecture, AMD, ROCm"
---

# AMD Instinct MI200 system optimization

## System settings

This chapter reviews system settings that are required to configure the system
for AMD Instinct MI250 accelerators and improve the performance of the GPUs. It
is advised to configure the system for the best possible host configuration
according to the *High Performance Computing (HPC) Tuning Guide for AMD EPYC
7003 Series Processors*.

Configure the system BIOS settings as explained in {ref}`mi200-bios-settings` and
enact the settings below via the command line as explained in
{ref}`mi200-os-settings`:

* Core C states
* input-output memory management unit (IOMMU), if needed

(mi200-bios-settings)=

### System BIOS settings

For maximum MI250 GPU performance on systems with AMD EPYC™ 7003-series
processors (codename "Milan") and AMI System BIOS, the following configuration
of system BIOS settings has been validated. These settings must be used for the
qualification process and should be set as default values for the system BIOS.
Analogous settings for other non-AMI System BIOS providers could be set
similarly. For systems with Intel processors, some settings may not apply or be
available as listed in the following table.

```{list-table}
:header-rows: 1
:name: mi200-bios

*
  - BIOS Setting Location
  - Parameter
  - Value
  - Comments
*
  - Advanced / PCI Subsystem Settings
  - Above 4G Decoding
  - Enabled
  - GPU Large BAR Support
*
  - Advanced / PCI Subsystem Settings
  - SR-IOV Support
  - Disabled
  - Disable Single Root IO Virtualization
*
  - AMD CBS / CPU Common Options
  - Global C-state Control
  - Auto
  - Global C-States
*
  - AMD CBS / CPU Common Options
  - CCD/Core/Thread Enablement
  - Accept
  - Global C-States
*
  - AMD CBS / CPU Common Options / Performance
  - SMT Control
  - Disable
  - Global C-States
*
  - AMD CBS / DF Common Options / Memory Addressing
  - NUMA nodes per socket
  - NPS 1,2,4
  - NUMA Nodes (NPS)
*
  - AMD CBS / DF Common Options / Memory Addressing
  - Memory interleaving
  - Auto
  - NUMA Nodes (NPS)
*
  - AMD CBS / DF Common Options / Link
  - 4-link xGMI max speed
  - 18 Gbps
  - Set AMD CPU xGMI speed to highest rate supported
*
  - AMD CBS / NBIO Common Options
  - IOMMU
  - Disable
  -
*
  - AMD CBS / NBIO Common Options
  - PCIe Ten Bit Tag Support
  - Auto
  -
*
  - AMD CBS / NBIO Common Options
  - Preferred IO
  - Bus
  -
*
  - AMD CBS / NBIO Common Options
  - Preferred IO Bus
  - "Use lspci to find pci device id"
  -
*
  - AMD CBS / NBIO Common Options
  - Enhanced Preferred IO Mode
  - Enable
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Determinism Control
  - Manual
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Determinism Slider
  - Power
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - cTDP Control
  - Manual
  - Set cTDP to the maximum supported by the installed CPU
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - cTDP
  - 280
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Package Power Limit Control
  - Manual
  - Set Package Power Limit to the maximum supported by the installed CPU
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Package Power Limit
  - 280
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - xGMI Link Width Control
  - Manual
  - Set AMD CPU xGMI width to 16 bits
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - xGMI Force Link Width
  - 2
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - xGMI Force Link Width Control
  - Force
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - APBDIS
  - 1
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - DF C-states
  - Enabled
  -
*
  - AMD CBS / NBIO Common Options / SMU Common Options
  - Fixed SOC P-state
  - P0
  -
*
  - AMD CBS / UMC Common Options / DDR4 Common Options
  - Enforce POR
  - Accept
  -
*
  - AMD CBS / UMC Common Options / DDR4 Common Options / Enforce POR
  - Overclock
  - Enabled
  -
*
  - AMD CBS / UMC Common Options / DDR4 Common Options / Enforce POR
  - Memory Clock Speed
  - 1600 MHz
  - Set to max Memory Speed, if using 3200 MHz DIMMs
*
  - AMD CBS / UMC Common Options / DDR4 Common Options / DRAM Controller
    Configuration / DRAM Power Options
  - Power Down Enable
  - Disabled
  - RAM Power Down
*
  - AMD CBS / Security
  - TSME
  - Disabled
  - Memory Encryption
```

#### NBIO link clock frequency

The NBIOs (4x per AMD EPYC™ processor) are the serializers/deserializers (also
known as "SerDes") that convert and prepare the I/O signals for the processor's
128 external I/O interface lanes (32 per NBIO).

LCLK (short for link clock frequency) controls the link speed of the internal
bus that connects the NBIO silicon with the data fabric. All data between the
processor and its PCIe lanes flow to the data fabric based on these LCLK
frequency settings. The link clock frequency of the NBIO components needs to be
forced to the maximum frequency for optimal PCIe performance.

For AMD EPYC™ 7003 series processors, configuring all NBIOs to be in "Enhanced
Preferred I/O" mode is sufficient to enable the highest link clock frequency for the
NBIO components.

#### Memory configuration

For setting the memory addressing modes, especially
the number of NUMA nodes per socket/processor (NPS), follow the guidance of the
"High Performance Computing (HPC) Tuning Guide for AMD EPYC 7003 Series
Processors" to provide the optimal configuration for host-side computation. For
most HPC workloads, NPS=4 is the recommended value.
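
After changing NPS in the BIOS, a quick sanity check from the booted OS is to
count the NUMA nodes Linux enumerates; with NPS=4 on a dual-socket system you
would expect eight nodes. These are standard Linux tools, not part of the
original guide:

```shell
lscpu | grep -i "numa node"        # number of NUMA nodes and CPU ranges
numactl --hardware | head -n 1     # prints "available: N nodes (...)"
```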

(mi200-os-settings)=

### Operating system settings

#### CPU core states - C-states

There are several core states (C-states) that an AMD EPYC CPU can idle within:

* C0: active. This is the active state while running an application.
* C1: idle
* C2: idle and power gated. This is a deeper sleep state and will have a
  greater latency when moving back to the C0 state, compared to when the
  CPU is coming out of C1.

Disabling C2 is important for running with a high performance, low-latency
network. To disable power-gating on all cores, run the following on Linux
systems:

```shell
cpupower idle-set -d 2
```

Note that the `cpupower` tool must be installed, as it is not part of the base
packages of most Linux® distributions. The package needed varies with the
respective Linux distribution.

::::{tab-set}
:::{tab-item} Ubuntu
:sync: ubuntu

```shell
sudo apt install linux-tools-common
```

:::

:::{tab-item} Red Hat Enterprise Linux
:sync: RHEL

```shell
sudo yum install cpupowerutils
```

:::

:::{tab-item} SUSE Linux Enterprise Server
:sync: SLES

```shell
sudo zypper install cpupower
```

:::
::::

#### AMD-IOPM-UTIL

This section applies to AMD EPYC™ 7002 processors to optimize advanced
Dynamic Power Management (DPM) in the I/O logic (see NBIO description above)
for performance. Certain I/O workloads may benefit from disabling this power
management. This utility disables DPM for all PCI-e root complexes in the
system and locks the logic into the highest performance operational mode.

Disabling I/O DPM will reduce the latency and/or improve the throughput of
low-bandwidth messages for PCI-e InfiniBand NICs and GPUs. Other workloads
with low-bandwidth bursty PCI-e I/O characteristics may benefit as well if
multiple such PCI-e devices are installed in the system.

The actions of the utility do not persist across reboots. There is no need to
change any existing firmware settings when using this utility. The "Preferred
I/O" and "Enhanced Preferred I/O" settings should remain unchanged at enabled.

```{tip}
The recommended method to use the utility is either to create a system
start-up script, for example, a one-shot `systemd` service unit, or run the
utility when starting up a job scheduler on the system. The installer
packages (see
[Power Management Utility](https://developer.amd.com/iopm-utility/)) will
create and enable a `systemd` service unit for you. This service unit is
configured to run in one-shot mode. This means that even when the service
unit runs as expected, the status of the service unit will show inactive.
This is the expected behavior when the utility runs normally. If the service
unit shows failed, the utility did not run as expected. The output in either
case can be shown with the `systemctl status` command.

Stopping the service unit has no effect since the utility does not leave
anything running. To undo the effects of the utility, disable the service
unit with the `systemctl disable` command and reboot the system.

The utility does not have any command-line options, and it must be run with
super-user permissions.
```

#### Systems with 256 CPU threads - IOMMU configuration

For systems that have 256 logical CPU cores or more (e.g., 64-core AMD EPYC™
7763 in a dual-socket configuration and SMT enabled), setting the input-output
memory management unit (IOMMU) configuration to "disabled" can limit the number
of available logical cores to 255. The reason is that the Linux® kernel disables
X2APIC in this case and falls back to the Advanced Programmable Interrupt Controller
(APIC), which can only enumerate a maximum of 255 (logical) cores.

If SMT is enabled by setting "CCD/Core/Thread Enablement > SMT Control" to
"enable", the following steps can be applied to the system to enable all
(logical) cores of the system:

* In the server BIOS, set IOMMU to "Enabled".
* When configuring the Grub boot loader, add the following argument for the
  Linux kernel: `iommu=pt`
* Update Grub to use the modified configuration:

  ```shell
  sudo grub2-mkconfig -o /boot/grub2/grub.cfg
  ```

* Reboot the system.
* Verify IOMMU passthrough mode by inspecting the kernel log via `dmesg`:

  ```none
  [...]
  [    0.000000] Kernel command line: [...] iommu=pt
  [...]
  ```

Once the system is properly configured, ROCm software can be
installed.
## System management

For a complete guide on how to install/manage/uninstall ROCm on Linux, refer to
{doc}`Quick-start (Linux)<rocm-install-on-linux:install/quick-start>`. To verify that the
installation was successful, refer to the
{doc}`post-install instructions<rocm-install-on-linux:install/post-install>` and
[system tools](../../reference/rocm-tools.md). Should verification
fail, consult the [System Debugging Guide](../system-debugging.md).

(mi200-hw-verification)=

### Hardware verification with ROCm

The AMD ROCm™ platform ships with tools to query the system structure. To query
the GPU hardware, the `rocm-smi` command is available. It can show available
GPUs in the system with their device ID and their respective firmware (or VBIOS)
versions:

![rocm-smi --showhw output on an 8*MI200 system](../../data/how-to/tuning-guides/tuning008.png "'rocm-smi --showhw' output on an 8*MI200 system")

To see the system structure, the localization of the GPUs in the system, and the
fabric connections between the system components, use:

![rocm-smi --showtopo output on an 8*MI200 system](../../data/how-to/tuning-guides/tuning009.png "'rocm-smi --showtopo' output on an 8*MI200 system")

* The first block of the output shows the distance between the GPUs, similar to
  what the `numactl` command outputs for the NUMA domains of a system. The
  weight is a qualitative measure for the "distance" data must travel to reach
  one GPU from another. While the values do not carry a special (physical)
  meaning, the higher the value, the more hops are needed to reach the
  destination from the source GPU.
* The second block has a matrix named "Hops between two GPUs", where 1 means the
  two GPUs are directly connected with XGMI, 2 means both GPUs are linked to the
  same CPU socket and GPU communications will go through the CPU, and 3 means
  both GPUs are linked to different CPU sockets so communications will go
  through both CPU sockets. This number is one for all GPUs in this case since
  they are all connected to each other through the Infinity Fabric links.
* The third block outputs the link types between the GPUs. This can either be
  "XGMI" for AMD Infinity Fabric links or "PCIE" for PCIe Gen4 links.
* The fourth block reveals the localization of a GPU with respect to the NUMA
  organization of the shared memory of the AMD EPYC processors.

To query the compute capabilities of the GPU devices, use the `rocminfo` command. It
lists specific details about the GPU devices, including but not limited to the
number of compute units, width of the SIMD pipelines, memory information, and
Instruction Set Architecture (ISA):

![rocminfo output fragment on an 8*MI200 system](../../data/how-to/tuning-guides/tuning010.png "'rocminfo' output fragment on an 8*MI200 system")

For a complete list of architecture (LLVM target) names, refer to the GPU OS support pages for
{doc}`Linux<rocm-install-on-linux:reference/system-requirements>` and
{doc}`Windows<rocm-install-on-windows:reference/system-requirements>`.

### Testing inter-device bandwidth

{ref}`mi200-hw-verification` introduced the `rocm-smi --showtopo` command for
viewing the system structure and how the GPUs are located and connected within
it. For more details, the `rocm-bandwidth-test` can run benchmarks to
show the effective link bandwidth between the components of the system.

The ROCm Bandwidth Test program can be installed with the following
package-manager commands:

::::{tab-set}
:::{tab-item} Ubuntu
:sync: ubuntu

```shell
sudo apt install rocm-bandwidth-test
```

:::

:::{tab-item} Red Hat Enterprise Linux
:sync: RHEL

```shell
sudo yum install rocm-bandwidth-test
```

:::

:::{tab-item} SUSE Linux Enterprise Server
:sync: SLES

```shell
sudo zypper install rocm-bandwidth-test
```

:::
::::

Alternatively, the source code can be downloaded and built from
[source](https://github.com/ROCm/rocm_bandwidth_test).

The output will list the available compute devices (CPUs and GPUs), including
their device ID and PCIe ID:

![rocm-bandwidth-test output fragment on an 8*MI200 system listing devices](../../data/how-to/tuning-guides/tuning011.png "'rocm-bandwidth-test' output fragment on an 8*MI200 system listing devices")

The output will also show a matrix that contains a "1" if a device can
communicate to another device (CPU and GPU) of the system, and it will show the
NUMA distance (similar to `rocm-smi`):

![rocm-bandwidth-test output fragment on an 8*MI200 system showing inter-device access matrix and NUMA distance](../../data/how-to/tuning-guides/tuning012.png "'rocm-bandwidth-test' output fragment on an 8*MI200 system showing inter-device access matrix and NUMA distance")

The output also contains the measured bandwidth for unidirectional and
bidirectional transfers between the devices (CPU and GPU):

![rocm-bandwidth-test output fragment on an 8*MI200 system showing measured bandwidths](../../data/how-to/tuning-guides/tuning013.png "'rocm-bandwidth-test' output fragment on an 8*MI200 system showing measured bandwidths")
@@ -1,452 +0,0 @@

.. meta::
   :description: Learn about AMD Instinct MI300A system settings and performance tuning.
   :keywords: AMD, Instinct, MI300A, HPC, tuning, BIOS settings, NBIO, ROCm,
              environment variable, performance, accelerator, GPU, EPYC, GRUB,
              operating system

***************************************************
AMD Instinct MI300A system optimization
***************************************************

This topic discusses the operating system settings and system management commands for
the AMD Instinct MI300A accelerator and can help you optimize its performance.

System settings
========================================

This section reviews the system settings required to configure a MI300A SOC system and
optimize its performance.

The MI300A system-on-a-chip (SOC) design requires you to review and potentially adjust your OS configuration as explained in
the :ref:`operating-system-settings-label` section. These settings are critical for
performance because the OS on an accelerated processing unit (APU) is responsible for memory management across the CPU and GPU accelerators.
In the APU memory model, system settings are available to limit GPU memory allocation.
This limit is important because legacy software often determines the
amount of allowable memory at start-up time
by probing discrete memory until it is exhausted. If left unchecked, this practice
can starve the OS of resources.

System BIOS settings
-----------------------------------

System BIOS settings are preconfigured for optimal performance by the
platform vendor. This means that you do not need to adjust these settings
when using MI300A. If you have any questions regarding these settings,
contact your MI300A platform vendor.

GRUB settings
-----------------------------------

The ``/etc/default/grub`` file is used to configure the GRUB bootloader on modern Linux distributions.
Linux uses the string assigned to ``GRUB_CMDLINE_LINUX`` in this file as
its command line parameters during boot.

Appending strings using the Linux command line
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

It is recommended that you append the following string to ``GRUB_CMDLINE_LINUX``.

``pci=realloc=off``
   This setting disables the automatic reallocation
   of PCI resources, so Linux is able to unambiguously detect all GPUs on the
   MI300A-based system. It's used when Single Root I/O Virtualization (SR-IOV) Base
   Address Registers (BARs) have not been allocated by the BIOS. This can help
   avoid potential issues with certain hardware configurations.
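
As a sketch, after the edit the relevant line in ``/etc/default/grub`` might
read as follows; existing arguments are elided and the exact line contents
vary by system:

.. code-block:: shell

   # /etc/default/grub -- append pci=realloc=off to the existing arguments
   GRUB_CMDLINE_LINUX="... pci=realloc=off"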

Validating the IOMMU setting
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

IOMMU is a system-specific IO mapping mechanism for DMA mapping
and isolation. IOMMU is turned off by default in the operating system settings
for optimal performance.

To verify IOMMU is turned off, first install the ``acpica-tools`` package using your
package manager.

.. code-block:: shell

   sudo apt install acpica-tools

Then confirm that the following commands do not return any results.

.. code-block:: shell

   sudo acpidump | grep IVRS
   sudo acpidump | grep DMAR

Update GRUB
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to update GRUB to use the modified configuration:

.. code-block:: shell

   sudo grub2-mkconfig -o /boot/grub2/grub.cfg

On some Red Hat-based systems, the ``grub2-mkconfig`` command might not be available. In this case,
use ``grub-mkconfig`` instead. Verify that you have the
correct version by using the following command:

.. code-block:: shell

   grub-mkconfig --version
|
||||
|
||||
.. _operating-system-settings-label:
|
||||
|
||||
Operating system settings
|
||||
-----------------------------------
|
||||
|
||||
The operating system provides several options to customize and tune performance. For more information
|
||||
about supported operating systems, see the :doc:`Compatibility matrix <../../compatibility/compatibility-matrix>`.
|
||||
|
||||
If you are using a distribution other than RHEL or SLES, the latest Linux kernel is recommended.
|
||||
Performance considerations for the Zen4, which is the core architecture in the MI300A,
|
||||
require a Linux kernel running version 5.18 or higher.
|
||||
|
||||
This section describes performance-based settings.
|
||||
|
||||
* **Enable transparent huge pages**
|
||||
|
||||
To enable transparent huge pages, use one of the following methods:
|
||||
|
||||
* From the command line, run the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
echo always > /sys/kernel/mm/transparent_hugepage/enabled
|
||||
|
||||
* Set the Linux kernel parameter ``transparent_hugepage`` as follows in the
|
||||
relevant ``.cfg`` file for your system.
|
||||
|
||||
.. code-block:: cfg
|
||||
|
||||
transparent_hugepage=always
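
  To confirm the active mode afterward, read the same ``sysfs`` file; the
  selected mode is shown in brackets:

  .. code-block:: shell

     cat /sys/kernel/mm/transparent_hugepage/enabled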

* **Increase the amount of allocatable memory**

  By default, when using a device allocator via HIP, it is only possible to allocate 96 GiB out of
  a possible 128 GiB of memory on the MI300A. This limitation does not affect host allocations.
  To increase the available system memory, load the ``amdttm`` module with new values for
  ``pages_limit`` and ``page_pool_size``. These numbers correspond to the number of 4 KiB pages of memory.
  To make 128 GiB of memory available across all four devices, for a total amount of 512 GiB,
  set ``pages_limit`` and ``page_pool_size`` to ``134217728``. For a two-socket system, divide these values
  by two. After setting these values, reload the AMDGPU driver.

  First, review the current settings using this shell command:

  .. code-block:: shell

     cat /sys/module/amdttm/parameters/pages_limit

  To set the amount of allocatable memory to all available memory on all four APU devices, run these commands:

  .. code-block:: shell

     sudo modprobe amdttm pages_limit=134217728 page_pool_size=134217728
     sudo modprobe amdgpu

  These settings can also be hardcoded in the ``/etc/modprobe.d/amdttm.conf`` file or specified as boot
  parameters.

  To use the hardcoded method,
  the filesystem must already be set up when the kernel driver is loaded.
  To hardcode the settings, add the following lines to ``/etc/modprobe.d/amdttm.conf``:

  .. code-block:: shell

     options amdttm pages_limit=134217728
     options amdttm page_pool_size=134217728

  If the filesystem is not already set up when the kernel driver is loaded, then the options
  must be specified as boot parameters. To specify the settings
  as boot parameters when loading the kernel, use this example as a guideline:

  .. code-block:: shell

     vmlinux-[...] amdttm.pages_limit=134217728 amdttm.page_pool_size=134217728 [...]

  To verify the new settings and confirm the change, use this command:

  .. code-block:: shell

     cat /sys/module/amdttm/parameters/pages_limit

  .. note::

     The system settings for ``pages_limit`` and ``page_pool_size`` are calculated by multiplying the
     per-APU limit of 4 KiB pages, which is ``33554432``, by the number of APUs on the node. The limit for
     a system with two APUs is ``33554432 x 2``, or ``67108864``.
     This means the ``modprobe`` command for two APUs is ``sudo modprobe amdttm pages_limit=67108864 page_pool_size=67108864``.

* **Limit the maximum and single memory allocations on the GPU**

  Many AI-related applications were originally developed on discrete GPUs. Some of these applications
  have fixed problem sizes associated with the targeted GPU size, and some attempt to determine the
  system memory limits by allocating chunks until failure. These techniques can cause issues on an
  APU with a shared memory space.

  To allow these applications to run on the APU without further changes,
  ROCm supports a default memory policy that restricts the percentage of the GPU that can be allocated.
  The following environment variables control this feature:

  * ``GPU_MAX_ALLOC_PERCENT``
  * ``GPU_SINGLE_ALLOC_PERCENT``

  These settings can be added to the default shell environment or the user environment. The effect of the
  memory allocation settings varies depending on the system, configuration, and task. They might require
  adjustment, especially when performing GPU benchmarks. Setting these values to ``100``
  lets the GPU allocate any amount of free memory. However, the risk of encountering
  an operating system out-of-memory (OOM) condition increases when almost
  all the available memory is used.

  Before setting either of these items to 100 percent,
  carefully consider the expected CPU workload allocation and the anticipated OS usage.
  For instance, if the OS requires 8 GB on a 128 GB system, setting these
  variables to ``100`` authorizes a single
  workload to allocate up to 120 GB of memory. Unless the system has swap space configured,
  any over-allocation attempts will be handled by the OOM policies.
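
  As an illustration only (suitable percentages depend on your workload mix and
  OS memory needs), capping total GPU allocations at 90 percent and any single
  allocation at 75 percent could look like this in the shell environment:

  .. code-block:: shell

     # Illustrative values; tune them for your system
     export GPU_MAX_ALLOC_PERCENT=90
     export GPU_SINGLE_ALLOC_PERCENT=75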

* **Disable NUMA (non-uniform memory access) balancing**

  ROCm uses information from the compiled application to ensure an affinity exists
  between the GPU agent processes and their CPU hosts or co-processing agents.
  Because the APU has OS threads,
  including threads with memory management, the default kernel NUMA policies can
  adversely impact workload performance without additional tuning.

  .. note::

     At the kernel level, ``pci=realloc`` can also be set to ``off`` as an additional tuning measure.

  To disable NUMA balancing, use one of the following methods:

  * From the command line, run the following command:

    .. code-block:: shell

       echo 0 > /proc/sys/kernel/numa_balancing

  * Set the following Linux kernel parameters in the
    relevant ``.cfg`` file for your system.

    .. code-block:: cfg

       pci=realloc=off numa_balancing=disable
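
  To confirm that NUMA balancing is disabled, read the value back; it should be ``0``:

  .. code-block:: shell

     cat /proc/sys/kernel/numa_balancing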

* **Enable compaction**

  Compaction is necessary for proper MI300A operation because the APU dynamically shares memory
  between the CPU and GPU. Compaction can be performed proactively, which reduces
  allocation costs, or during allocation itself as part of the background activities.
  Without compaction, MI300A application performance eventually degrades as fragmentation increases.
  In RHEL distributions, compaction is disabled by default. In Ubuntu, it's enabled by default.

  To enable compaction, enter the following commands using the command line:

  .. code-block:: shell

     echo 20 > /proc/sys/vm/compaction_proactiveness
     echo 1 > /proc/sys/vm/compact_unevictable_allowed
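
  To review the values currently in effect:

  .. code-block:: shell

     cat /proc/sys/vm/compaction_proactiveness
     cat /proc/sys/vm/compact_unevictable_allowed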

.. _mi300a-processor-affinity:

* **Change affinity of ROCm helper threads**

  Changing the affinity prevents internal ROCm threads from having their CPU core affinity mask
  set to all CPU cores available. With this setting, the threads inherit their parent's
  CPU core affinity mask. Before adjusting this setting, ensure you thoroughly understand
  your system topology and how the application, runtime environment, and batch system
  set the thread-to-core affinity. If you have any questions regarding this setting,
  contact your MI300A platform vendor or the AMD support team.
  To enable this setting, enter the following command:

  .. code-block:: shell

     export HSA_OVERRIDE_CPU_AFFINITY_DEBUG=0

* **CPU core states and C-states**

  The system BIOS handles these settings for the MI300A.
  They don't need to be configured on the operating system.

System management
========================================

For a complete guide on installing, managing, and uninstalling ROCm on Linux, see
:doc:`Quick-start (Linux)<rocm-install-on-linux:install/quick-start>`. To verify that the
installation was successful, see the
:doc:`Post-installation instructions<rocm-install-on-linux:install/post-install>` and
:doc:`ROCm tools <../../reference/rocm-tools>` guides. If verification
fails, consult the :doc:`System debugging guide <../system-debugging>`.

.. _hw-verification-rocm-label:

Hardware verification with ROCm
-----------------------------------

ROCm includes tools to query the system structure. To query
the GPU hardware, use the ``rocm-smi`` command.
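
For example, run the tool with no arguments to print the per-device summary
table shown in the screenshot below:

.. code-block:: shell

   rocm-smi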

``rocm-smi`` reports statistics per socket, so the power results combine CPU and GPU utilization.
In an idle state on a multi-socket system, some power imbalances are expected because
the distribution of OS threads can keep some APU devices at higher power states.

.. note::

   The MI300A VRAM settings show as ``N/A``.

.. image:: ../../data/how-to/tuning-guides/mi300a-rocm-smi-output.png
   :alt: Output from the rocm-smi command

The ``rocm-smi --showhw`` command shows the available system
GPUs and their device ID and firmware details.

In the MI300A hardware settings, the system BIOS handles the UMC RAS. The
ROCm-supplied GPU driver does not manage this setting.
This results in a value of ``DISABLED`` for the ``UMC RAS`` setting.

.. image:: ../../data/how-to/tuning-guides/mi300a-rocm-smi-showhw-output.png
   :alt: Output from the rocm-smi --showhw command

To see the system structure, the localization of the GPUs in the system, and the
fabric connections between the system components, use the ``rocm-smi --showtopo`` command.

* The first block of the output shows the distance between the GPUs. The weight is a qualitative
  measure of the "distance" data must travel to reach one GPU from another.
  While the values do not have a precise physical meaning, the higher the value, the
  more hops are required to reach the destination from the source GPU.
* The second block contains a matrix named "Hops between two GPUs", where ``1`` means
  the two GPUs are directly connected with XGMI, ``2`` means both GPUs are linked to the
  same CPU socket and GPU communications go through the CPU, and ``3`` means
  both GPUs are linked to different CPU sockets so communications go
  through both CPU sockets.
* The third block indicates the link types between the GPUs. This can either be
  ``XGMI`` for AMD Infinity Fabric links or ``PCIE`` for PCIe Gen4 links.
* The fourth block reveals the localization of a GPU with respect to the NUMA organization
  of the shared memory of the AMD EPYC processors.

.. image:: ../../data/how-to/tuning-guides/mi300a-rocm-smi-showtopo-output.png
   :alt: Output from the rocm-smi --showtopo command

Testing inter-device bandwidth
-----------------------------------

The ``rocm-smi --showtopo`` command from the :ref:`hw-verification-rocm-label` section
displays the system structure and shows how the GPUs are located and connected within this
structure. For more information, use the :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`, which can run benchmarks to
show the effective link bandwidth between the system components.

For information on how to install the ROCm Bandwidth Test, see :doc:`Building the environment <rocm_bandwidth_test:install/install>`.

The output lists the available compute devices (CPUs and GPUs), including
their device ID and PCIe ID:

.. image:: ../../data/how-to/tuning-guides/mi300a-rocm-bandwidth-test-output.png
   :alt: Output from the rocm-bandwidth-test utility

It also displays the measured bandwidth for unidirectional and
bidirectional transfers between the devices on the CPU and GPU:

.. image:: ../../data/how-to/tuning-guides/mi300a-rocm-peak-bandwidth-output.png
   :alt: Bandwidth information from the rocm-bandwidth-test utility

Abbreviations
=============

APBDIS
   Algorithmic Performance Boost Disable

APU
   Accelerated processing unit

BAR
   Base Address Register

BIOS
   Basic Input/Output System

CBS
   Common BIOS Settings

CCD
   Compute Core Die

CDNA
   Compute DNA

CLI
   Command Line Interface

CPU
   Central Processing Unit

cTDP
   Configurable Thermal Design Power

DF
   Data Fabric

DMA
   Direct Memory Access

GPU
   Graphics Processing Unit

GRUB
   Grand Unified Bootloader

HBM
   High Bandwidth Memory

HPC
   High Performance Computing

IOMMU
   Input-Output Memory Management Unit

ISA
   Instruction Set Architecture

NBIO
   North Bridge Input/Output

NUMA
   Non-Uniform Memory Access

OOM
   Out of Memory

PCI
   Peripheral Component Interconnect

PCIe
   PCI Express

POR
   Power-On Reset

RAS
   Reliability, Availability and Serviceability

SMI
   System Management Interface

SMT
   Simultaneous Multi-threading

SOC
   System On Chip

SR-IOV
   Single Root I/O Virtualization

TSME
   Transparent Secure Memory Encryption

UMC
   Unified Memory Controller

VRAM
   Video RAM

xGMI
   Inter-chip Global Memory Interconnect
@@ -1,818 +0,0 @@

.. meta::
   :description: Learn about AMD Instinct MI300X system settings and performance tuning.
   :keywords: AMD, Instinct, MI300X, HPC, tuning, BIOS settings, NBIO, ROCm,
              environment variable, performance, accelerator, GPU, EPYC, GRUB,
              operating system

***************************************
AMD Instinct MI300X system optimization
***************************************

This document covers essential system settings and management practices required
to configure your system effectively. Ensuring that your system operates
correctly is the first step before delving into advanced performance tuning.

The main topics of discussion in this document are:

* :ref:`System settings <mi300x-system-settings>`

  * :ref:`System BIOS settings <mi300x-bios-settings>`
  * :ref:`GRUB settings <mi300x-grub-settings>`
  * :ref:`Operating system settings <mi300x-os-settings>`

* :ref:`System management <mi300x-system-management>`

.. _mi300x-system-settings:

System settings
===============

This guide discusses system settings that are required to configure your system
for AMD Instinct™ MI300X accelerators. It is important to ensure a system is
functioning correctly before trying to improve its overall performance. In this
section, the settings discussed mostly ensure proper functionality of your
Instinct-based system. Some settings discussed are known to improve performance
for most applications running on an MI300X system. See
:doc:`../rocm-for-ai/inference-optimization/workload` for how to improve performance for
specific applications or workloads.

.. _mi300x-bios-settings:

System BIOS settings
--------------------

AMD EPYC 9004-based systems
^^^^^^^^^^^^^^^^^^^^^^^^^^^

For maximum MI300X GPU performance on systems with AMD EPYC™ 9004-series
processors and AMI System BIOS, the following configuration
of system BIOS settings has been validated. These settings must be used for the
qualification process and should be set as default values in the system BIOS.
Analogous settings for other non-AMI System BIOS providers could be set
similarly. For systems with Intel processors, some settings may not apply or be
available as listed in the following table.

Each row in the table details a setting, but its specific location within the
BIOS setup menus may differ, or the option may not be present.

.. list-table::
   :header-rows: 1

   * - BIOS setting location
     - Parameter
     - Value
     - Comments
   * - Advanced / PCI subsystem settings
     - Above 4G decoding
     - Enabled
     - GPU large BAR support.
   * - Advanced / PCI subsystem settings
     - SR-IOV support
     - Enabled
     - Enable single root IO virtualization.
   * - AMD CBS / CPU common options
     - Global C-state control
     - Auto
     - Global C-states (do not disable this menu item).
   * - AMD CBS / CPU common options
     - CCD/Core/Thread enablement
     - Accept
     - May be necessary to enable the SMT control menu.
   * - AMD CBS / CPU common options / performance
     - SMT control
     - Disable
     - Set to Auto if the primary application is not compute-bound.
   * - AMD CBS / DF common options / memory addressing
     - NUMA nodes per socket
     - Auto
     - Auto = NPS1. At this time, the other options for NUMA nodes per socket
       should not be used.
   * - AMD CBS / DF common options / memory addressing
     - Memory interleaving
     - Auto
     - Depends on NUMA nodes (NPS) setting.
   * - AMD CBS / DF common options / link
     - 4-link xGMI max speed
     - 32 Gbps
     - Auto results in the speed being set to the lower of the max speed the
       motherboard is designed to support and the max speed of the CPU in use.
   * - AMD CBS / NBIO common options
     - IOMMU
     - Enabled
     -
   * - AMD CBS / NBIO common options
     - PCIe ten bit tag support
     - Auto
     -
   * - AMD CBS / NBIO common options / SMU common options
     - Determinism control
     - Manual
     -
   * - AMD CBS / NBIO common options / SMU common options
     - Determinism slider
     - Power
     -
   * - AMD CBS / NBIO common options / SMU common options
     - cTDP control
     - Manual
     - Set cTDP to the maximum supported by the installed CPU.
   * - AMD CBS / NBIO common options / SMU common options
     - cTDP
     - 400
     - Value in watts.
   * - AMD CBS / NBIO common options / SMU common options
     - Package power limit control
     - Manual
     - Set package power limit to the maximum supported by the installed CPU.
   * - AMD CBS / NBIO common options / SMU common options
     - Package power limit
     - 400
     - Value in watts.
   * - AMD CBS / NBIO common options / SMU common options
     - xGMI link width control
     - Manual
     - Allows the xGMI link width to be forced to a fixed value.
   * - AMD CBS / NBIO common options / SMU common options
     - xGMI force width control
     - Force
     -
   * - AMD CBS / NBIO common options / SMU common options
     - xGMI force link width
     - 2
     - * 0: Force xGMI link width to x2
       * 1: Force xGMI link width to x8
       * 2: Force xGMI link width to x16
   * - AMD CBS / NBIO common options / SMU common options
     - xGMI max speed
     - Auto
     - Auto results in the speed being set to the lower of the max speed the
       motherboard is designed to support and the max speed of the CPU in use.
   * - AMD CBS / NBIO common options / SMU common options
     - APBDIS
     - 1
     - Disable DF (data fabric) P-states.
   * - AMD CBS / NBIO common options / SMU common options
     - DF C-states
     - Auto
     -
   * - AMD CBS / NBIO common options / SMU common options
     - Fixed SOC P-state
     - P0
     -
   * - AMD CBS / security
     - TSME
     - Disabled
     - Memory encryption.

.. _mi300x-grub-settings:

GRUB settings
-------------

In any modern Linux distribution, the ``/etc/default/grub`` file is used to
configure GRUB. In this file, the string assigned to ``GRUB_CMDLINE_LINUX`` is
the command line parameters that Linux uses during boot.

Appending strings via Linux command line
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

It is recommended to append the following strings to ``GRUB_CMDLINE_LINUX``.

``pci=realloc=off``
   This setting disables the automatic reallocation of PCI resources, so Linux
   is able to unambiguously detect all GPUs of the MI300X-based system. It's
   used when Single Root I/O Virtualization (SR-IOV) Base
   Address Registers (BARs) have not been allocated by the BIOS. This can help
   avoid potential issues with certain hardware configurations.

``iommu=pt``
   The ``iommu=pt`` setting enables IOMMU pass-through mode. When in pass-through
   mode, the adapter does not need to use DMA translation to the memory, which can
   improve performance.

IOMMU is a system-specific IO mapping mechanism and can be used for DMA mapping
and isolation. This can be beneficial for virtualization and device assignment
to virtual machines. It is recommended to enable IOMMU support.

For a system that has AMD host CPUs, add this to ``GRUB_CMDLINE_LINUX``:

.. code-block:: text

   iommu=pt

Otherwise, if the system has Intel host CPUs, add this instead to
``GRUB_CMDLINE_LINUX``:

.. code-block:: text

   intel_iommu=on iommu=pt
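
Putting these together on a system with AMD host CPUs, the resulting
``GRUB_CMDLINE_LINUX`` entry might look like the following sketch; keep any
options your distribution already sets alongside these:

.. code-block:: shell

   GRUB_CMDLINE_LINUX="pci=realloc=off iommu=pt"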

Update GRUB
-----------

Update GRUB to use the modified configuration:

.. code-block:: shell

   sudo grub2-mkconfig -o /boot/grub2/grub.cfg

On some Debian systems, the ``grub2-mkconfig`` command may not be available. Instead,
check for the presence of ``grub-mkconfig``. Additionally, verify that you have the
correct version by using the following command:

.. code-block:: shell

   grub-mkconfig --version

.. _mi300x-os-settings:

Operating system settings
-------------------------

CPU core states (C-states)
^^^^^^^^^^^^^^^^^^^^^^^^^^

There are several core states (C-states) that an AMD EPYC CPU can idle within:

* **C0**: active. This is the active state while running an application.

* **C1**: idle. This state consumes less power compared to C0, but can quickly
  return to the active state (C0) with minimal latency.

* **C2**: idle and power-gated. This is a deeper sleep state and will have greater
  latency when moving back to the active (C0) state as compared to when the CPU
  is coming out of C1.

Disabling C2 is important for running with a high-performance, low-latency
network. To disable the C2 state, install the ``cpupower`` tool using your Linux
distribution's package manager. ``cpupower`` is not a base package in most Linux
distributions. The specific package to be installed varies per Linux
distribution.

.. tab-set::

   .. tab-item:: Ubuntu
      :sync: ubuntu

      .. code-block:: shell

         sudo apt install linux-tools-common

   .. tab-item:: RHEL
      :sync: rhel

      .. code-block:: shell

         sudo yum install cpupowerutils

   .. tab-item:: SLES
      :sync: sles

      .. code-block:: shell

         sudo zypper install cpupower

Now, to disable power-gating (the C2 state) on all cores, run the following
command on Linux systems:

.. code-block:: shell

   cpupower idle-set -d 2
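
To confirm the change, list the idle states and check that state 2 is reported
as disabled:

.. code-block:: shell

   cpupower idle-info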

``/proc`` and ``/sys`` file system settings
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. _mi300x-disable-numa:

Disable NUMA auto-balancing
'''''''''''''''''''''''''''

The NUMA balancing feature allows the OS to scan memory and attempt to migrate
it to a DIMM that is logically closer to the cores accessing it. This causes an
overhead because the OS is second-guessing your NUMA allocations, but it may be
useful if NUMA locality of accesses is very poor. Applications can therefore, in
general, benefit from disabling NUMA balancing; however, there are workloads where
doing so is detrimental to performance. Test this setting
by toggling the ``numa_balancing`` value and running the application; compare
the performance of one run with this set to ``0`` and another run with this set
to ``1``.

Run the command ``cat /proc/sys/kernel/numa_balancing`` to check the current
NUMA (Non-Uniform Memory Access) settings. Output ``0`` indicates this
setting is disabled. If there is no output, or the output is ``1``, run the command
``sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'`` to disable it.

For these settings, the ``env_check.sh`` script automates setting, resetting,
and checking your environments. Find the script at
`<https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh>`__.

Run the script as follows to set or reset the settings:

``./env_check.sh [set/reset/check]``

.. tip::

   Use ``./env_check.sh -h`` for help info.

Automate disabling NUMA auto-balance using Cron
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The :ref:`mi300x-disable-numa` section describes the command to disable NUMA
auto-balance. To automate the command with Cron:

#. Edit the ``crontab`` configuration file for the root user:

   .. code-block:: shell

      sudo crontab -e

#. Add the following Cron entry to run the command at boot:

   .. code-block:: shell

      @reboot sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

#. Save the file and exit the text editor.

#. Optionally, restart the system to apply changes by issuing ``sudo reboot``.

#. Verify your new configuration.

   .. code-block:: shell

      cat /proc/sys/kernel/numa_balancing

   The ``/proc/sys/kernel/numa_balancing`` file controls NUMA balancing in the
   Linux kernel. If the value in this file is set to ``0``, NUMA balancing
   is disabled. If the value is set to ``1``, NUMA balancing is enabled.

.. note::

   Disabling NUMA balancing should be done cautiously and for
   specific reasons, such as performance optimization or addressing
   particular issues. Always test the impact of disabling NUMA balancing in
   a controlled environment before applying changes to a production system.

.. _mi300x-env-vars:

Environment variables
^^^^^^^^^^^^^^^^^^^^^

HIP provides the environment variable ``HIP_FORCE_DEV_KERNARG``. Setting it to
``1`` places the arguments of HIP kernels directly in device memory to reduce
the latency of accessing those kernel arguments. It can improve performance by
2 to 3 µs for some kernels.

It is recommended to set the following environment variable:

.. code-block:: shell

   export HIP_FORCE_DEV_KERNARG=1

.. note::

   This is the default option as of ROCm 6.2.

Change affinity of ROCm helper threads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This change prevents internal ROCm threads from having their CPU core affinity mask
set to all CPU cores available. With this setting, the threads inherit their parent's
CPU core affinity mask. If you have any questions regarding this setting,
contact your MI300X platform vendor. To enable this setting, enter the following command:

.. code-block:: shell

   export HSA_OVERRIDE_CPU_AFFINITY_DEBUG=0

IOMMU configuration -- systems with 256 CPU threads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For systems that have 256 logical CPU cores or more, setting the input-output
memory management unit (IOMMU) configuration to ``disabled`` can limit the
number of available logical cores to 255. The reason is that the Linux kernel
disables X2APIC in this case and falls back to the Advanced Programmable Interrupt
Controller (APIC), which can only enumerate a maximum of 255 (logical) cores.

If SMT is enabled by setting ``CCD/Core/Thread Enablement > SMT Control`` to
``enable``, you can apply the following steps to the system to enable all
(logical) cores of the system:

#. In the server BIOS, set IOMMU to ``Enabled``.

#. When configuring the GRUB boot loader, add the following argument for the Linux kernel: ``iommu=pt``.

#. Update GRUB.

#. Reboot the system.

#. Verify IOMMU passthrough mode by inspecting the kernel log via ``dmesg``:

   .. code-block:: shell

      dmesg | grep iommu

   .. code-block:: shell

      [...]
      [ 0.000000] Kernel command line: [...] iommu=pt
      [...]

Once the system is properly configured, ROCm software can be
:doc:`installed <rocm-install-on-linux:index>`.

.. _mi300x-system-management:

System management
=================

To optimize system performance, it's essential to first understand the existing
system configuration parameters and settings. ROCm offers several CLI tools that
can provide system-level information, offering valuable insights for
optimizing user applications.

For a complete guide on how to install, manage, or uninstall ROCm on Linux, refer to
:doc:`rocm-install-on-linux:install/quick-start`. For verifying that the
installation was successful, refer to the
:doc:`rocm-install-on-linux:install/post-install`.
Should verification fail, consult :doc:`/how-to/system-debugging`.

.. _mi300x-hardware-verification-with-rocm:

Hardware verification with ROCm
-------------------------------

The ROCm platform provides tools to query the system structure. These include
:ref:`ROCm SMI <mi300x-rocm-smi>` and :ref:`ROCm Bandwidth Test <mi300x-bandwidth-test>`.

.. _mi300x-rocm-smi:

ROCm SMI
^^^^^^^^

To query your GPU hardware, use the ``rocm-smi`` command. ROCm SMI lists
GPUs available to your system -- with their device ID and their respective
firmware (or VBIOS) versions.

The following screenshot shows that all 8 GPUs of an MI300X system are recognized
by ROCm. Application performance could otherwise be suboptimal if, for example,
only 5 of the 8 GPUs are recognized.

.. image:: ../../data/how-to/tuning-guides/rocm-smi-showhw.png
   :align: center
   :alt: ``rocm-smi --showhw`` output

To see the system structure, the localization of the GPUs in the system, and the
fabric connections between the system components, use the command
``rocm-smi --showtopo``.

.. image:: ../../data/how-to/tuning-guides/rocm-smi-showtopo.png
   :align: center
   :alt: ``rocm-smi --showtopo`` output

The first block of the output shows the distance between the GPUs, similar to
what the ``numactl`` command outputs for the NUMA domains of a system. The
weight is a qualitative measure for the "distance" data must travel to reach one
GPU from another one. While the values do not carry a special, or "physical",
meaning, the higher the value, the more hops are needed to reach the destination
from the source GPU. This information has performance implications for a
GPU-based application that moves data among GPUs. You can choose a minimum
distance among GPUs to be used to make the application more performant.

The second block has a matrix named *Hops between two GPUs*, where:

* ``1`` means the two GPUs are directly connected with xGMI,

* ``2`` means both GPUs are linked to the same CPU socket and GPU communications
  will go through the CPU, and

* ``3`` means both GPUs are linked to different CPU sockets so communications will
  go through both CPU sockets. This number is one for all GPUs in this case
  since they are all connected to each other through the Infinity Fabric links.

The third block outputs the link types between the GPUs. This can either be
``XGMI`` for AMD Infinity Fabric links or ``PCIE`` for PCIe Gen5 links.

The fourth block reveals the localization of a GPU with respect to the NUMA
organization of the shared memory of the AMD EPYC processors.

To query the compute capabilities of the GPU devices, use the ``rocminfo`` command. It
lists specific details about the GPU devices, including but not limited to the
number of compute units, width of the SIMD pipelines, memory information, and
instruction set architecture (ISA). The following is the truncated output of the
command:

.. image:: ../../data/how-to/tuning-guides/rocminfo.png
   :align: center
   :alt: rocminfo.txt example

For a complete list of architecture (such as CDNA3) and LLVM target names
(such as gfx942 for MI300X), refer to the
:doc:`Supported GPUs section of the System requirements for Linux page <rocm-install-on-linux:reference/system-requirements>`.
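
As a quick spot-check of the LLVM target name reported for each device, you can
filter the ``rocminfo`` output with standard shell tools; the exact field layout
can vary between ROCm releases:

.. code-block:: shell

   rocminfo | grep gfx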

Deterministic clock
'''''''''''''''''''

Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock
speed up to 1900 MHz instead of the default 2100 MHz. This can reduce
the chance of a PCC event lowering the attainable GPU clocks. This
setting will not be required for new IFWI releases with the production
PRC feature. Restore this setting to its default value with the
``rocm-smi -r`` command.
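
In summary, the two commands from this paragraph are:

.. code-block:: shell

   # Cap the maximum GPU clock at 1900 MHz
   rocm-smi --setperfdeterminism 1900

   # Restore the default clock settings
   rocm-smi -r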

.. _mi300x-bandwidth-test:

ROCm Bandwidth Test
^^^^^^^^^^^^^^^^^^^

The :ref:`mi300x-hardware-verification-with-rocm` section showed how the command
``rocm-smi --showtopo`` can be used to view the system structure and how the
GPUs are connected. For more details on the link bandwidth,
``rocm-bandwidth-test`` can run benchmarks to show the effective link bandwidth
between the components of the system.

You can install ROCm Bandwidth Test, which can test inter-device bandwidth,
using the following package manager commands:

.. tab-set::

   .. tab-item:: Ubuntu
      :sync: ubuntu

      .. code-block:: shell

         sudo apt install rocm-bandwidth-test

   .. tab-item:: RHEL
      :sync: rhel

      .. code-block:: shell

         sudo yum install rocm-bandwidth-test

   .. tab-item:: SLES
      :sync: sles

      .. code-block:: shell

         sudo zypper install rocm-bandwidth-test

Alternatively, you can download the source code from
`<https://github.com/ROCm/rocm_bandwidth_test>`__ and build from source.
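
Once installed, running the tool with no arguments performs the full set of
measurements described below:

.. code-block:: shell

   rocm-bandwidth-test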

The output will list the available compute devices (CPUs and GPUs), including
their device ID and PCIe ID. The following screenshot is an example of the
beginning part of the output of running ``rocm-bandwidth-test``. It shows the
devices present in the system.

.. image:: ../../data/how-to/tuning-guides/rocm-bandwidth-test.png
   :align: center
   :alt: rocm-bandwidth-test sample output

The output will also show a matrix that contains a ``1`` if a device can
communicate with another device (CPU and GPU) of the system, and it will show the
NUMA distance -- similar to ``rocm-smi``.

Inter-device distance:

.. figure:: ../../data/how-to/tuning-guides/rbt-inter-device-access.png
   :align: center
   :alt: rocm-bandwidth-test inter-device distance

   Inter-device distance

Inter-device NUMA distance:

.. figure:: ../../data/how-to/tuning-guides/rbt-inter-device-numa-distance.png
   :align: center
   :alt: rocm-bandwidth-test inter-device NUMA distance

   Inter-device NUMA distance

The output also contains the measured bandwidth for unidirectional and
bidirectional transfers between the devices (CPU and GPU):

Unidirectional bandwidth:

.. figure:: ../../data/how-to/tuning-guides/rbt-unidirectional-bandwidth.png
   :align: center
   :alt: rocm-bandwidth-test unidirectional bandwidth

   Unidirectional bandwidth

Bidirectional bandwidth:

.. figure:: ../../data/how-to/tuning-guides/rbt-bidirectional-bandwidth.png
   :align: center
   :alt: rocm-bandwidth-test bidirectional bandwidth

   Bidirectional bandwidth

Abbreviations
=============

AMI
   American Megatrends International

APBDIS
   Algorithmic Performance Boost Disable

ATS
   Address Translation Services

BAR
   Base Address Register

BIOS
   Basic Input/Output System

CBS
   Common BIOS Settings

CLI
   Command Line Interface

CPU
   Central Processing Unit

cTDP
   Configurable Thermal Design Power

DDR5
   Double Data Rate 5 DRAM

DF
   Data Fabric

DIMM
   Dual In-line Memory Module

DMA
   Direct Memory Access

DPM
   Dynamic Power Management

GPU
   Graphics Processing Unit

GRUB
   Grand Unified Bootloader

HPC
   High Performance Computing

IOMMU
   Input-Output Memory Management Unit

ISA
   Instruction Set Architecture

LCLK
   Link Clock Frequency

NBIO
   North Bridge Input/Output

NUMA
   Non-Uniform Memory Access

PCC
   Power Consumption Control

PCI
   Peripheral Component Interconnect

PCIe
   PCI Express

POR
   Power-On Reset

SIMD
   Single Instruction, Multiple Data

SMT
   Simultaneous Multi-threading

SMI
   System Management Interface

SOC
   System On Chip

SR-IOV
   Single Root I/O Virtualization

TP
   Tensor Parallelism

TSME
   Transparent Secure Memory Encryption

X2APIC
   Extended Advanced Programmable Interrupt Controller

xGMI
   Inter-chip Global Memory Interconnect

@@ -7,6 +7,37 @@ myst:

# AMD RDNA2 system optimization

## Workstation workloads

Workstation workloads, much like those for HPC, have a unique set of
requirements: a blend of both graphics and compute, certification, stability, and
others.

The document covers specific software requirements and processes needed to use
these GPUs for Single Root I/O Virtualization (SR-IOV) and machine learning
tasks.

The main purpose of this document is to help users utilize the RDNA™ 2 GPUs to
their full potential.

```{list-table}
:header-rows: 1
:stub-columns: 1

* - System Guide
  - Architecture reference
  - White papers
* - [System settings](#system-settings)
  - [AMD RDNA 2 instruction set architecture](https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf)
  - [RDNA 2 architecture](https://www.amd.com/content/dam/amd/en/documents/products/graphics/workstation/rdna2-explained-radeon-pro-W6000.pdf)
```

## System settings

This chapter reviews system settings that are required to configure the system

@@ -13,7 +13,7 @@ application tuning suggestions to help you fully leverage the capabilities of
these accelerators, thereby achieving optimal performance.

* :doc:`../../rocm-for-ai/inference/vllm-benchmark`
* :doc:`../../system-optimization/mi300x`
* :doc:`../../rocm-for-ai/inference-optimization/workload`
* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

@@ -53,13 +53,10 @@ ROCm documentation is organized into the following categories:
:class-body: rocm-card-banner rocm-hue-8

* [GPU architecture overview](./conceptual/gpu-arch.md)
* [Input-Output Memory Management Unit (IOMMU)](./conceptual/iommu.rst)
* [File structure (Linux FHS)](./conceptual/file-reorg.md)
* [GPU isolation techniques](./conceptual/gpu-isolation.md)
* [Using CMake](./conceptual/cmake-packages.rst)
* [PCIe atomics in ROCm](./conceptual/pcie-atomics.rst)
* [Inception v3 with PyTorch](./conceptual/ai-pytorch-inception.md)
* [Oversubscription of hardware resources](./conceptual/oversubscription.rst)
:::

:::{grid-item-card} Reference

@@ -9,16 +9,14 @@
Data types and precision support
*************************************************************

This topic lists the supported data types of AMD GPUs and ROCm libraries.
Corresponding :doc:`HIP <hip:index>` data types are also noted.
This topic lists the data types support on AMD GPUs, ROCm libraries along
with corresponding :doc:`HIP <hip:index>` data types.

Integral types
==========================================
==============

The signed and unsigned integral types supported by ROCm are listed in
the following table, along with their corresponding HIP type and a short
description.

the following table.

.. list-table::
   :header-rows: 1
@@ -48,10 +46,9 @@ description.
.. _precision_support_floating_point_types:

Floating-point types
==========================================
====================

The floating-point types supported by ROCm are listed in the following
table, along with their corresponding HIP type and a short description.
The floating-point types supported by ROCm are listed in the following table.

.. image:: ../data/about/compatibility/floating-point-data-types.png
   :alt: Supported floating-point types
@@ -66,18 +63,18 @@ table, along with their corresponding HIP type and a short description.
     - Description
   *
     - float8 (E4M3)
     - ``-``
     - ``__hip_fp8_e4m3_fnuz``
     - An 8-bit floating-point number that mostly follows IEEE-754 conventions
       and **S1E4M3** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_ ,
       with expanded range and no infinity or signed zero. NaN is
       represented as negative zero.
       and **S1E4M3** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_,
       with expanded range and no infinity or signed zero. NaN is represented
       as negative zero.
   *
     - float8 (E5M2)
     - ``-``
     - ``__hip_fp8_e5m2_fnuz``
     - An 8-bit floating-point number mostly following IEEE-754 conventions and
       **S1E5M2** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_ ,
       with expanded range and no infinity or signed zero. NaN is
       represented as negative zero.
       **S1E5M2** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_,
       with expanded range and no infinity or signed zero. NaN is represented
       as negative zero.
   *
     - float16
     - ``half``
@@ -90,7 +87,7 @@ table, along with their corresponding HIP type and a short description.
       format.
   *
     - tensorfloat32
     - ``-``
     - Not available
     - A floating-point number that occupies 32 bits or less of storage,
       providing improved range compared to half (16-bit) format, at
       (potentially) greater throughput than single-precision (32-bit) formats.
@@ -117,12 +114,15 @@ table, along with their corresponding HIP type and a short description.

* In some AMD documents and articles, float8 (E5M2) is referred to as bfloat8.

ROCm support icons
==========================================
* The :doc:`low precision floating point types page <hip:reference/low_fp_types>`
  describes how to use these types in HIP with examples.

In the following sections, icons represent the level of support. These
icons, described in the following table, are also used in the library data type
support pages.
Level of support definitions
============================

In the following sections, icons represent the level of support. These icons,
described in the following table, are also used in the library data type support
pages.

.. list-table::
   :header-rows: 1
@@ -130,6 +130,11 @@ support pages.
   *
     - Icon
     - Definition

   *
     - NA
     - Not applicable

   *
     - ❌
     - Not supported
@@ -158,16 +163,15 @@ support pages.
* Any type can be emulated by software, but this page does not cover such
  cases.

Hardware data type support
Data type support by Hardware Architecture
==========================================

The following tables provide information about AMD Instinct accelerators support
for various data types. The MI200 series GPUs, which include MI210, MI250, and
MI250X, are based on the CDNA2 architecture. The MI300 series GPUs, consisting
of MI300A, MI300X, and MI325X, are built on the CDNA3 architecture.
The MI200 series GPUs, which include MI210, MI250, and MI250X, are based on the
CDNA2 architecture. The MI300 series GPUs, consisting of MI300A, MI300X, and
MI325X, are based on the CDNA3 architecture.

Compute units support
-------------------------------------------------------------------------------
---------------------

The following table lists data type support for compute units.

@@ -248,7 +252,7 @@ The following table lists data type support for compute units.
     - ✅

Matrix core support
-------------------------------------------------------------------------------
-------------------

The following table lists data type support for AMD GPU matrix cores.

@@ -329,7 +333,7 @@ The following table lists data type support for AMD GPU matrix cores.
     - ✅

Atomic operations support
-------------------------------------------------------------------------------
-------------------------

The following table lists data type support for atomic operations.

@@ -416,14 +420,14 @@ The following table lists data type support for atomic operations.
   performance impact when they frequently access the same memory address.

Data type support in ROCm libraries
==========================================
===================================

ROCm library support for int8, float8 (E4M3), float8 (E5M2), int16, float16,
bfloat16, int32, tensorfloat32, float32, int64, and float64 is listed in the
following tables.

Libraries input/output type support
-------------------------------------------------------------------------------
-----------------------------------

The following tables list ROCm library support for specific input and output
data types. Refer to the corresponding library data type support page for a
@@ -444,37 +448,37 @@ detailed description.
     - int32
     - int64
   *
     - hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
     - :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
     - ✅/✅
     - ❌/❌
     - ❌/❌
     - ❌/❌
   *
     - rocRAND (:doc:`details <rocrand:api-reference/data-type-support>`)
     - -/✅
     - -/✅
     - -/✅
     - -/✅
     - :doc:`rocRAND <rocrand:api-reference/data-type-support>`
     - NA/✅
     - NA/✅
     - NA/✅
     - NA/✅
   *
     - hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
     - -/✅
     - -/✅
     - -/✅
     - -/✅
     - :doc:`hipRAND <hiprand:api-reference/data-type-support>`
     - NA/✅
     - NA/✅
     - NA/✅
     - NA/✅
   *
     - rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
     - :doc:`rocPRIM <rocprim:reference/data-type-support>`
     - ✅/✅
     - ✅/✅
     - ✅/✅
     - ✅/✅
   *
     - hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
     - :doc:`hipCUB <hipcub:api-reference/data-type-support>`
     - ✅/✅
     - ✅/✅
     - ✅/✅
     - ✅/✅
   *
     - rocThrust (:doc:`details <rocthrust:data-type-support>`)
     - :doc:`rocThrust <rocthrust:data-type-support>`
     - ✅/✅
     - ✅/✅
     - ✅/✅
@@ -496,7 +500,7 @@ detailed description.
     - float32
     - float64
   *
     - hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
     - :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
     - ❌/❌
     - ❌/❌
     - ✅/✅
@@ -505,25 +509,25 @@ detailed description.
     - ❌/❌
     - ❌/❌
   *
     - rocRAND (:doc:`details <rocrand:api-reference/data-type-support>`)
     - -/❌
     - -/❌
     - -/✅
     - -/❌
     - -/❌
     - -/✅
     - -/✅
     - :doc:`rocRAND <rocrand:api-reference/data-type-support>`
     - NA/❌
     - NA/❌
     - NA/✅
     - NA/❌
     - NA/❌
     - NA/✅
     - NA/✅
   *
     - hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
     - -/❌
     - -/❌
     - -/✅
     - -/❌
     - -/❌
     - -/✅
     - -/✅
     - :doc:`hipRAND <hiprand:api-reference/data-type-support>`
     - NA/❌
     - NA/❌
     - NA/✅
     - NA/❌
     - NA/❌
     - NA/✅
     - NA/✅
   *
     - rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
     - :doc:`rocPRIM <rocprim:reference/data-type-support>`
     - ❌/❌
     - ❌/❌
     - ✅/✅
@@ -532,7 +536,7 @@ detailed description.
     - ✅/✅
     - ✅/✅
   *
     - hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
     - :doc:`hipCUB <hipcub:api-reference/data-type-support>`
     - ❌/❌
     - ❌/❌
     - ✅/✅
@@ -541,7 +545,7 @@ detailed description.
     - ✅/✅
     - ✅/✅
   *
     - rocThrust (:doc:`details <rocthrust:data-type-support>`)
     - :doc:`rocThrust <rocthrust:data-type-support>`
     - ❌/❌
     - ❌/❌
     - ⚠️/⚠️
@@ -550,9 +554,14 @@ detailed description.
     - ✅/✅
     - ✅/✅

.. note::

   As random number generation libraries, rocRAND and hipRAND only specify output
   data types for the random values they generate, with no need for input data
   types.

Libraries internal calculations type support
-------------------------------------------------------------------------------
--------------------------------------------

The following tables list ROCm library support for specific internal data types.
Refer to the corresponding library data type support page for a detailed
@@ -573,7 +582,7 @@ description.
     - int32
     - int64
   *
     - hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
     - :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
     - ❌
     - ❌
     - ✅
@@ -596,7 +605,7 @@ description.
     - float32
     - float64
   *
     - hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
     - :doc:`hipSPARSELt <hipsparselt:reference/data-type-support>`
     - ❌
     - ❌
     - ❌
@@ -18,6 +18,7 @@
|
||||
| [6.2.2](https://rocm.docs.amd.com/en/docs-6.2.2/) | September 27, 2024 |
|
||||
| [6.2.1](https://rocm.docs.amd.com/en/docs-6.2.1/) | September 20, 2024 |
|
||||
| [6.2.0](https://rocm.docs.amd.com/en/docs-6.2.0/) | August 2, 2024 |
|
||||
| [6.1.5](https://rocm.docs.amd.com/en/docs-6.1.5/) | March 13, 2025 |
|
||||
| [6.1.2](https://rocm.docs.amd.com/en/docs-6.1.2/) | June 4, 2024 |
|
||||
| [6.1.1](https://rocm.docs.amd.com/en/docs-6.1.1/) | May 8, 2024 |
|
||||
| [6.1.0](https://rocm.docs.amd.com/en/docs-6.1.0/) | Apr 16, 2024 |
|
||||
|
||||
@@ -44,6 +44,8 @@ subtrees:
title: Train a model with Megatron-LM
- file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training
title: Train a model with PyTorch
- file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext
title: Train a model with JAX MaxText
- file: how-to/rocm-for-ai/training/scale-model-training.rst
title: Scale model training

@@ -73,7 +75,7 @@ subtrees:
- file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
title: LLM inference frameworks
- file: how-to/rocm-for-ai/inference/vllm-benchmark.rst
title: Performance validation
title: Performance testing
- file: how-to/rocm-for-ai/inference/deploy-your-model.rst
title: Deploy your model

@@ -90,7 +92,7 @@ subtrees:
- file: how-to/rocm-for-ai/inference-optimization/profiling-and-debugging.rst
title: Profile and debug
- file: how-to/rocm-for-ai/inference-optimization/workload.rst
title: Workload tuning
title: Workload optimization

- url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
title: AI tutorials
@@ -99,20 +101,8 @@ subtrees:
title: Use ROCm for HPC
- file: how-to/system-optimization/index.rst
title: System optimization
subtrees:
- entries:
- file: how-to/system-optimization/mi300x.rst
title: AMD Instinct MI300X
- file: how-to/system-optimization/mi300a.rst
title: AMD Instinct MI300A
- file: how-to/system-optimization/mi200.md
title: AMD Instinct MI200
- file: how-to/system-optimization/mi100.md
title: AMD Instinct MI100
- file: how-to/system-optimization/w6000-v620.md
title: AMD RDNA 2
- file: how-to/tuning-guides/mi300x/index.rst
title: AMD MI300X performance validation and tuning
- file: how-to/gpu-performance/mi300x.rst
title: AMD Instinct MI300X performance guides
- file: how-to/system-debugging.md
- file: conceptual/compiler-topics.md
title: Use advanced compiler features
@@ -164,20 +154,14 @@ subtrees:
title: AMD Instinct MI100/CDNA1 ISA
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
title: White paper
- file: conceptual/iommu.rst
title: Input-Output Memory Management Unit (IOMMU)
- file: conceptual/file-reorg.md
title: File structure (Linux FHS)
- file: conceptual/gpu-isolation.md
title: GPU isolation techniques
- file: conceptual/cmake-packages.rst
title: Using CMake
- file: conceptual/pcie-atomics.rst
title: PCIe atomics in ROCm
- file: conceptual/ai-pytorch-inception.md
title: Inception v3 with PyTorch
- file: conceptual/oversubscription.rst
title: Oversubscription of hardware resources

- caption: Reference
entries:

@@ -1,4 +1,4 @@
rocm-docs-core==1.17.0
rocm-docs-core==1.18.2
sphinx-reredirects
sphinx-sitemap
sphinxcontrib.datatemplates==0.11.0

@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
#    pip-compile docs/sphinx/requirements.in
#    pip-compile requirements.in
#
accessible-pygments==0.0.5
    # via pydata-sphinx-theme
@@ -10,46 +10,45 @@ alabaster==1.0.0
    # via sphinx
asttokens==3.0.0
    # via stack-data
attrs==25.1.0
attrs==25.3.0
    # via
    #   jsonschema
    #   jupyter-cache
    #   referencing
babel==2.16.0
babel==2.17.0
    # via
    #   pydata-sphinx-theme
    #   sphinx
beautifulsoup4==4.12.3
beautifulsoup4==4.13.3
    # via pydata-sphinx-theme
breathe==4.35.0
breathe==4.36.0
    # via rocm-docs-core
certifi==2024.8.30
certifi==2025.1.31
    # via requests
cffi==1.17.1
    # via
    #   cryptography
    #   pynacl
charset-normalizer==3.4.0
charset-normalizer==3.4.1
    # via requests
click==8.1.7
click==8.1.8
    # via
    #   jupyter-cache
    #   sphinx-external-toc
comm==0.2.2
    # via ipykernel
cryptography==44.0.1
cryptography==44.0.2
    # via pyjwt
debugpy==1.8.12
debugpy==1.8.13
    # via ipykernel
decorator==5.1.1
decorator==5.2.1
    # via ipython
defusedxml==0.7.1
    # via sphinxcontrib-datatemplates
deprecated==1.2.15
deprecated==1.2.18
    # via pygithub
docutils==0.21.2
    # via
    #   breathe
    #   myst-parser
    #   pydata-sphinx-theme
    #   sphinx
@@ -57,13 +56,13 @@ exceptiongroup==1.2.2
    # via ipython
executing==2.2.0
    # via stack-data
fastjsonschema==2.20.0
fastjsonschema==2.21.1
    # via
    #   nbformat
    #   rocm-docs-core
gitdb==4.0.11
gitdb==4.0.12
    # via gitpython
gitpython==3.1.43
gitpython==3.1.44
    # via rocm-docs-core
greenlet==3.1.1
    # via sqlalchemy
@@ -77,13 +76,13 @@ importlib-metadata==8.6.1
    #   myst-nb
ipykernel==6.29.5
    # via myst-nb
ipython==8.31.0
ipython==8.34.0
    # via
    #   ipykernel
    #   myst-nb
jedi==0.19.2
    # via ipython
jinja2==3.1.5
jinja2==3.1.6
    # via
    #   myst-parser
    #   sphinx
@@ -117,9 +116,9 @@ mdit-py-plugins==0.4.2
    # via myst-parser
mdurl==0.1.2
    # via markdown-it-py
myst-nb==1.1.2
myst-nb==1.2.0
    # via rocm-docs-core
myst-parser==4.0.0
myst-parser==4.0.1
    # via myst-nb
nbclient==0.10.2
    # via
@@ -135,16 +134,17 @@ nest-asyncio==1.6.0
packaging==24.2
    # via
    #   ipykernel
    #   pydata-sphinx-theme
    #   sphinx
parso==0.8.4
    # via jedi
pexpect==4.9.0
    # via ipython
platformdirs==4.3.6
platformdirs==4.3.7
    # via jupyter-core
prompt-toolkit==3.0.50
    # via ipython
psutil==6.1.1
psutil==7.0.0
    # via ipykernel
ptyprocess==0.7.0
    # via pexpect
@@ -152,19 +152,19 @@ pure-eval==0.2.3
    # via stack-data
pycparser==2.22
    # via cffi
pydata-sphinx-theme==0.16.0
pydata-sphinx-theme==0.15.4
    # via
    #   rocm-docs-core
    #   sphinx-book-theme
pygithub==2.5.0
pygithub==2.6.1
    # via rocm-docs-core
pygments==2.18.0
pygments==2.19.1
    # via
    #   accessible-pygments
    #   ipython
    #   pydata-sphinx-theme
    #   sphinx
pyjwt[crypto]==2.10.0
pyjwt[crypto]==2.10.1
    # via pygithub
pynacl==1.5.0
    # via pygithub
@@ -178,7 +178,7 @@ pyyaml==6.0.2
    #   rocm-docs-core
    #   sphinx-external-toc
    #   sphinxcontrib-datatemplates
pyzmq==26.2.0
pyzmq==26.3.0
    # via
    #   ipykernel
    #   jupyter-client
@@ -190,15 +190,15 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
rocm-docs-core==1.17.0
rocm-docs-core==1.18.2
    # via -r requirements.in
rpds-py==0.22.3
rpds-py==0.24.0
    # via
    #   jsonschema
    #   referencing
six==1.17.0
    # via python-dateutil
smmap==5.0.1
smmap==5.0.2
    # via gitdb
snowballstemmer==2.2.0
    # via sphinx
@@ -220,7 +220,7 @@ sphinx==8.1.3
    #   sphinx-sitemap
    #   sphinxcontrib-datatemplates
    #   sphinxcontrib-runcmd
sphinx-book-theme==1.1.3
sphinx-book-theme==1.1.4
    # via rocm-docs-core
sphinx-copybutton==0.5.2
    # via rocm-docs-core
@@ -228,16 +228,16 @@ sphinx-design==0.6.1
    # via rocm-docs-core
sphinx-external-toc==1.0.1
    # via rocm-docs-core
sphinx-notfound-page==1.0.4
sphinx-notfound-page==1.1.0
    # via rocm-docs-core
sphinx-reredirects==0.1.5
    # via -r docs/sphinx/requirements.in
sphinx-reredirects==0.1.6
    # via -r requirements.in
sphinx-sitemap==2.6.0
    # via -r docs/sphinx/requirements.in
    # via -r requirements.in
sphinxcontrib-applehelp==2.0.0
    # via sphinx
sphinxcontrib-datatemplates==0.11.0
    # via -r docs/sphinx/requirements.in
    # via -r requirements.in
sphinxcontrib-devhelp==2.0.0
    # via sphinx
sphinxcontrib-htmlhelp==2.1.0
@@ -250,13 +250,13 @@ sphinxcontrib-runcmd==0.2.0
    # via sphinxcontrib-datatemplates
sphinxcontrib-serializinghtml==2.0.0
    # via sphinx
sqlalchemy==2.0.37
sqlalchemy==2.0.40
    # via jupyter-cache
stack-data==0.6.3
    # via ipython
tabulate==0.9.0
    # via jupyter-cache
tomli==2.1.0
tomli==2.2.1
    # via sphinx
tornado==6.4.2
    # via
@@ -272,21 +272,22 @@ traitlets==5.14.3
    #   matplotlib-inline
    #   nbclient
    #   nbformat
typing-extensions==4.12.2
typing-extensions==4.13.0
    # via
    #   beautifulsoup4
    #   ipython
    #   myst-nb
    #   pydata-sphinx-theme
    #   pygithub
    #   referencing
    #   sqlalchemy
urllib3==2.2.3
urllib3==2.3.0
    # via
    #   pygithub
    #   requests
wcwidth==0.2.13
    # via prompt-toolkit
wrapt==1.17.0
wrapt==1.17.2
    # via deprecated
zipp==3.21.0
    # via importlib-metadata

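A note on the two requirements diffs above: requirements.txt is generated output, not hand-maintained. Its header records the pip-compile invocation, and the hunk shows that invocation moving from docs/sphinx/requirements.in to a plain requirements.in. A minimal sketch of regenerating the pins after editing requirements.in (the venv setup is an assumption; only the pip-compile command comes from the file header):

    # Sketch: regenerate the pinned requirements.txt (header says pip-compile, Python 3.10)
    python3.10 -m venv .venv && . .venv/bin/activate
    pip install pip-tools                 # provides the pip-compile command
    pip-compile requirements.in           # rewrites requirements.txt with pinned versions
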
@@ -68,85 +68,6 @@ set_address_sanitizer_off() {
    export LDFLAGS=""
}

build_miopen_ckProf() {
    ENABLE_ADDRESS_SANITIZER=false
    echo "Start Building Composable Kernel Profiler"
    if [ "${ENABLE_ADDRESS_SANITIZER}" == "true" ]; then
        set_asan_env_vars
        set_address_sanitizer_on
    else
        unset_asan_env_vars
        set_address_sanitizer_off
    fi

    cd $COMPONENT_SRC
    cd "$BUILD_DIR"
    rm -rf *

    architectures='gfx10 gfx11 gfx90 gfx94'
    if [ -n "$GPU_ARCHS" ]; then
        architectures=$(echo ${GPU_ARCHS} | awk -F';' '{for(i=1;i<=NF;i++) a[substr($i,1,5)]} END{for(i in a) printf i" "}')
    fi

    for arch in ${architectures}
    do
        if [ "${ASAN_CMAKE_PARAMS}" == "true" ] ; then
            cmake -DBUILD_DEV=OFF \
                -DCMAKE_PREFIX_PATH="${ROCM_PATH%-*}/lib/cmake;${ROCM_PATH%-*}/$ASAN_LIBDIR;${ROCM_PATH%-*}/llvm;${ROCM_PATH%-*}" \
                -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-'RelWithDebInfo'} \
                -DCMAKE_SHARED_LINKER_FLAGS_INIT="-Wl,--enable-new-dtags,--rpath,$ROCM_ASAN_LIB_RPATH" \
                -DCMAKE_EXE_LINKER_FLAGS_INIT="-Wl,--enable-new-dtags,--rpath,$ROCM_ASAN_EXE_RPATH" \
                -DCMAKE_VERBOSE_MAKEFILE=1 \
                -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE \
                -DCMAKE_INSTALL_PREFIX="${ROCM_PATH}" \
                -DCMAKE_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
                -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
                -DROCM_SYMLINK_LIBS=OFF \
                -DCPACK_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
                -DROCM_DISABLE_LDCONFIG=ON \
                -DROCM_PATH="${ROCM_PATH}" \
                -DCPACK_GENERATOR="${PKGTYPE^^}" \
                -DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/clang++" \
                -DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/clang" \
                ${LAUNCHER_FLAGS} \
                -DPROFILER_ONLY=ON \
                -DENABLE_ASAN_PACKAGING=true \
                -DGPU_ARCH="${arch}" \
                "$COMPONENT_SRC"
        else
            cmake -DBUILD_DEV=OFF \
                -DCMAKE_PREFIX_PATH="${ROCM_PATH%-*}" \
                -DCMAKE_BUILD_TYPE=Release \
                -DCMAKE_SHARED_LINKER_FLAGS_INIT='-Wl,--enable-new-dtags,--rpath,$ORIGIN' \
                -DCMAKE_EXE_LINKER_FLAGS_INIT='-Wl,--enable-new-dtags,--rpath,$ORIGIN/../lib' \
                -DCMAKE_VERBOSE_MAKEFILE=1 \
                -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE \
                -DCMAKE_INSTALL_PREFIX="${ROCM_PATH}" \
                -DCMAKE_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
                -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \
                -DROCM_SYMLINK_LIBS=OFF \
                -DCPACK_PACKAGING_INSTALL_PREFIX="${ROCM_PATH}" \
                -DROCM_DISABLE_LDCONFIG=ON \
                -DROCM_PATH="${ROCM_PATH}" \
                -DCPACK_GENERATOR="${PKGTYPE^^}" \
                -DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/clang++" \
                -DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/clang" \
                ${LAUNCHER_FLAGS} \
                -DPROFILER_ONLY=ON \
                -DGPU_ARCH="${arch}" \
                "$COMPONENT_SRC"
        fi

        cmake --build . -- -j${PROC} package
        cp ./*ckprofiler*.${PKGTYPE} $PACKAGE_DIR
        rm -rf *
    done
    rm -rf _CPack_Packages/ && find -name '*.o' -delete

    echo "Finished building Composable Kernel"
    show_build_cache_stats
}

clean_miopen_ck() {
    echo "Cleaning MIOpen-CK build directory: ${BUILD_DIR} ${PACKAGE_DIR}"
    rm -rf "$BUILD_DIR" "$PACKAGE_DIR"

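The awk one-liner in build_miopen_ckProf above collapses a semicolon-separated GPU_ARCHS list to unique five-character family prefixes, so one profiler is built per GPU family rather than per target. A minimal sketch of that behavior, with an illustrative (hypothetical) target list:

    # Hypothetical target list, for illustration only
    GPU_ARCHS='gfx90a;gfx942;gfx1100'
    echo ${GPU_ARCHS} | awk -F';' '{for(i=1;i<=NF;i++) a[substr($i,1,5)]} END{for(i in a) printf i" "}'
    # Prints the deduplicated prefixes, e.g. gfx90 gfx94 gfx11
    # (awk's "for (i in a)" does not guarantee output order)
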
@@ -42,7 +42,6 @@ DEB_PATH="$(getDebPath $PROJ_NAME)"
RPM_PATH="$(getRpmPath $PROJ_NAME)"
INSTALL_PATH="${ROCM_INSTALL_PATH}/lib/llvm"
LLVM_ROOT_LCL="${LLVM_ROOT}"
ROCM_WHEEL_DIR="${BUILD_PATH}/_wheel"

TARGET="all"
MAKEOPTS="$DASH_JAY"
@@ -150,7 +149,6 @@ ENABLE_RUNTIMES="$ENABLE_RUNTIMES;libcxx;libcxxabi"
BOOTSTRAPPING_BUILD_LIBCXX=1

clean_lightning() {
    rm -rf "$ROCM_WHEEL_DIR"
    rm -rf "$BUILD_PATH"
    rm -rf "$DEB_PATH"
    rm -rf "$RPM_PATH"
@@ -332,15 +330,6 @@ build_lightning() {
    echo "End Workaround for race condition"
    cmake --build . -- $MAKEOPTS

    case "$DISTRO_ID" in
    (rhel*|centos*)
        RHEL_BUILD=1
        ;;
    (*)
        RHEL_BUILD=0
        ;;
    esac

    if [ $SKIP_LIT_TESTS -eq 0 ]; then
        if [ $RHEL_BUILD -eq 1 ]; then
            cmake --build . -- $MAKEOPTS check-lld check-mlir
@@ -1158,9 +1147,4 @@ case $TARGET in
(*) die "Invalid target $TARGET" ;;
esac

if [[ $WHEEL_PACKAGE == true ]]; then
    echo "Wheel Package build started !!!!"
    create_wheel_package
fi

echo "Operation complete"

@@ -1,171 +0,0 @@
#!/bin/bash

source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"

printUsage() {
    echo
    echo "Usage: ${BASH_SOURCE##*/} [options ...]"
    echo
    echo "Options:"
    echo "  -c,  --clean              Clean output and delete all intermediate work"
    echo "  -s,  --static             Build static lib (.a) instead of dynamic/shared (.so)"
    echo "  -p,  --package <type>     Specify packaging format"
    echo "  -r,  --release            Make a release build instead of a debug build"
    echo "  -a,  --address_sanitizer  Enable address sanitizer"
    echo "  -o,  --outdir <pkg_type>  Print path of output directory containing packages of
                                      type referred to by pkg_type"
    echo "  -w,  --wheel              Creates a python wheel package of omniperf.
                                      It needs to be used along with the -r option"
    echo "  -h,  --help               Prints this help"
    echo
    echo "Possible values for <type>:"
    echo "  deb -> Debian format (default)"
    echo "  rpm -> RPM format"
    echo

    return 0
}

API_NAME="omniperf"
PROJ_NAME="$API_NAME"
LIB_NAME="lib${API_NAME}"
TARGET="build"
MAKETARGET="deb"
PACKAGE_ROOT="$(getPackageRoot)"
PACKAGE_LIB="$(getLibPath)"
BUILD_DIR="$(getBuildPath $API_NAME)"
PACKAGE_DEB="$(getPackageRoot)/deb/$API_NAME"
PACKAGE_RPM="$(getPackageRoot)/rpm/$API_NAME"
ROCM_WHEEL_DIR="${BUILD_DIR}/_wheel"
BUILD_TYPE="Debug"
MAKE_OPTS="$DASH_JAY -C $BUILD_DIR"
SHARED_LIBS="ON"
CLEAN_OR_OUT=0;
MAKETARGET="deb"
PKGTYPE="deb"
WHEEL_PACKAGE=false

# Parse the arguments
VALID_STR=$(getopt -o hcraso:p:w --long help,clean,release,static,address_sanitizer,outdir:,package:,wheel -- "$@")
eval set -- "$VALID_STR"

while true ;
do
    case "$1" in
        -h | --help)
            printUsage ; exit 0;;
        -c | --clean)
            TARGET="clean" ; ((CLEAN_OR_OUT|=1)) ; shift ;;
        -r | --release)
            BUILD_TYPE="Release" ; shift ;;
        -a | --address_sanitizer)
            set_asan_env_vars
            set_address_sanitizer_on ; shift ;;
        -s | --static)
            SHARED_LIBS="OFF" ; shift ;;
        -o | --outdir)
            TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
        -p | --package)
            MAKETARGET="$2" ; shift 2 ;;
        -w | --wheel)
            WHEEL_PACKAGE=true ; shift ;;
        --) shift; break;; # end delimiter
        *)
            echo " This should never be reached, but just in case: UNEXPECTED ERROR Parm : [$1] ">&2 ; exit 20;;
    esac
done

RET_CONFLICT=1
check_conflicting_options "$CLEAN_OR_OUT" "$PKGTYPE" "$MAKETARGET"
if [ $RET_CONFLICT -ge 30 ]; then
    print_vars "$API_NAME" "$TARGET" "$BUILD_TYPE" "$SHARED_LIBS" "$CLEAN_OR_OUT" "$PKGTYPE" "$MAKETARGET"
    exit $RET_CONFLICT
fi

clean() {
    echo "Cleaning $PROJ_NAME"
    rm -rf "$ROCM_WHEEL_DIR"
    rm -rf "$BUILD_DIR"
    rm -rf "$PACKAGE_DEB"
    rm -rf "$PACKAGE_RPM"
    rm -rf "$PACKAGE_ROOT/${PROJ_NAME:?}"
    rm -rf "$PACKAGE_LIB/${LIB_NAME:?}"*
}

build() {
    echo "Building $PROJ_NAME"
    if [ "$DISTRO_ID" = centos-7 ]; then
        echo "Skip make and uploading packages for Omniperf on the CentOS 7 distro, due to a python dependency"
        exit 0
    fi

    if [ ! -d "$BUILD_DIR" ]; then
        mkdir -p "$BUILD_DIR"
        pushd "$BUILD_DIR" || exit

        echo "ROCm CMake Params: $(rocm_cmake_params)"
        echo "ROCm Common CMake Params: $(rocm_common_cmake_params)"

        print_lib_type $SHARED_LIBS
        cmake \
            $(rocm_cmake_params) \
            $(rocm_common_cmake_params) \
            -DCHECK_PYTHON_DEPS=NO \
            -DPYTHON_DEPS=${BUILD_DIR}/python-libs \
            -DMOD_INSTALL_PATH=${BUILD_DIR}/modulefiles \
            "$OMNIPERF_ROOT"
    fi

    make $MAKE_OPTS
    make $MAKE_OPTS install
    make $MAKE_OPTS package

    copy_if DEB "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_DEB" "$BUILD_DIR/${API_NAME}"*.deb
    copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_RPM" "$BUILD_DIR/${API_NAME}"*.rpm
}

create_wheel_package() {
    echo "Creating Omniperf wheel package"

    # Copy the setup.py generator to the build folder
    mkdir -p "$ROCM_WHEEL_DIR"
    cp -f "$SCRIPT_ROOT"/generate_setup_py.py "$ROCM_WHEEL_DIR"
    cp -f "$SCRIPT_ROOT"/repackage_wheel.sh "$ROCM_WHEEL_DIR"
    cd "$ROCM_WHEEL_DIR" || exit

    # Currently only supports python3.6
    ./repackage_wheel.sh "$BUILD_DIR"/*.rpm python3.6

    # Copy the wheel created to the RPM folder, which will be uploaded to artifactory
    copy_if WHL "WHL" "$PACKAGE_RPM" "$ROCM_WHEEL_DIR"/dist/*.whl
}

print_output_directory() {
    case ${PKGTYPE} in
        ("deb")
            echo "${PACKAGE_DEB}";;
        ("rpm")
            echo "${PACKAGE_RPM}";;
        (*)
            echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2; exit 1;;
    esac
    exit
}

verifyEnvSetup

case "$TARGET" in
    (clean) clean ;;
    (build) build ;;
    (outdir) print_output_directory ;;
    (*) die "Invalid target $TARGET" ;;
esac

if [[ $WHEEL_PACKAGE == true ]]; then
    echo "Wheel Package build started !!!!"
    create_wheel_package
fi

echo "Operation complete"
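For reference, these build scripts share a common command-line surface; a typical invocation of the omniperf script above might look like the following (the wrapper filename is hypothetical, since the deleted file's path is not shown in this view; the options come from its printUsage text):

    # Hypothetical filename; options taken from printUsage above
    ./build_omniperf.sh -r -p deb   # release build, packaged as .deb
    ./build_omniperf.sh -r -w       # release build plus python wheel repackaging
    ./build_omniperf.sh -o deb      # print the .deb output directory
    ./build_omniperf.sh -c          # clean all build and package outputs
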
@@ -1,191 +0,0 @@
#!/bin/bash

source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"

printUsage() {
    echo
    echo "Usage: ${BASH_SOURCE##*/} [options ...]"
    echo
    echo "Options:"
    echo "  -c,  --clean              Clean output and delete all intermediate work"
    echo "  -s,  --static             Build static lib (.a) instead of dynamic/shared (.so)"
    echo "  -p,  --package <type>     Specify packaging format"
    echo "  -r,  --release            Make a release build instead of a debug build"
    echo "  -a,  --address_sanitizer  Enable address sanitizer"
    echo "  -o,  --outdir <pkg_type>  Print path of output directory containing packages of
                                      type referred to by pkg_type"
    echo "  -w,  --wheel              Creates a python wheel package of omnitrace.
                                      It needs to be used along with the -r option"
    echo "  -h,  --help               Prints this help"
    echo
    echo "Possible values for <type>:"
    echo "  deb -> Debian format (default)"
    echo "  rpm -> RPM format"
    echo

    return 0
}

API_NAME="omnitrace"
PROJ_NAME="$API_NAME"
LIB_NAME="lib${API_NAME}"
TARGET="build"
MAKETARGET="deb"
PACKAGE_ROOT="$(getPackageRoot)"
PACKAGE_LIB="$(getLibPath)"
BUILD_DIR="$(getBuildPath $API_NAME)"
PACKAGE_DEB="$(getPackageRoot)/deb/$API_NAME"
PACKAGE_RPM="$(getPackageRoot)/rpm/$API_NAME"
BUILD_TYPE="Debug"
MAKE_OPTS="-j 8"
SHARED_LIBS="ON"
CLEAN_OR_OUT=0
MAKETARGET="deb"
PKGTYPE="deb"
ASAN=0

# Parse the arguments
VALID_STR=$(getopt -o hcraso:p:w --long help,clean,release,address_sanitizer,static,outdir:,package:,wheel -- "$@")
eval set -- "$VALID_STR"

while true; do
    case "$1" in
        -h | --help)
            printUsage
            exit 0
            ;;
        -c | --clean)
            TARGET="clean"
            ((CLEAN_OR_OUT |= 1))
            shift
            ;;
        -r | --release)
            BUILD_TYPE="RelWithDebInfo"
            shift
            ;;
        -a | --address_sanitizer)
            ack_and_ignore_asan
            ASAN=1
            shift
            ;;
        -s | --static)
            SHARED_LIBS="OFF"
            shift
            ;;
        -o | --outdir)
            TARGET="outdir"
            PKGTYPE=$2
            ((CLEAN_OR_OUT |= 2))
            shift 2
            ;;
        -p | --package)
            MAKETARGET="$2"
            shift 2
            ;;
        -w | --wheel)
            echo "omnitrace: wheel build option accepted and ignored"
            shift
            ;;
        --)
            shift
            break
            ;;
        *)
            echo " This should never be reached, but just in case: UNEXPECTED ERROR Parm : [$1] " >&2
            exit 20
            ;;
    esac
done

RET_CONFLICT=1
check_conflicting_options $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
if [ $RET_CONFLICT -ge 30 ]; then
    print_vars $API_NAME $TARGET $BUILD_TYPE $SHARED_LIBS $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
    exit $RET_CONFLICT
fi

clean() {
    echo "Cleaning $PROJ_NAME"
    rm -rf "$BUILD_DIR"
    rm -rf "$PACKAGE_DEB"
    rm -rf "$PACKAGE_RPM"
    rm -rf "$PACKAGE_ROOT/${PROJ_NAME:?}"
    rm -rf "$PACKAGE_LIB/${LIB_NAME:?}"*
}

build_omnitrace() {
    echo "Building $PROJ_NAME"
    if [ "$DISTRO_ID" = "mariner-2.0" ] || [ "$DISTRO_ID" = "ubuntu-24.04" ] || [ "$DISTRO_ID" = "azurelinux-3.0" ]; then
        echo "Skip make and uploading packages for Omnitrace on \"${DISTRO_ID}\" distro"
        exit 0
    fi

    if [ $ASAN == 1 ]; then
        echo "Skip make and uploading packages for Omnitrace on ASAN build"
        exit 0
    fi
    if [ ! -d "$BUILD_DIR" ]; then
        mkdir -p "$BUILD_DIR"
        echo "Created build directory: $BUILD_DIR"
    fi

    echo "Build directory: $BUILD_DIR"
    pushd "$BUILD_DIR" || exit
    print_lib_type $SHARED_LIBS

    echo "ROCm CMake Params: $(rocm_cmake_params)"
    echo "ROCm Common CMake Params: $(rocm_common_cmake_params)"

    if [ $ASAN == 1 ]; then
        echo "Address Sanitizer path"
    else
        cmake \
            $(rocm_cmake_params) \
            $(rocm_common_cmake_params) \
            -DOMNITRACE_BUILD_{LIBUNWIND,DYNINST}=ON \
            -DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON \
            "$OMNITRACE_ROOT"
    fi

    popd || exit

    echo "Make Options: $MAKE_OPTS"
    cmake --build "$BUILD_DIR" --target all -- $MAKE_OPTS
    cmake --build "$BUILD_DIR" --target install -- $MAKE_OPTS
    cmake --build "$BUILD_DIR" --target package -- $MAKE_OPTS

    copy_if DEB "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_DEB" "$BUILD_DIR/${API_NAME}"*.deb
    copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$PACKAGE_RPM" "$BUILD_DIR/${API_NAME}"*.rpm
}

print_output_directory() {
    case ${PKGTYPE} in
        "deb")
            echo "${PACKAGE_DEB}"
            ;;
        "rpm")
            echo "${PACKAGE_RPM}"
            ;;
        *)
            echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2
            exit 1
            ;;
    esac
    exit
}

verifyEnvSetup

case "$TARGET" in
    clean) clean ;;
    build) build_omnitrace ;;
    outdir) print_output_directory ;;
    *) die "Invalid target $TARGET" ;;
esac

echo "Operation complete"
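The omniperf and omnitrace scripts above (and the OpenCL ICD loader script below) all use the same getopt-based parsing idiom: normalize the arguments with getopt, re-set the positional parameters, then dispatch in a while/case loop. A minimal self-contained sketch of the idiom, with hypothetical options:

    #!/bin/bash
    # Minimal sketch of the getopt pattern used across these build scripts.
    # Options and defaults here are illustrative, not from any specific script.
    TARGET="build"
    BUILD_TYPE="Debug"
    VALID_STR=$(getopt -o hcr --long help,clean,release -- "$@") || exit 20
    eval set -- "$VALID_STR"
    while true; do
        case "$1" in
            -h | --help)    echo "usage: $0 [-c|-r|-h]" ; exit 0 ;;
            -c | --clean)   TARGET="clean" ; shift ;;
            -r | --release) BUILD_TYPE="Release" ; shift ;;
            --) shift ; break ;;                  # end of parsed options
            *)  echo "unexpected arg: $1" >&2 ; exit 20 ;;
        esac
    done
    echo "target=$TARGET build_type=$BUILD_TYPE"
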
@@ -1,141 +0,0 @@
#!/bin/bash

source "$(dirname "${BASH_SOURCE}")/compute_utils.sh"
PROJ_NAME=OpenCL-ICD-Loader
TARGET="build"
MAKEOPTS="$DASH_JAY"
BUILD_TYPE="Debug"
PACKAGE_ROOT="$(getPackageRoot)"
PACKAGE_DEB="$PACKAGE_ROOT/deb/${PROJ_NAME,,}"
PACKAGE_RPM="$PACKAGE_ROOT/rpm/${PROJ_NAME,,}"
CLEAN_OR_OUT=0;
PKGTYPE="deb"
MAKETARGET="deb"
API_NAME="rocm-opencl-icd-loader"

printUsage() {
    echo
    echo "Usage: $(basename "${BASH_SOURCE}") [options ...]"
    echo
    echo "Options:"
    echo "  -c,  --clean           Clean output and delete all intermediate work"
    echo "  -p,  --package <type>  Specify packaging format"
    echo "  -r,  --release         Make a release build instead of a debug build"
    echo "  -h,  --help            Prints this help"
    echo "  -o,  --outdir          Print path of output directory containing packages"
    echo "  -s,  --static          This component does not support static builds; the param is accepted and ignored, with no effect on the build"
    echo
    echo "Possible values for <type>:"
    echo "  deb -> Debian format (default)"
    echo "  rpm -> RPM format"
    echo
    return 0
}

RET_CONFLICT=1
check_conflicting_options $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
if [ $RET_CONFLICT -ge 30 ]; then
    print_vars $TARGET $BUILD_TYPE $CLEAN_OR_OUT $PKGTYPE $MAKETARGET
    exit $RET_CONFLICT
fi

clean_opencl_icd_loader() {
    echo "Cleaning $PROJ_NAME"
    rm -rf "$PACKAGE_DEB"
    rm -rf "$PACKAGE_RPM"
    rm -rf "$PACKAGE_ROOT/${PROJ_NAME,,}"
}

copy_pkg_files_to_rocm() {
    local comp_folder=$1
    local comp_pkg_name=$2

    cd "${OUT_DIR}/${PKGTYPE}/${comp_folder}" || exit 2
    if [ "${PKGTYPE}" = 'deb' ]; then
        dpkg-deb -x ${comp_pkg_name}_*.deb pkg/
    else
        mkdir pkg && pushd pkg/ || exit 2
        if [[ "${comp_pkg_name}" != *-dev* ]]; then
            rpm2cpio ../${comp_pkg_name}-*.rpm | cpio -idmv
        else
            rpm2cpio ../${comp_pkg_name}el-*.rpm | cpio -idmv
        fi
        popd || exit 2
    fi
    ls ./pkg -alt
    cp -r ./pkg/*/rocm*/* "${ROCM_PATH}" || exit 2
    rm -rf pkg/
}

build_opencl_icd_loader() {
    echo "Downloading $PROJ_NAME package"
    if [ "$DISTRO_NAME" = ubuntu ]; then
        mkdir -p "$PACKAGE_DEB"
        local rocm_ver=${ROCM_VERSION}
        if [ ${ROCM_VERSION##*.} = 0 ]; then
            rocm_ver=${ROCM_VERSION%.*}
        fi
        local url="https://repo.radeon.com/rocm/apt/${rocm_ver}/pool/main/r/${API_NAME}/"
        local package
        package=$(curl -s "$url" | grep -Po 'href="\K[^"]*' | grep "${DISTRO_RELEASE}" | head -n 1)

        if [ -z "$package" ]; then
            echo "No package found for Ubuntu version $DISTRO_RELEASE"
            exit 1
        fi

        wget -t3 -P "$PACKAGE_DEB" "${url}${package}"
        copy_pkg_files_to_rocm ${PROJ_NAME,,} ${API_NAME}
    else
        echo "$DISTRO_ID is not supported..."
        exit 2
    fi

    echo "Installing $PROJ_NAME package"
}

print_output_directory() {
    case ${PKGTYPE} in
        ("deb")
            echo ${PACKAGE_DEB};;
        ("rpm")
            echo ${PACKAGE_RPM};;
        (*)
            echo "Invalid package type \"${PKGTYPE}\" provided for -o" >&2; exit 1;;
    esac
    exit
}

VALID_STR=$(getopt -o hcraswlo:p: --long help,clean,release,outdir:,package: -- "$@")
eval set -- "$VALID_STR"
while true ;
do
    case "$1" in
        (-c | --clean )
            TARGET="clean" ; ((CLEAN_OR_OUT|=1)) ; shift ;;
        (-r | --release )
            BUILD_TYPE="RelWithDebInfo" ; shift ;;
        (-h | --help )
            printUsage ; exit 0 ;;
        (-a | --address_sanitizer)
            ack_and_ignore_asan ; shift ;;
        (-o | --outdir)
            TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
        (-p | --package)
            MAKETARGET="$2" ; shift 2;;
        (-s | --static)
            echo "-s parameter accepted but ignored" ; shift ;;
        --) shift; break;;
        (*)
            echo " This should never be reached, but just in case: UNEXPECTED ERROR Parm : [$1] ">&2 ; exit 20;;
    esac
done

case $TARGET in
    (clean) clean_opencl_icd_loader ;;
    (build) build_opencl_icd_loader ;;
    (outdir) print_output_directory ;;
    (*) die "Invalid target $TARGET" ;;
esac

echo "Operation complete"
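One subtlety in build_opencl_icd_loader above: the script strips a trailing .0 patch level before composing the repo.radeon.com URL, so an x.y.0 release maps to the repository's x.y path. A short illustration with a hypothetical version value:

    # Hypothetical value, for illustration: a x.y.0 release maps to the x.y repo path
    ROCM_VERSION=6.3.0
    rocm_ver=${ROCM_VERSION}
    if [ ${ROCM_VERSION##*.} = 0 ]; then   # patch component is 0
        rocm_ver=${ROCM_VERSION%.*}        # strip ".0" -> 6.3
    fi
    echo "https://repo.radeon.com/rocm/apt/${rocm_ver}/"   # .../rocm/apt/6.3/
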
@@ -32,7 +32,6 @@ ROCM_CMAKE_BUILD_DIR="$(getBuildPath rocm-cmake)"
ROCM_CMAKE_BUILD_DIR="$(getBuildPath rocm-cmake)"
ROCM_CMAKE_PACKAGE_DEB="$(getPackageRoot)/deb/rocm-cmake"
ROCM_CMAKE_PACKAGE_RPM="$(getPackageRoot)/rpm/rocm-cmake"
ROCM_WHEEL_DIR="${ROCM_CMAKE_BUILD_DIR}/_wheel"
ROCM_CMAKE_BUILD_TYPE="debug"
BUILD_TYPE="Debug"
SHARED_LIBS="ON"
@@ -56,8 +55,6 @@ do
    ack_and_ignore_asan ; shift ;;
(-s | --static)
    SHARED_LIBS="OFF" ; shift ;;
(-w | --wheel)
    WHEEL_PACKAGE=true ; shift ;;
(-o | --outdir)
    TARGET="outdir"; PKGTYPE=$2 ; OUT_DIR_SPECIFIED=1 ; ((CLEAN_OR_OUT|=2)) ; shift 2 ;;
(-p | --package)
@@ -78,7 +75,6 @@ fi

clean_rocm_cmake() {
    rm -rf "$ROCM_WHEEL_DIR"
    rm -rf $ROCM_CMAKE_BUILD_DIR
    rm -rf $ROCM_CMAKE_PACKAGE_DEB
    rm -rf $ROCM_CMAKE_PACKAGE_RPM
@@ -106,19 +102,6 @@ build_rocm_cmake() {
    copy_if RPM "${CPACKGEN:-"DEB;RPM"}" "$ROCM_CMAKE_PACKAGE_RPM" $ROCM_CMAKE_BUILD_DIR/rocm-cmake*.rpm
}

create_wheel_package() {
    echo "Creating rocm-cmake wheel package"
    # Copy the setup.py generator to build folder
    mkdir -p $ROCM_WHEEL_DIR
    cp -f $SCRIPT_ROOT/generate_setup_py.py $ROCM_WHEEL_DIR
    cp -f $SCRIPT_ROOT/repackage_wheel.sh $ROCM_WHEEL_DIR
    cd $ROCM_WHEEL_DIR
    # Currently only supports python3.6
    ./repackage_wheel.sh $ROCM_CMAKE_BUILD_DIR/rocm-cmake*.rpm python3.6
    # Copy the wheel created to RPM folder which will be uploaded to artifactory
    copy_if WHL "WHL" "$ROCM_CMAKE_PACKAGE_RPM" "$ROCM_WHEEL_DIR"/dist/*.whl
}

print_output_directory() {
    case ${PKGTYPE} in
        ("deb")
@@ -138,9 +121,4 @@ case $TARGET in
(*) die "Invalid target $TARGET" ;;
esac

if [[ $WHEEL_PACKAGE == true ]]; then
    echo "Wheel Package build started !!!!"
    create_wheel_package
fi

echo "Operation complete"

@@ -7,7 +7,6 @@ bison
bridge-utils
build-essential
bzip2
ccache
check
chrpath
cifs-utils
@@ -121,11 +120,9 @@ python3-yaml
python3.8-dev
re2c
redis-tools
# Eventually we should be able to remove rpm for debian builds.
rpm
rsync
ssh
# This makes life more pleasant inside the container
strace
sudo
systemtap-sdt-dev

@@ -1,285 +0,0 @@
#! /usr/bin/bash

set -x

apt-get -y update
DEBIAN_FRONTEND=noninteractive DEBCONF_NONINTERACTIVE_SEEN=true apt-get install --no-install-recommends -y $(sed 's/#.*//' /tmp/packages)
apt-get clean
rm -rf /var/cache/apt/ /var/lib/apt/lists/* /etc/apt/apt.conf.d/01proxy

# Install git 2.17.1, as we are seeing issues with 2.25, where it would not allow
# adding git submodules if the parent git directory is owned by a different user
curl -o git.tar.gz https://cdn.kernel.org/pub/software/scm/git/git-2.17.1.tar.gz
tar -zxf git.tar.gz
cd git-*
make prefix=/usr/local all
make prefix=/usr/local install
git --version

# Install argparse and CppHeaderParser python modules for roctracer and rocprofiler.
# Install rocm-docs-core for the docs-as-code project. Only needed on one OS.
# CppHeader needs setuptools. setuptools needs wheel.
# Looks like I need them as separate commands.
# Sigh, install both python2 and python3 versions.
pip3 install --no-cache-dir setuptools wheel tox
pip3 install --no-cache-dir CppHeaderParser argparse requests lxml barectf recommonmark jinja2==3.0.0 websockets matplotlib numpy scipy minimal msgpack pytest sphinx joblib PyYAML rocm-docs-core cmake==3.25.2 pandas myst-parser

# Allow sudo for every user
echo 'ALL ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/everyone

# Install OCaml packages to build LLVM's OCaml bindings, to be used in the lightning compiler test pipeline
wget -nv https://sourceforge.net/projects/opam.mirror/files/2.1.4/opam-2.1.4-x86_64-linux -O /usr/local/bin/opam
chmod +x /usr/local/bin/opam
opam init --yes --disable-sandboxing
opam install ctypes --yes

# Install and modify git-repo (#!/usr/bin/env python -> #!/usr/bin/env python3)
curl https://storage.googleapis.com/git-repo-downloads/repo > /usr/bin/repo
chmod a+x /usr/bin/repo

# Build ccache from source
cd /tmp
git clone https://github.com/ccache/ccache -b v4.7.5
cd ccache
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make
make install
cd /tmp
rm -rf ccache

# Install sharp from MLNX_OFED_LINUX as a dependency for rccl-rdma-sharp-plugins
cd /var/tmp
mkdir mlnx
wget -O mlnx/tar.tgz https://content.mellanox.com/ofed/MLNX_OFED-24.01-0.3.3.1/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu22.04-x86_64.tgz
tar -xz -C mlnx -f mlnx/tar.tgz
apt-key add mlnx/*/RPM-GPG-KEY-Mellanox
echo "deb [arch=amd64] file:$(echo $PWD/mlnx/*/DEBS) ./" > /etc/apt/sources.list.d/sharp.list
apt update
apt install -y sharp
apt clean
rm -rf /var/cache/apt/ /var/lib/apt/lists/* mlnx /etc/apt/sources.list.d/sharp.list

apt update
apt -y install libunwind-dev
apt -y install libgoogle-glog-dev

# Install python3.8 from source
curl -LO https://www.python.org/ftp/python/3.8.13/Python-3.8.13.tar.xz
tar -xvf Python-3.8.13.tar.xz
pwd
ls /var/tmp/
ls Python-3.8.13
mv Python-3.8.13 /opt/
apt install build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libsqlite3-dev libreadline-dev libffi-dev curl libbz2-dev pkg-config make -y
cd /opt/Python-3.8.13/
./configure --enable-optimizations --enable-shared
make
make -j 6
make altinstall
ldconfig /opt/Python3.8.13
python3.8 --version

# roctracer and rocprofiler need this python3.8
python3.8 -m pip install setuptools wheel
python3.8 -m pip install CppHeaderParser argparse requests lxml PyYAML joblib

# Install an older version of the hwloc-devel package for rocrtst
curl -lO https://download.open-mpi.org/release/hwloc/v1.11/hwloc-1.11.13.tar.bz2
tar -xvf hwloc-1.11.13.tar.bz2
cd hwloc-1.11.13
./configure
make
make install
cp /usr/local/lib/libhwloc.so.5 /usr/lib
hwloc-info --version

# Install gtest
mkdir -p /tmp/gtest
cd /tmp/gtest
wget https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip -O googletest.zip
unzip googletest.zip
cd googletest-1.14.0/
mkdir build
cd build
cmake ..
make -j$(nproc)
make install
rm -rf /tmp/gtest

## Install gRPC from source
## RDC prerequisites
GRPC_ARCHIVE=grpc-1.61.0.tar.gz
mkdir /tmp/grpc
mkdir /usr/grpc
cd /tmp
git clone --recurse-submodules -b v1.61.0 https://github.com/grpc/grpc
cd grpc
mkdir -p build
cd build
cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/grpc -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=14 -DCMAKE_SHARED_LINKER_FLAGS_INIT=-Wl,--enable-new-dtags,--build-id=sha1,--rpath,'$ORIGIN' ..
make -j $(nproc) install
rm -rf /tmp/grpc

## rocBLAS prerequisites
## Download prebuilt AMD multithreaded blis (2.0)
## Reference: https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/install.sh#L403
mkdir -p /tmp/blis
cd /tmp/blis
wget -O - https://github.com/amd/blis/releases/download/2.0/aocl-blis-mt-ubuntu-2.0.tar.gz | tar xfz -
mv amd-blis-mt /usr/blis
cd /
rm -rf /tmp/blis

## rocBLAS prerequisites (SWDEV-404612)
## Download aocl-linux-gcc-4.2.0_1_amd64.deb
mkdir -p /tmp/aocl
cd /tmp/aocl
wget -nv https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-gcc-4.2.0_1_amd64.deb
apt install ./aocl-linux-gcc-4.2.0_1_amd64.deb
rm -rf /tmp/aocl

## hipBLAS prerequisites
## lapack (3.9.1)
## Reference: https://github.com/ROCmSoftwarePlatform/rocSOLVER/blob/develop/install.sh#L174
lapack_version=3.9.1
lapack_srcdir=lapack-$lapack_version
lapack_blddir=lapack-$lapack_version-bld
mkdir -p /tmp/lapack
cd /tmp/lapack
rm -rf "$lapack_srcdir" "$lapack_blddir"
wget -O - https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v3.9.1.tar.gz | tar xzf -
cmake -H$lapack_srcdir -B$lapack_blddir -DCMAKE_BUILD_TYPE=Release -DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls -DBUILD_TESTING=OFF -DCBLAS=ON -DLAPACKE=OFF
make -j$(nproc) -C "$lapack_blddir"
make -C "$lapack_blddir" install
cd $lapack_blddir
cp -r ./include/* /usr/local/include/
cp -r ./lib/* /usr/local/lib
cd /
rm -rf /tmp/lapack

## rocSOLVER prerequisites
## fmt (7.1.3)
## Reference: https://github.com/ROCmSoftwarePlatform/rocSOLVER/blob/develop/install.sh#L152
fmt_version=7.1.3
fmt_srcdir=fmt-$fmt_version
fmt_blddir=fmt-$fmt_version-bld
mkdir -p /tmp/fmt
cd /tmp/fmt
rm -rf "$fmt_srcdir" "$fmt_blddir"
wget -O - https://github.com/fmtlib/fmt/archive/refs/tags/7.1.3.tar.gz | tar xzf -
cmake -H$fmt_srcdir -B$fmt_blddir -DCMAKE_BUILD_TYPE=Release -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=OFF -DCMAKE_CXX_STANDARD_REQUIRED=ON -DFMT_DOC=OFF -DFMT_TEST=OFF
make -j$(nproc) -C "$fmt_blddir"
make -C "$fmt_blddir" install

# Build and install libjpeg-turbo
mkdir -p /tmp/libjpeg-turbo
cd /tmp/libjpeg-turbo
wget -nv https://github.com/rrawther/libjpeg-turbo/archive/refs/heads/2.0.6.2.zip -O libjpeg-turbo-2.0.6.2.zip
unzip libjpeg-turbo-2.0.6.2.zip
cd libjpeg-turbo-2.0.6.2
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_STATIC=FALSE -DCMAKE_INSTALL_DEFAULT_LIBDIR=lib ..
make -j$(nproc) install
rm -rf /tmp/libjpeg-turbo

# Get released ninja from source
mkdir -p /tmp/ninja
cd /tmp/ninja
wget -nv https://codeload.github.com/Kitware/ninja/zip/refs/tags/v1.11.1.g95dee.kitware.jobserver-1 -O ninja.zip
unzip ninja.zip
cd ninja-1.11.1.g95dee.kitware.jobserver-1
./configure.py --bootstrap
cp ninja /usr/local/bin/
rm -rf /tmp/ninja

# Install FFmpeg and dependencies
# Build NASM
mkdir -p /tmp/nasm-2.15.05
cd /tmp
wget -qO- "https://distfiles.macports.org/nasm/nasm-2.15.05.tar.bz2" | tar -xvj
cd nasm-2.15.05
./autogen.sh
./configure --prefix="/usr/local"
make -j$(nproc) install
rm -rf /tmp/nasm-2.15.05

# Build YASM
mkdir -p /tmp/yasm-1.3.0
cd /tmp
wget -qO- "http://www.tortall.net/projects/yasm/releases/yasm-1.3.0.tar.gz" | tar -xvz
cd yasm-1.3.0
./configure --prefix="/usr/local"
make -j$(nproc) install
rm -rf /tmp/yasm-1.3.0

# Build x264
mkdir -p /tmp/x264-snapshot-20191217-2245-stable
cd /tmp
wget -qO- "https://download.videolan.org/pub/videolan/x264/snapshots/x264-snapshot-20191217-2245-stable.tar.bz2" | tar -xvj
cd /tmp/x264-snapshot-20191217-2245-stable
PKG_CONFIG_PATH="/usr/local/lib/pkgconfig" ./configure --prefix="/usr/local" --enable-shared
make -j$(nproc) install
rm -rf /tmp/x264-snapshot-20191217-2245-stable

# Build x265
mkdir -p /tmp/x265_2.7
cd /tmp
wget -qO- "https://get.videolan.org/x265/x265_2.7.tar.gz" | tar -xvz
cd /tmp/x265_2.7/build/linux
cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX="/usr/local" -DENABLE_SHARED:bool=on ../../source
make -j$(nproc) install
rm -rf /tmp/x265_2.7

# Build fdk-aac
mkdir -p /tmp/fdk-aac-2.0.2
cd /tmp
wget -qO- "https://sourceforge.net/projects/opencore-amr/files/fdk-aac/fdk-aac-2.0.2.tar.gz" | tar -xvz
cd /tmp/fdk-aac-2.0.2
autoreconf -fiv
./configure --prefix="/usr/local" --enable-shared --disable-static
make -j$(nproc) install
rm -rf /tmp/fdk-aac-2.0.2

# Build FFmpeg
cd /tmp
git clone -b release/4.4 https://git.ffmpeg.org/ffmpeg.git ffmpeg
cd ffmpeg
PKG_CONFIG_PATH="/usr/local/lib/pkgconfig"
./configure --prefix="/usr/local" --extra-cflags="-I/usr/local/include" --extra-ldflags="-L/usr/local/lib" --extra-libs=-lpthread --extra-libs=-lm --enable-shared --disable-static --enable-libx264 --enable-libx265 --enable-libfdk-aac --enable-gpl --enable-nonfree
make -j$(nproc) install
rm -rf /tmp/ffmpeg

cp /tmp/local-pin-600 /etc/apt/preferences.d

command -v lbzip2
ln -sf $(command -v lbzip2) /usr/local/bin/compressor || ln -sf $(command -v bzip2) /usr/local/bin/compressor

# Install Google Benchmark
mkdir -p /tmp/Gbenchmark
cd /tmp/Gbenchmark
wget -qO- https://github.com/google/benchmark/archive/refs/tags/v1.6.1.tar.gz | tar xz
cmake -Sbenchmark-1.6.1 -Bbuild -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_CXX_STANDARD=14
make -j -C build
cd /tmp/Gbenchmark/build
make install

# Build boost-1.85.0 from source for RPP.
# Installing in a non-standard location since the test packages of hipFFT and rocFFT pick up the version of
# the installed Boost library and declare a package dependency on that specific version of Boost.
# For example, if this was installed in the standard location it would declare a dependency on libboost-dev(el)1.85.0,
# which is not available as a package in any distro.
# Once this is fixed, we can remove the Boost package from the requirements list and install this
# in the standard location.
mkdir -p /tmp/boost-1.85.0
cd /tmp/boost-1.85.0
wget -nv https://sourceforge.net/projects/boost/files/boost/1.85.0/boost_1_85_0.tar.bz2 -O ./boost_1_85_0.tar.bz2
tar -xf boost_1_85_0.tar.bz2 --use-compress-program="/usr/local/bin/compressor"
cd boost_1_85_0
./bootstrap.sh --prefix=${RPP_DEPS_LOCATION} --with-python=python3
./b2 stage -j$(nproc) threading=multi link=shared cxxflags="-std=c++11"
./b2 install threading=multi link=shared --with-system --with-filesystem
./b2 stage -j$(nproc) threading=multi link=static cxxflags="-std=c++11 -fpic" cflags="-fpic"
./b2 install threading=multi link=static --with-system --with-filesystem
rm -rf /tmp/boost-1.85.0
@@ -7,7 +7,6 @@ bison
bridge-utils
build-essential
bzip2
ccache
check
chrpath
cifs-utils
