Compare commits: `docs/6.1.2...submodules` (83 commits)
Commits in the compared range (SHA1):

```
2939effe0a  095535cc93  b46f82eab2  27f5d9ad7d  a2f69ec77a  e0555d7f81
00d6116f0a  358585d00f  499cff0da0  9c874ce984  f47afb7c66  2308f43653
2d61a92120  bef024a500  71a306d62a  80ff647086  a51b11dd36  b6c72fa7d2
72cf5aa092  fe174ebf24  e9310c1794  060ab43b4d  a24a38686f  ebb584c0dd
33ce708926  10f8efa7e8  63d3dfd344  f087dafca2  b6a305f924  717ec0df34
531796f523  9de6c61100  e5e8114c92  647634a976  bf869e9d70  5909efb01c
a084244ac0  dbea7719f9  bdcb82372b  222865c8be  a4d6a8259c  59df2fc110
f56aca0d31  62dd3820a2  4643ee1d74  8cb0813fac  a069f2438f  bbd5642d0d
0762966fd1  8dd8596057  c71969b79a  2c5aabec54  7edc2a677b  7b883f3af4
ac7f0fc9d9  235191535c  25c5163666  b08cef45cd  00a786fb71  d7f514e447
7dcf79f95e  f3dcb40950  e800ec1847  867d8e294e  9285fbe704  e641b1b25f
091fa3ef8e  2a34212ddd  f3cc6be6dd  a3bb1d5e4f  e24a0c8272  becbe5e3ae
a36295c880  7f715fa474  cb6b61f3b7  9d6e415dde  0e52d354a3  b601290c28
be4ed8cd84  3c42dc49ab  8311130829  d991252467  e01f13e803
```
@@ -87,7 +87,7 @@ jobs:
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DGPU_TARGETS=gfx942
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
|
||||
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
|
||||
-DMIGRAPHX_USE_COMPOSABLEKERNEL=OFF
|
||||
|
||||
@@ -40,7 +40,7 @@ jobs:
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ variables.MEDIUM_BUILD_POOL }}
|
||||
pool: ${{ variables.LARGE_DISK_BUILD_POOL }}
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
|
||||
@@ -19,6 +19,7 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- clr
|
||||
- hipRAND
|
||||
- llvm-project
|
||||
- rocBLAS
|
||||
- rocm-cmake
|
||||
@@ -26,6 +27,7 @@ parameters:
|
||||
- rocminfo
|
||||
- rocprofiler-register
|
||||
- ROCR-Runtime
|
||||
- rocRAND
|
||||
- ROCT-Thunk-Interface
|
||||
|
||||
jobs:
|
||||
|
||||
@@ -25,7 +25,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- job: composable_kernel
|
||||
timeoutInMinutes: 210
|
||||
timeoutInMinutes: 100
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -59,6 +59,6 @@ jobs:
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DINSTANCES_ONLY=ON
|
||||
-DGPU_TARGETS=gfx942
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -77,7 +77,6 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DHIP_PLATFORM=amd
|
||||
-DBUILD_CLIENTS_TESTS=ON
|
||||
-DBUILD_CLIENTS_BENCHMARKS=OFF
|
||||
|
||||
@@ -8,12 +8,13 @@ parameters:
|
||||
- name: aptPackages
|
||||
type: object
|
||||
default:
|
||||
- ninja-build
|
||||
- python3-venv
|
||||
- libmsgpack-dev
|
||||
- gfortran
|
||||
- git
|
||||
- python3-pip
|
||||
- libdrm-dev
|
||||
- libmsgpack-dev
|
||||
- ninja-build
|
||||
- python3-pip
|
||||
- python3-venv
|
||||
- name: pipModules
|
||||
type: object
|
||||
default:
|
||||
@@ -21,15 +22,16 @@ parameters:
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- llvm-project
|
||||
- ROCR-Runtime
|
||||
- clr
|
||||
- hipBLAS
|
||||
- llvm-project
|
||||
- rocminfo
|
||||
- rocprofiler-register
|
||||
- hipBLAS
|
||||
- ROCR-Runtime
|
||||
|
||||
jobs:
|
||||
- job: hipBLASLt
|
||||
timeoutInMinutes: 300
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -58,7 +60,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
# CI case: download latest default branch build
|
||||
# CI case: download latest default branch build
|
||||
- ${{ if eq(parameters.checkoutRef, '') }}:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
@@ -72,17 +74,38 @@ jobs:
|
||||
dependencySource: tag-builds
|
||||
- script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
|
||||
displayName: ROCm symbolic link
|
||||
# Build and install gtest, lapack, hipBLAS-common
|
||||
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
|
||||
# $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
|
||||
- script: mkdir $(Pipeline.Workspace)/deps
|
||||
# hipBLASLt already has a CMake script for external deps, so we can just run that
|
||||
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
|
||||
- script: cmake $(Pipeline.Workspace)/s/deps
|
||||
displayName: Configure hipBLASLt external dependencies
|
||||
workingDirectory: $(Pipeline.Workspace)/deps
|
||||
- script: make
|
||||
displayName: Build hipBLASLt external dependencies
|
||||
workingDirectory: $(Pipeline.Workspace)/deps
|
||||
- script: sudo make install
|
||||
displayName: Install hipBLASLt external dependencies
|
||||
workingDirectory: $(Pipeline.Workspace)/deps
|
||||
# Set link to redirect llvm folder
|
||||
- task: Bash@3
|
||||
displayName: Symlink to rocm/lib/llvm
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
||||
-DAMDGPU_TARGETS=gfx90a
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DTensile_LOGIC=
|
||||
-DTensile_CPU_THREADS=
|
||||
-DTensile_CODE_OBJECT_VERSION=default
|
||||
-DTensile_LIBRARY_FORMAT=msgpack
|
||||
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -57,6 +57,6 @@ jobs:
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DBUILD_TEST=ON
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DUSE_HIP_CLANG=ON
|
||||
-DHIP_COMPILER=clang
|
||||
-DBUILD_CLIENTS_TESTS=ON
|
||||
|
||||
@@ -61,6 +61,6 @@ jobs:
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -74,7 +74,6 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DBUILD_CLIENTS_TESTS=ON
|
||||
-DUSE_CUDA=OFF
|
||||
-GNinja
|
||||
|
||||
@@ -75,7 +75,7 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
||||
-DAMDGPU_TARGETS=all
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DTensile_LOGIC=
|
||||
-DTensile_CPU_THREADS=
|
||||
-DTensile_CODE_OBJECT_VERSION=default
|
||||
|
||||
@@ -58,6 +58,6 @@ jobs:
|
||||
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DHIPTENSOR_BUILD_TESTS=ON
|
||||
-DAMDGPU_TARGETS=gfx90a
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
multithreadFlag: -- -j32
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -72,6 +72,6 @@ jobs:
|
||||
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DBUILD_TESTS=ON
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DBUILD_CLIENTS_TESTS=ON
|
||||
-DBUILD_CLIENTS_BENCHMARKS=OFF
|
||||
-DBUILD_CLIENTS_SAMPLES=OFF
|
||||
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DTensile_CODE_OBJECT_VERSION=default
|
||||
-DTensile_LOGIC=asm_full
|
||||
-DTensile_SEPARATE_ARCHITECTURES=ON
|
||||
|
||||
@@ -11,6 +11,7 @@ parameters:
|
||||
- cmake
|
||||
- ninja-build
|
||||
- libboost-program-options-dev
|
||||
- libdrm-dev
|
||||
- libgtest-dev
|
||||
- libfftw3-dev
|
||||
- python3-pip
|
||||
@@ -64,7 +65,7 @@ jobs:
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DUSE_HIP_CLANG=ON
|
||||
-DHIP_COMPILER=clang
|
||||
-DBUILD_CLIENTS_TESTS=ON
|
||||
|
||||
@@ -59,7 +59,7 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DBUILD_BENCHMARK=ON
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DBUILD_TEST=ON
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -59,6 +59,6 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DBUILD_TEST=ON
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DBUILD_CLIENTS_TESTS=ON
|
||||
-DBUILD_CLIENTS_BENCHMARKS=OFF
|
||||
-DBUILD_CLIENTS_SAMPLES=OFF
|
||||
|
||||
@@ -68,7 +68,7 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DBUILD_CLIENTS_SAMPLES=OFF
|
||||
-DBUILD_CLIENTS_TESTS=ON
|
||||
-DBUILD_CLIENTS_BENCHMARKS=OFF
|
||||
|
||||
@@ -60,7 +60,7 @@ jobs:
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-DBUILD_TEST=ON
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DROCWMMA_BUILD_TESTS=ON
|
||||
-DROCWMMA_BUILD_SAMPLES=OFF
|
||||
-DGPU_TARGETS=gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-GNinja
|
||||
# gfx1030 not supported in documentation
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -66,6 +66,6 @@ jobs:
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DROCM_ROOT=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_HIP_ARCHITECTURES=gfx1030;gfx1100
|
||||
-DCMAKE_HIP_ARCHITECTURES=gfx942
|
||||
-DCMAKE_EXE_LINKER_FLAGS=-fgpu-rdc
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -105,5 +105,5 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DENABLE_LDCONFIG=OFF
|
||||
-DUSE_PROF_API=1
|
||||
-DGPU_TARGETS=gfx1030;gfx1100
|
||||
-DGPU_TARGETS=gfx942
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
@@ -37,6 +37,12 @@ jobs:
|
||||
vmImage: ${{ variables.BASE_BUILD_POOL }}
|
||||
workspace:
|
||||
clean: all
|
||||
strategy:
|
||||
matrix:
|
||||
gfx942:
|
||||
JOB_GPU_TARGET: gfx942
|
||||
gfx90a:
|
||||
JOB_GPU_TARGET: gfx90a
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
@@ -65,6 +71,8 @@ jobs:
|
||||
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DGPU_TARGETS=gfx1030;gfx1100
|
||||
-DGPU_TARGETS=$(JOB_GPU_TARGET)
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
gpuTarget: $(JOB_GPU_TARGET)
|
||||
|
||||
@@ -60,6 +60,6 @@ jobs:
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DHALF_INCLUDE_DIRS=$(Agent.BuildDirectory)/rocm/include
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DAMDGPU_TARGETS=gfx1030;gfx1100
|
||||
-DAMDGPU_TARGETS=gfx942
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
|
||||
252
.azuredevops/nightly/pytorch.yml
Normal file
@@ -0,0 +1,252 @@
|
||||
parameters:
|
||||
# ubuntu near-equivalent list of yum installs in https://github.com/ROCm/ROCm-docker/blob/master/dev/Dockerfile-centos-7-complete
|
||||
# plus additional packages found through iterative testing of pipeline
|
||||
- name: aptPackages
|
||||
type: object
|
||||
default:
|
||||
- build-essential
|
||||
- git
|
||||
- ninja-build
|
||||
- openjdk-8-jdk
|
||||
- ca-certificates
|
||||
- bc
|
||||
- bridge-utils
|
||||
- cmake
|
||||
- devscripts
|
||||
- dkms
|
||||
- doxygen
|
||||
- libdpkg-dev
|
||||
- libdpkg-perl
|
||||
- libelf-dev
|
||||
- python3-dev
|
||||
- python3-pip
|
||||
- python3-venv
|
||||
- wget
|
||||
- ncurses-base
|
||||
- libncurses-dev
|
||||
- numactl
|
||||
- libnuma-dev
|
||||
- libssh-dev
|
||||
- libunwind-dev
|
||||
- llvm-dev
|
||||
- libpth-dev
|
||||
- qemu-kvm
|
||||
- re2c
|
||||
- subversion
|
||||
- fakeroot
|
||||
- autoconf
|
||||
- libgomp1
|
||||
- libtinfo-dev
|
||||
- libcholmod3
|
||||
- libsuitesparseconfig5
|
||||
- libstdc++-12-dev
|
||||
- python-is-python3
|
||||
- gfortran
|
||||
- libgfortran5
|
||||
- liblapack3
|
||||
- libblas3
|
||||
- libquadmath0
|
||||
- libmetis5
|
||||
- libamd2
|
||||
- libcamd2
|
||||
- libcolamd2
|
||||
- libccolamd2
|
||||
- libdrm-amdgpu1
|
||||
- ccache
|
||||
- zip
|
||||
- name: pipModules
|
||||
type: object
|
||||
default:
|
||||
- astunparse
|
||||
- expecttest!=0.2.0
|
||||
- hypothesis
|
||||
- numpy
|
||||
- psutil
|
||||
- pyyaml
|
||||
- requests
|
||||
- setuptools
|
||||
- types-dataclasses
|
||||
- typing-extensions>=4.8.0
|
||||
- sympy>=1.13.0
|
||||
- filelock
|
||||
- networkx
|
||||
- jinja2
|
||||
- fsspec
|
||||
- lintrunner
|
||||
- ninja
|
||||
- packaging
|
||||
- optree>=0.12.0
|
||||
# list from https://github.com/pytorch/builder/blob/main/manywheel/build_rocm.sh
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- rocminfo
|
||||
- MIOpen
|
||||
- clr
|
||||
- hipBLAS
|
||||
- hipFFT
|
||||
- hipRAND
|
||||
- hipSOLVER
|
||||
- hipSPARSE
|
||||
- ROCR-Runtime
|
||||
- llvm-project
|
||||
- rccl
|
||||
- rocBLAS
|
||||
- rocFFT
|
||||
- rocm_smi_lib
|
||||
- rocRAND
|
||||
- rocSOLVER
|
||||
- rocSPARSE
|
||||
- roctracer
|
||||
- hipBLASLt
|
||||
- rocprofiler-register
|
||||
- rocm-core
|
||||
- rocPRIM
|
||||
# below are additional dependencies not called out by build script, but throw errors during cmake
|
||||
- hipCUB
|
||||
- rocThrust
|
||||
|
||||
trigger: none
|
||||
pr: none
|
||||
schedules:
|
||||
- cron: '30 7 * * *'
|
||||
displayName: nightly pytorch
|
||||
branches:
|
||||
include:
|
||||
- develop
|
||||
always: true
|
||||
|
||||
jobs:
|
||||
- job: pytorch
|
||||
timeoutInMinutes: 120
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
# various flags/parameters expected by bash scripts in pytorch builder repo
|
||||
- name: ROCM_VERSION
|
||||
value: 6.3.0
|
||||
- name: PYTORCH_ROCM_ARCH
|
||||
value: gfx942
|
||||
- name: GPU_TARGET
|
||||
value: gfx942
|
||||
- name: ROCM_PATH
|
||||
value: /opt/rocm
|
||||
- name: DESIRED_CUDA
|
||||
value: 6.3.0
|
||||
- name: MKLROOT
|
||||
value: /opt/intel
|
||||
- name: AOTRITON_INSTALLED_PREFIX
|
||||
value: /opt/rocm/aotriton
|
||||
- name: DESIRED_PYTHON
|
||||
value: 3.10
|
||||
- name: PYTORCH_ROOT
|
||||
value: $(Build.SourcesDirectory)/pytorch
|
||||
- name: CMAKE_ARGS
|
||||
value: -GNinja
|
||||
- name: DESIRED_DEVTOOLSET
|
||||
value: cxx11-abi
|
||||
pool: ${{ variables.ULTRA_BUILD_POOL }}
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
# copy environment setup from https://github.com/pytorch/builder/blob/main/manywheel/Dockerfile
|
||||
# but instead of centos, use ubuntu environment
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
# wheel install location different on azure agent compared to where wheel is assumed to be installed on upstream script
|
||||
- task: Bash@3
|
||||
displayName: wheel install path symlink
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo mkdir -p /opt/python/cp310-cp310/lib/python3.10
|
||||
sudo ln -s /usr/local/lib/python3.10/dist-packages /opt/python/cp310-cp310/lib/python3.10/site-packages
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
dependencySource: staging
|
||||
- task: Bash@3
|
||||
displayName: ROCm symbolic links
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
|
||||
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
|
||||
- checkout: self
|
||||
- task: Bash@3
|
||||
displayName: git clone pytorch builder
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: git clone https://github.com/pytorch/builder.git --depth=1 --recurse-submodules
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
- task: Bash@3
|
||||
displayName: git clone upstream pytorch
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: git clone https://github.com/pytorch/pytorch.git --depth=1 --recurse-submodules
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
- task: Bash@3
|
||||
displayName: Patch out forced GPU testing block in pytorch build script
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: git apply $(Build.SourcesDirectory)/.azuredevops/patches/pytorch/0001-ROCm-CI-patches.patch
|
||||
workingDirectory: $(Build.SourcesDirectory)/builder
|
||||
- task: Bash@3
|
||||
displayName: Install patchelf
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo bash pytorch/.ci/docker/common/install_patchelf.sh
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
- task: Bash@3
|
||||
displayName: Install mkl dependency for magma
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo bash pytorch/.ci/docker/common/install_mkl.sh
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
- task: Bash@3
|
||||
displayName: Install rocm drm
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo bash pytorch/.ci/docker/common/install_rocm_drm.sh
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
- task: Bash@3
|
||||
displayName: Install rocm magma
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo PYTORCH_ROCM_ARCH=$(PYTORCH_ROCM_ARCH) MKLROOT=$(MKLROOT) bash pytorch/.ci/docker/common/install_rocm_magma.sh
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
- task: Bash@3
|
||||
displayName: Install AOTriton Shared Library
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
sudo bash ./install_aotriton.sh /opt/rocm
|
||||
workingDirectory: $(Build.SourcesDirectory)/pytorch/.ci/docker/common
|
||||
- task: Bash@3
|
||||
displayName: Run ROCm Build Script
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: >-
|
||||
sudo
|
||||
DESIRED_CUDA=$(DESIRED_CUDA)
|
||||
PYTORCH_ROCM_ARCH=$(PYTORCH_ROCM_ARCH)
|
||||
DESIRED_PYTHON=$(DESIRED_PYTHON)
|
||||
PYTORCH_ROOT=$(PYTORCH_ROOT)
|
||||
CMAKE_ARGS=$(CMAKE_ARGS)
|
||||
AOTRITON_INSTALLED_PREFIX=$(AOTRITON_INSTALLED_PREFIX)
|
||||
DESIRED_DEVTOOLSET=$(DESIRED_DEVTOOLSET)
|
||||
bash ./manywheel/build_rocm.sh
|
||||
workingDirectory: $(Build.SourcesDirectory)/builder
|
||||
- task: PublishPipelineArtifact@1
|
||||
displayName: 'ROCm pytorch wheel file Publish'
|
||||
retryCountOnTaskFailure: 3
|
||||
inputs:
|
||||
targetPath: /remote/wheelhouserocm$(ROCM_VERSION)
|
||||
115
.azuredevops/nightly/rocm-nightly.yml
Normal file
@@ -0,0 +1,115 @@
|
||||
parameters:
|
||||
# currently excludes clr and rocm-examples
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- AMDMIGraphX
|
||||
- amdsmi
|
||||
- aomp-extras
|
||||
- aomp
|
||||
- composable_kernel
|
||||
- half
|
||||
- HIP
|
||||
- hipBLAS
|
||||
- hipBLASLt
|
||||
- hipCUB
|
||||
- hipFFT
|
||||
- hipfort
|
||||
- HIPIFY
|
||||
- hipRAND
|
||||
- hipSOLVER
|
||||
- hipSPARSE
|
||||
- hipSPARSELt
|
||||
- hipTensor
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rccl
|
||||
- rdc
|
||||
- rocAL
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- ROCdbgapi
|
||||
- rocDecode
|
||||
- rocFFT
|
||||
- ROCgdb
|
||||
- rocm-cmake
|
||||
- rocm-core
|
||||
- rocminfo
|
||||
- rocMLIR
|
||||
- ROCmValidationSuite
|
||||
- rocm_bandwidth_test
|
||||
- rocm_smi_lib
|
||||
- rocPRIM
|
||||
- rocprofiler-register
|
||||
- rocprofiler
|
||||
- ROCR-Runtime
|
||||
- rocRAND
|
||||
- rocr_debug_agent
|
||||
- rocSOLVER
|
||||
- rocSPARSE
|
||||
- ROCT-Thunk-Interface
|
||||
- rocThrust
|
||||
- roctracer
|
||||
- rocWMMA
|
||||
- rpp
|
||||
|
||||
trigger: none
|
||||
pr: none
|
||||
schedules:
|
||||
- cron: '30 7 * * *'
|
||||
displayName: Nightly build
|
||||
branches:
|
||||
include:
|
||||
- develop
|
||||
always: true
|
||||
|
||||
jobs:
|
||||
- job: rocm_nightly
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ variables.MEDIUM_BUILD_POOL }}
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- task: DeleteFiles@1
|
||||
displayName: 'Cleanup checkout space'
|
||||
inputs:
|
||||
SourceFolder: '$(Agent.BuildDirectory)/s'
|
||||
Contents: '**/*'
|
||||
- task: DeleteFiles@1
|
||||
displayName: 'Cleanup Staging Area'
|
||||
inputs:
|
||||
SourceFolder: '$(Build.ArtifactStagingDirectory)'
|
||||
Contents: '/**/*'
|
||||
RemoveDotFiles: true
|
||||
- script: sudo chmod 777 /mnt
|
||||
displayName: 'Set permissions for /mnt'
|
||||
- script: df -h
|
||||
displayName: System disk space before ROCm
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
dependencySource: staging
|
||||
extractToMnt: true
|
||||
skipLibraryLinking: true
|
||||
- script: df -h
|
||||
displayName: System disk space after ROCm
|
||||
- script: du -sh /mnt/rocm
|
||||
displayName: Uncompressed ROCm size
|
||||
- task: ArchiveFiles@2
|
||||
displayName: Compress rocm-nightly
|
||||
inputs:
|
||||
rootFolderOrFile: /mnt/rocm
|
||||
includeRootFolder: false
|
||||
archiveType: tar
|
||||
tarCompression: gz
|
||||
archiveFile: $(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildNumber)_ubuntu2204.tar.gz
|
||||
- script: du -sh $(Build.ArtifactStagingDirectory)
|
||||
displayName: Compressed ROCm size
|
||||
- task: PublishPipelineArtifact@1
|
||||
displayName: 'Public ROCm Nightly Artifact'
|
||||
retryCountOnTaskFailure: 3
|
||||
inputs:
|
||||
targetPath: '$(Build.ArtifactStagingDirectory)'
|
||||
40
.azuredevops/patches/pytorch/0001-ROCm-CI-patches.patch
Normal file
@@ -0,0 +1,40 @@
|
||||
From b2d3c88f7a8b179e814e72f76f27e25c82659200 Mon Sep 17 00:00:00 2001
|
||||
From: Joseph Macaranas <Joseph.Macaranas@amd.com>
|
||||
Date: Tue, 30 Jul 2024 05:43:06 -0400
|
||||
Subject: [PATCH] ROCm CI patches
|
||||
|
||||
---
|
||||
manywheel/build_common.sh | 9 ---------
|
||||
1 file changed, 9 deletions(-)
|
||||
|
||||
diff --git a/manywheel/build_common.sh b/manywheel/build_common.sh
|
||||
index 08ca924..52c468f 100644
|
||||
--- a/manywheel/build_common.sh
|
||||
+++ b/manywheel/build_common.sh
|
||||
@@ -475,11 +475,9 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then
|
||||
fi
|
||||
|
||||
pip uninstall -y "$TORCH_PACKAGE_NAME"
|
||||
-
|
||||
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
|
||||
pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
|
||||
fi
|
||||
-
|
||||
pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
|
||||
|
||||
# Print info on the libraries installed in this wheel
|
||||
@@ -491,11 +489,4 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then
|
||||
ldd "$installed_lib" || true
|
||||
done
|
||||
|
||||
- # Run the tests
|
||||
- echo "$(date) :: Running tests"
|
||||
- pushd "$PYTORCH_ROOT"
|
||||
- LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
|
||||
- "${SOURCE_DIR}/../run_tests.sh" manywheel "${py_majmin}" "$DESIRED_CUDA"
|
||||
- popd
|
||||
- echo "$(date) :: Finished tests"
|
||||
fi
|
||||
--
|
||||
2.44.0.windows.1
|
||||
|
||||
@@ -9,9 +9,15 @@ parameters:
|
||||
- name: useDefaultBranch
|
||||
type: boolean
|
||||
default: true
|
||||
- name: latestFromBranch
|
||||
type: boolean
|
||||
default: true
|
||||
- name: extractToMnt
|
||||
type: boolean
|
||||
default: false
|
||||
- name: fileFilter
|
||||
type: string
|
||||
default: ''
|
||||
- name: defaultBranchList
|
||||
type: object
|
||||
default:
|
||||
@@ -21,7 +27,7 @@ parameters:
|
||||
aomp: aomp-dev
|
||||
clr: develop
|
||||
composable_kernel: develop
|
||||
half: master
|
||||
half: rocm
|
||||
HIP: develop
|
||||
hipBLAS: develop
|
||||
hipBLASLt: develop
|
||||
@@ -42,10 +48,10 @@ parameters:
|
||||
rocAL: develop
|
||||
rocALUTION: develop
|
||||
rocBLAS: develop
|
||||
ROCdbgapi : amd-master
|
||||
ROCdbgapi : amd-staging
|
||||
rocDecode: develop
|
||||
rocFFT: develop
|
||||
rocgdb: amd-staging
|
||||
ROCgdb: amd-staging
|
||||
rocm-cmake: develop
|
||||
rocm-core: master
|
||||
rocm-examples: develop
|
||||
@@ -67,10 +73,6 @@ parameters:
|
||||
roctracer: amd-master
|
||||
rocWMMA: develop
|
||||
rpp: master
|
||||
- name: componentsFailureOkay
|
||||
type: object
|
||||
default:
|
||||
- rocm-cmake
|
||||
# BELOW REQUIRED IF useDefaultBranch false
|
||||
- name: branchName
|
||||
type: string
|
||||
@@ -84,11 +86,15 @@ steps:
|
||||
project: ROCm-CI
|
||||
definition: ${{ parameters.pipelineId }}
|
||||
specificBuildWithTriggering: true
|
||||
itemPattern: '**/*${{ parameters.fileFilter }}*'
|
||||
${{ if eq(parameters.latestFromBranch, true) }}:
|
||||
${{ if notIn(parameters.componentName, 'aomp', 'clr', 'rocMLIR') }}: # remove this once these pipelines are functional + up-to-date
|
||||
buildVersionToDownload: latestFromBranch # default is 'latest'
|
||||
${{ if eq(parameters.useDefaultBranch, true) }}:
|
||||
branchName: ${{ parameters.defaultBranchList[parameters.componentName] }}
|
||||
branchName: refs/heads/${{ parameters.defaultBranchList[parameters.componentName] }}
|
||||
${{ else }}:
|
||||
branchName: ${{ parameters.branchName }}
|
||||
${{ if in(parameters.componentName, parameters.componentsFailureOkay) }}:
|
||||
${{ if in(parameters.componentName, 'rocm-cmake') }}:
|
||||
allowPartiallySucceededBuilds: true
|
||||
targetPath: '$(Pipeline.Workspace)/d'
|
||||
- task: ExtractFiles@1
|
||||
|
||||
@@ -9,6 +9,9 @@ parameters:
|
||||
- name: publish
|
||||
type: boolean
|
||||
default: true
|
||||
- name: gpuTarget
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
steps:
|
||||
- task: ArchiveFiles@2
|
||||
@@ -17,7 +20,7 @@ steps:
|
||||
includeRootFolder: false
|
||||
archiveType: 'tar'
|
||||
tarCompression: 'gz'
|
||||
archiveFile: '$(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.SourceBranchName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.artifactName }}.tar.gz'
|
||||
archiveFile: '$(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.SourceBranchName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.artifactName }}_${{ parameters.gpuTarget }}.tar.gz'
|
||||
- task: DeleteFiles@1
|
||||
displayName: 'Cleanup Staging Area'
|
||||
inputs:
|
||||
|
||||
@@ -11,6 +11,9 @@ parameters:
|
||||
- name: cmakeBuildDir
|
||||
type: string
|
||||
default: 'build'
|
||||
- name: cmakeSourceDir
|
||||
type: string
|
||||
default: '..'
|
||||
- name: cmakeTarget
|
||||
type: string
|
||||
default: 'install'
|
||||
@@ -35,9 +38,11 @@ steps:
|
||||
inputs:
|
||||
workingDirectory: ${{ parameters.cmakeBuildDir }}
|
||||
${{ if eq(parameters.customInstallPath, true) }}:
|
||||
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ..
|
||||
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
|
||||
${{ else }}:
|
||||
cmakeArgs: ${{ parameters.extraBuildFlags }} ..
|
||||
- script: df -h
|
||||
displayName: Disk space before build
|
||||
# equivalent to running make $cmakeTargetDir from $cmakeBuildDir
|
||||
# i.e., cd $cmakeBuildDir; make $cmakeTargetDir
|
||||
- task: CMake@1
|
||||
@@ -46,6 +51,8 @@ steps:
|
||||
workingDirectory: ${{ parameters.cmakeBuildDir }}
|
||||
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} ${{ parameters.multithreadFlag }}'
|
||||
retryCountOnTaskFailure: 10
|
||||
- script: df -h
|
||||
displayName: Disk space after build
|
||||
# equivalent to running make $cmakeTarget from $cmakeBuildDir
|
||||
# e.g., make install
|
||||
- ${{ if eq(parameters.installEnabled, true) }}:
|
||||
|
||||
@@ -21,6 +21,9 @@ parameters:
|
||||
- name: fixedComponentName
|
||||
type: string
|
||||
default: ''
|
||||
- name: latestFromBranch
|
||||
type: boolean
|
||||
default: true
|
||||
# match case of the repo in this object for the left side of the maps
|
||||
# should not need to replace these parameters
|
||||
- name: stagingPipelineIdentifiers
|
||||
@@ -141,25 +144,51 @@ parameters:
|
||||
|
||||
steps:
|
||||
# assuming artifact-download.yml template file in same directory
|
||||
# for the case where rocm dependency item in list has a colon (:)
|
||||
# assume it is of the format of componentName:fileFilter
|
||||
# fileFilter could contain both a subcomponent name or gpu name separated by asterisks
|
||||
# e.g., gfx942 to only download artifacts from component for this gpu if applicable
|
||||
- ${{ each dependency in parameters.dependencyList }}:
|
||||
- ${{ if eq(parameters.dependencySource, 'staging') }}:
|
||||
- template: artifact-download.yml
|
||||
parameters:
|
||||
componentName: ${{ dependency }}
|
||||
pipelineId: ${{ parameters.stagingPipelineIdentifiers[dependency] }}
|
||||
extractToMnt: ${{ parameters.extractToMnt }}
|
||||
- ${{ if eq(parameters.dependencySource, 'tag-builds') }}:
|
||||
- template: artifact-download.yml
|
||||
parameters:
|
||||
componentName: ${{ dependency }}
|
||||
pipelineId: ${{ parameters.taggedPipelineIdentifiers[dependency] }}
|
||||
extractToMnt: ${{ parameters.extractToMnt }}
|
||||
- ${{ if contains(dependency, ':') }}:
|
||||
- ${{ if eq(parameters.dependencySource, 'staging') }}:
|
||||
- template: artifact-download.yml
|
||||
parameters:
|
||||
componentName: ${{ split(dependency, ':')[0] }}
|
||||
pipelineId: ${{ parameters.stagingPipelineIdentifiers[split(dependency, ':')[0]] }}
|
||||
fileFilter: ${{ split(dependency, ':')[1] }}
|
||||
latestFromBranch: ${{ parameters.latestFromBranch }}
|
||||
extractToMnt: ${{ parameters.extractToMnt }}
|
||||
- ${{ if eq(parameters.dependencySource, 'tag-builds') }}:
|
||||
- template: artifact-download.yml
|
||||
parameters:
|
||||
componentName: ${{ split(dependency, ':')[0] }}
|
||||
pipelineId: ${{ parameters.taggedPipelineIdentifiers[split(dependency, ':')[0]] }}
|
||||
fileFilter: ${{ split(dependency, ':')[1] }}
|
||||
latestFromBranch: false
|
||||
extractToMnt: ${{ parameters.extractToMnt }}
|
||||
# no colon (:) found in this item in the list
|
||||
- ${{ else }}:
|
||||
- ${{ if eq(parameters.dependencySource, 'staging') }}:
|
||||
- template: artifact-download.yml
|
||||
parameters:
|
||||
componentName: ${{ dependency }}
|
||||
pipelineId: ${{ parameters.stagingPipelineIdentifiers[dependency] }}
|
||||
latestFromBranch: ${{ parameters.latestFromBranch }}
|
||||
extractToMnt: ${{ parameters.extractToMnt }}
|
||||
- ${{ if eq(parameters.dependencySource, 'tag-builds') }}:
|
||||
- template: artifact-download.yml
|
||||
parameters:
|
||||
componentName: ${{ dependency }}
|
||||
pipelineId: ${{ parameters.taggedPipelineIdentifiers[dependency] }}
|
||||
latestFromBranch: false
|
||||
extractToMnt: ${{ parameters.extractToMnt }}
|
||||
# fixed case only accepts one component at a time, so no array input
|
||||
- ${{ if eq(parameters.dependencySource, 'fixed') }}:
|
||||
- template: artifact-download.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.fixedComponentName }}
|
||||
pipelineId: ${{ parameters.fixedPipelineIdentifier }}
|
||||
latestFromBranch: false
|
||||
extractToMnt: ${{ parameters.extractToMnt }}
|
||||
- task: Bash@3
|
||||
displayName: 'list downloaded ROCm files'
|
||||
|
||||
@@ -12,7 +12,7 @@ steps:
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: python3 --version
|
||||
- script: pip list
|
||||
- script: pip list -v
|
||||
displayName: 'list python packages'
|
||||
- task: DeleteFiles@1
|
||||
displayName: 'Cleanup checkout space'
|
||||
|
||||
@@ -21,6 +21,8 @@ variables:
|
||||
value: rocm-ci_ultra_build_pool
|
||||
- name: ON_PREM_BUILD_POOL
|
||||
value: rocm-ci_build_pool
|
||||
- name: LARGE_DISK_BUILD_POOL
|
||||
value: rocm-ci_larger_base_disk_pool
|
||||
- name: LATEST_RELEASE_TAG
|
||||
value: rocm-6.1.0
|
||||
- name: DOCKER_IMAGE_NAME
|
||||
|
||||
183
.gitmodules
vendored
Normal file
@@ -0,0 +1,183 @@
|
||||
[submodule "libs/ROCK-Kernel-Driver"]
|
||||
path = libs/ROCK-Kernel-Driver
|
||||
url = ../ROCK-Kernel-Driver
|
||||
[submodule "libs/ROCT-Thunk-Interface"]
|
||||
path = libs/ROCT-Thunk-Interface
|
||||
url = ../ROCT-Thunk-Interface
|
||||
[submodule "libs/ROCR-Runtime"]
|
||||
path = libs/ROCR-Runtime
|
||||
url = ../ROCR-Runtime
|
||||
[submodule "libs/amdsmi"]
|
||||
path = libs/amdsmi
|
||||
url = ../amdsmi
|
||||
[submodule "libs/rocm_smi_lib"]
|
||||
path = libs/rocm_smi_lib
|
||||
url = ../rocm_smi_lib
|
||||
[submodule "libs/rocm-core"]
|
||||
path = libs/rocm-core
|
||||
url = ../rocm-core
|
||||
[submodule "libs/rocm-cmake"]
|
||||
path = libs/rocm-cmake
|
||||
url = ../rocm-cmake
|
||||
[submodule "libs/rocminfo"]
|
||||
path = libs/rocminfo
|
||||
url = ../rocminfo
|
||||
[submodule "libs/rocm_bandwidth_test"]
|
||||
path = libs/rocm_bandwidth_test
|
||||
url = ../rocm_bandwidth_test
|
||||
[submodule "libs/rocprofiler"]
|
||||
path = libs/rocprofiler
|
||||
url = ../rocprofiler
|
||||
[submodule "libs/roctracer"]
|
||||
path = libs/roctracer
|
||||
url = ../roctracer
|
||||
[submodule "libs/rdc"]
|
||||
path = libs/rdc
|
||||
url = ../rdc
|
||||
[submodule "libs/HIP"]
|
||||
path = libs/HIP
|
||||
url = ../HIP
|
||||
[submodule "libs/clr"]
|
||||
path = libs/clr
|
||||
url = ../clr
|
||||
[submodule "libs/hipother"]
|
||||
path = libs/hipother
|
||||
url = ../hipother
|
||||
[submodule "libs/HIPIFY"]
|
||||
path = libs/HIPIFY
|
||||
url = ../HIPIFY
|
||||
[submodule "libs/HIPCC"]
|
||||
path = libs/HIPCC
|
||||
url = ../HIPCC
|
||||
[submodule "libs/llvm-project"]
|
||||
path = libs/llvm-project
|
||||
url = ../llvm-project
|
||||
[submodule "libs/ROCm-Device-Libs"]
|
||||
path = libs/ROCm-Device-Libs
|
||||
url = ../ROCm-Device-Libs
|
||||
[submodule "libs/ROCm-CompilerSupport"]
|
||||
path = libs/ROCm-CompilerSupport
|
||||
url = ../ROCm-CompilerSupport
|
||||
[submodule "libs/half"]
|
||||
path = libs/half
|
||||
url = ../half
|
||||
[submodule "libs/ROCgdb"]
|
||||
path = libs/ROCgdb
|
||||
url = ../ROCgdb
|
||||
[submodule "libs/ROCdbgapi"]
|
||||
path = libs/ROCdbgapi
|
||||
url = ../ROCdbgapi
|
||||
[submodule "libs/rocr_debug_agent"]
|
||||
path = libs/rocr_debug_agent
|
||||
url = ../rocr_debug_agent
|
||||
[submodule "libs/rocBLAS"]
|
||||
path = libs/rocBLAS
|
||||
url = ../rocBLAS
|
||||
[submodule "libs/Tensile"]
|
||||
path = libs/Tensile
|
||||
url = ../Tensile
|
||||
[submodule "libs/hipTensor"]
|
||||
path = libs/hipTensor
|
||||
url = ../hipTensor
|
||||
[submodule "libs/hipBLAS"]
|
||||
path = libs/hipBLAS
|
||||
url = ../hipBLAS
|
||||
[submodule "libs/hipBLASLt"]
|
||||
path = libs/hipBLASLt
|
||||
url = ../hipBLASLt
|
||||
[submodule "libs/rocFFT"]
|
||||
path = libs/rocFFT
|
||||
url = ../rocFFT
|
||||
[submodule "libs/hipFFT"]
|
||||
path = libs/hipFFT
|
||||
url = ../hipFFT
|
||||
[submodule "libs/rocRAND"]
|
||||
path = libs/rocRAND
|
||||
url = ../rocRAND
|
||||
[submodule "libs/hipRAND"]
|
||||
path = libs/hipRAND
|
||||
url = ../hipRAND
|
||||
[submodule "libs/rocSPARSE"]
|
||||
path = libs/rocSPARSE
|
||||
url = ../rocSPARSE
|
||||
[submodule "libs/hipSPARSELt"]
|
||||
path = libs/hipSPARSELt
|
||||
url = ../hipSPARSELt
|
||||
[submodule "libs/rocSOLVER"]
|
||||
path = libs/rocSOLVER
|
||||
url = ../rocSOLVER
|
||||
[submodule "libs/hipSOLVER"]
|
||||
path = libs/hipSOLVER
|
||||
url = ../hipSOLVER
|
||||
[submodule "libs/hipSPARSE"]
|
||||
path = libs/hipSPARSE
|
||||
url = ../hipSPARSE
|
||||
[submodule "libs/rocALUTION"]
|
||||
path = libs/rocALUTION
|
||||
url = ../rocALUTION
|
||||
[submodule "libs/rocThrust"]
|
||||
path = libs/rocThrust
|
||||
url = ../rocThrust
|
||||
[submodule "libs/hipCUB"]
|
||||
path = libs/hipCUB
|
||||
url = ../hipCUB
|
||||
[submodule "libs/rocPRIM"]
|
||||
path = libs/rocPRIM
|
||||
url = ../rocPRIM
|
||||
[submodule "libs/rocWMMA"]
|
||||
path = libs/rocWMMA
|
||||
url = ../rocWMMA
|
||||
[submodule "libs/rccl"]
|
||||
path = libs/rccl
|
||||
url = ../rccl
|
||||
[submodule "libs/MIOpen"]
|
||||
path = libs/MIOpen
|
||||
url = ../MIOpen
|
||||
[submodule "libs/composable_kernel"]
|
||||
path = libs/composable_kernel
|
||||
url = ../composable_kernel
|
||||
[submodule "libs/MIVisionX"]
|
||||
path = libs/MIVisionX
|
||||
url = ../MIVisionX
|
||||
[submodule "libs/rpp"]
|
||||
path = libs/rpp
|
||||
url = ../rpp
|
||||
[submodule "libs/hipfort"]
|
||||
path = libs/hipfort
|
||||
url = ../hipfort
|
||||
[submodule "libs/AMDMIGraphX"]
|
||||
path = libs/AMDMIGraphX
|
||||
url = ../AMDMIGraphX
|
||||
[submodule "libs/ROCmValidationSuite"]
|
||||
path = libs/ROCmValidationSuite
|
||||
url = ../ROCmValidationSuite
|
||||
[submodule "libs/openmp-extras/aomp"]
|
||||
path = libs/openmp-extras/aomp
|
||||
url = ../aomp
|
||||
[submodule "libs/openmp-extras/aomp-extras"]
|
||||
path = libs/openmp-extras/aomp-extras
|
||||
url = ../aomp-extras
|
||||
[submodule "libs/openmp-extras/flang"]
|
||||
path = libs/openmp-extras/flang
|
||||
url = ../flang
|
||||
[submodule "libs/rocDecode"]
|
||||
path = libs/rocDecode
|
||||
url = ../rocDecode
|
||||
[submodule "libs/omnitrace"]
|
||||
path = libs/omnitrace
|
||||
url = ../omnitrace
|
||||
[submodule "libs/omniperf"]
|
||||
path = libs/omniperf
|
||||
url = ../omniperf
|
||||
[submodule "libs/rocprofiler-sdk"]
|
||||
path = libs/rocprofiler-sdk
|
||||
url = ../rocprofiler-sdk
|
||||
[submodule "libs/rocm-examples"]
|
||||
path = libs/rocm-examples
|
||||
url = ../rocm-examples
|
||||
[submodule "libs/rocPyDecode"]
|
||||
path = libs/rocPyDecode
|
||||
url = ../rocPyDecode
|
||||
[submodule "libs/rocAL"]
|
||||
path = libs/rocAL
|
||||
url = ../rocAL
|
||||
@@ -2,6 +2,7 @@ AAC
|
||||
ABI
|
||||
ACE
|
||||
ACEs
|
||||
ACS
|
||||
AccVGPR
|
||||
AccVGPRs
|
||||
ALU
|
||||
@@ -12,6 +13,7 @@ AMDMIGraphX
|
||||
AMI
|
||||
AOCC
|
||||
AOMP
|
||||
APBDIS
|
||||
APIC
|
||||
APIs
|
||||
APU
|
||||
@@ -24,11 +26,14 @@ ATI
|
||||
AddressSanitizer
|
||||
AlexNet
|
||||
Arb
|
||||
Autocast
|
||||
BARs
|
||||
BLAS
|
||||
BMC
|
||||
BitCode
|
||||
Blit
|
||||
Blockwise
|
||||
Bluefield
|
||||
Bootloader
|
||||
CCD
|
||||
CDNA
|
||||
CIFAR
|
||||
@@ -43,6 +48,7 @@ CPF
|
||||
CPP
|
||||
CPU
|
||||
CPUs
|
||||
Cron
|
||||
CSC
|
||||
CSE
|
||||
CSV
|
||||
@@ -62,7 +68,11 @@ CommonMark
|
||||
Concretized
|
||||
Conda
|
||||
ConnectX
|
||||
CuPy
|
||||
DDR
|
||||
DF
|
||||
DGEMM
|
||||
DIMM
|
||||
DKMS
|
||||
DL
|
||||
DMA
|
||||
@@ -78,6 +88,7 @@ DataLoader
|
||||
DataParallel
|
||||
DeepSpeed
|
||||
Dependabot
|
||||
Deprecations
|
||||
DevCap
|
||||
Dockerfile
|
||||
Doxygen
|
||||
@@ -85,13 +96,16 @@ ELMo
|
||||
ENDPGM
|
||||
EPYC
|
||||
ESXi
|
||||
EoS
|
||||
FFT
|
||||
FFTs
|
||||
FFmpeg
|
||||
FHS
|
||||
FMA
|
||||
FP
|
||||
FX
|
||||
Filesystem
|
||||
FindDb
|
||||
Flang
|
||||
Fortran
|
||||
Fuyu
|
||||
@@ -124,6 +138,7 @@ GitHub
|
||||
Gitpod
|
||||
HBM
|
||||
HCA
|
||||
HGX
|
||||
HIPCC
|
||||
HIPExtension
|
||||
HIPIFY
|
||||
@@ -132,13 +147,16 @@ HPCG
|
||||
HPE
|
||||
HPL
|
||||
HSA
|
||||
HW
|
||||
HWE
|
||||
HWS
|
||||
Haswell
|
||||
Higgs
|
||||
Hyperparameters
|
||||
ICV
|
||||
IDE
|
||||
IDEs
|
||||
IFWI
|
||||
IMDb
|
||||
IOMMU
|
||||
IOP
|
||||
@@ -148,17 +166,21 @@ IRQ
|
||||
ISA
|
||||
ISV
|
||||
ISVs
|
||||
ITL
|
||||
ImageNet
|
||||
InfiniBand
|
||||
Inlines
|
||||
IntelliSense
|
||||
Interop
|
||||
Intersphinx
|
||||
Intra
|
||||
Ioffe
|
||||
JSON
|
||||
Jupyter
|
||||
KFD
|
||||
KFDTest
|
||||
KiB
|
||||
KV
|
||||
KVM
|
||||
Keras
|
||||
Khronos
|
||||
@@ -193,6 +215,8 @@ MVFFR
|
||||
Makefile
|
||||
Makefiles
|
||||
Matplotlib
|
||||
Matrox
|
||||
Megatrends
|
||||
Megatron
|
||||
Mellanox
|
||||
Mellanox's
|
||||
@@ -208,6 +232,7 @@ NIC
|
||||
NICs
|
||||
NLI
|
||||
NLP
|
||||
NPKit
|
||||
NPS
|
||||
NSP
|
||||
NUMA
|
||||
@@ -225,6 +250,7 @@ OAMs
|
||||
OCP
|
||||
OEM
|
||||
OFED
|
||||
OMM
|
||||
OMP
|
||||
OMPI
|
||||
OMPT
|
||||
@@ -240,29 +266,38 @@ OpenMP
|
||||
OpenMPI
|
||||
OpenSSL
|
||||
OpenVX
|
||||
OpenXLA
|
||||
PCC
|
||||
PCI
|
||||
PCIe
|
||||
PEFT
|
||||
PIL
|
||||
PILImage
|
||||
POR
|
||||
PRNG
|
||||
PRs
|
||||
PaLM
|
||||
Pageable
|
||||
PeerDirect
|
||||
PerfDb
|
||||
Perfetto
|
||||
PipelineParallel
|
||||
PnP
|
||||
PowerEdge
|
||||
PowerShell
|
||||
PyPi
|
||||
PyTorch
|
||||
Qcycles
|
||||
RAII
|
||||
RAS
|
||||
RCCL
|
||||
RDC
|
||||
RDMA
|
||||
RDNA
|
||||
README
|
||||
RHEL
|
||||
RNN
|
||||
RNNs
|
||||
ROC
|
||||
ROCProfiler
|
||||
ROCTracer
|
||||
@@ -274,6 +309,7 @@ ROCm
|
||||
ROCmCC
|
||||
ROCmSoftwarePlatform
|
||||
ROCmValidationSuite
|
||||
ROCprofiler
|
||||
ROCr
|
||||
RST
|
||||
RW
|
||||
@@ -288,6 +324,7 @@ SBIOS
|
||||
SCA
|
||||
SDK
|
||||
SDMA
|
||||
SDPA
|
||||
SDRAM
|
||||
SENDMSG
|
||||
SGPR
|
||||
@@ -309,10 +346,12 @@ SRAMECC
|
||||
SVD
|
||||
SWE
|
||||
SerDes
|
||||
ShareGPT
|
||||
Shlens
|
||||
Skylake
|
||||
Softmax
|
||||
Spack
|
||||
SplitK
|
||||
Supermicro
|
||||
Szegedy
|
||||
TCA
|
||||
@@ -323,8 +362,12 @@ TCP
|
||||
TCR
|
||||
TF
|
||||
TFLOPS
|
||||
TP
|
||||
TPU
|
||||
TPUs
|
||||
TSME
|
||||
Tagram
|
||||
TensileLite
|
||||
TensorBoard
|
||||
TensorFlow
|
||||
TensorParallel
|
||||
@@ -341,10 +384,12 @@ UC
|
||||
UCC
|
||||
UCX
|
||||
UIF
|
||||
UMC
|
||||
USM
|
||||
UTCL
|
||||
UTIL
|
||||
Uncached
|
||||
Unittests
|
||||
Unhandled
|
||||
VALU
|
||||
VBIOS
|
||||
@@ -399,8 +444,10 @@ backends
|
||||
benchmarking
|
||||
bfloat
|
||||
bilinear
|
||||
bitcode
|
||||
bitsandbytes
|
||||
blit
|
||||
bootloader
|
||||
boson
|
||||
bosons
|
||||
buildable
|
||||
@@ -423,8 +470,10 @@ composable
|
||||
concretization
|
||||
config
|
||||
conformant
|
||||
constructible
|
||||
convolutional
|
||||
convolves
|
||||
copyable
|
||||
cpp
|
||||
csn
|
||||
cuBLAS
|
||||
@@ -433,6 +482,7 @@ cuLIB
|
||||
cuRAND
|
||||
cuSOLVER
|
||||
cuSPARSE
|
||||
cTDP
|
||||
dataset
|
||||
datasets
|
||||
dataspace
|
||||
@@ -445,6 +495,8 @@ denoise
|
||||
denoised
|
||||
denoises
|
||||
denormalize
|
||||
dequantization
|
||||
dequantizes
|
||||
deserializers
|
||||
detections
|
||||
dev
|
||||
@@ -456,8 +508,9 @@ distro
|
||||
el
|
||||
embeddings
|
||||
enablement
|
||||
endpgm
|
||||
encodings
|
||||
endpgm
|
||||
enqueue
|
||||
env
|
||||
epilog
|
||||
etcetera
|
||||
@@ -467,6 +520,8 @@ executables
|
||||
ffmpeg
|
||||
filesystem
|
||||
fortran
|
||||
fp
|
||||
gRPC
|
||||
galb
|
||||
gcc
|
||||
gdb
|
||||
@@ -474,12 +529,14 @@ gfortran
|
||||
gfx
|
||||
githooks
|
||||
github
|
||||
globals
|
||||
gnupg
|
||||
grayscale
|
||||
gzip
|
||||
heterogenous
|
||||
hipBLAS
|
||||
hipBLASLt
|
||||
hipBLASLt's
|
||||
hipCUB
|
||||
hipFFT
|
||||
hipLIB
|
||||
@@ -496,20 +553,24 @@ hipfort
|
||||
hipify
|
||||
hipsolver
|
||||
hipsparse
|
||||
hotspotting
|
||||
hpc
|
||||
hpp
|
||||
hsa
|
||||
hsakmt
|
||||
hyperparameter
|
||||
iDRAC
|
||||
ib_core
|
||||
inband
|
||||
incrementing
|
||||
inductor
|
||||
inferencing
|
||||
inflight
|
||||
init
|
||||
initializer
|
||||
inlining
|
||||
installable
|
||||
interop
|
||||
interprocedural
|
||||
intra
|
||||
invariants
|
||||
@@ -537,6 +598,7 @@ mivisionx
|
||||
mkdir
|
||||
mlirmiopen
|
||||
mtypes
|
||||
mutex
|
||||
mvffr
|
||||
namespace
|
||||
namespaces
|
||||
@@ -559,23 +621,35 @@ pragma
|
||||
pre
|
||||
prebuilt
|
||||
precompiled
|
||||
preconditioner
|
||||
preconfigured
|
||||
prefetch
|
||||
prefetchable
|
||||
prefill
|
||||
prefills
|
||||
preloaded
|
||||
preprocess
|
||||
preprocessed
|
||||
preprocessing
|
||||
preprocessor
|
||||
prequantized
|
||||
prerequisites
|
||||
profiler
|
||||
profilers
|
||||
protobuf
|
||||
pseudorandom
|
||||
py
|
||||
quantile
|
||||
quantizer
|
||||
quasirandom
|
||||
queueing
|
||||
rccl
|
||||
rdc
|
||||
reStructuredText
|
||||
redirections
|
||||
refactorization
|
||||
reformats
|
||||
repo
|
||||
repos
|
||||
representativeness
|
||||
req
|
||||
@@ -587,10 +661,12 @@ roc
|
||||
rocAL
|
||||
rocALUTION
|
||||
rocBLAS
|
||||
rocDecode
|
||||
rocFFT
|
||||
rocLIB
|
||||
rocMLIR
|
||||
rocPRIM
|
||||
rocPyDecode
|
||||
rocRAND
|
||||
rocSOLVER
|
||||
rocSPARSE
|
||||
@@ -628,11 +704,14 @@ spack
|
||||
src
|
||||
stochastically
|
||||
strided
|
||||
subcommand
|
||||
subdirectory
|
||||
subexpression
|
||||
subfolder
|
||||
subfolders
|
||||
submodule
|
||||
supercomputing
|
||||
symlink
|
||||
td
|
||||
tensorfloat
|
||||
th
|
||||
@@ -652,6 +731,7 @@ txt
|
||||
uarch
|
||||
uncached
|
||||
uncorrectable
|
||||
unhandled
|
||||
uninstallation
|
||||
unsqueeze
|
||||
unstacking
|
||||
@@ -673,10 +753,13 @@ vectorized
|
||||
vectorizer
|
||||
vectorizes
|
||||
vjxb
|
||||
voxel
|
||||
walkthrough
|
||||
walkthroughs
|
||||
watchpoints
|
||||
wavefront
|
||||
wavefronts
|
||||
whitespace
|
||||
whitespaces
|
||||
workgroup
|
||||
workgroups
|
||||
@@ -685,6 +768,7 @@ writebacks
|
||||
wrreq
|
||||
wzo
|
||||
xargs
|
||||
xGMI
|
||||
xz
|
||||
yaml
|
||||
ysvmadyb
|
||||
|
||||
9103 CHANGELOG.md
34 README.md

@@ -21,19 +21,7 @@ source software compilers, debuggers, and libraries. ROCm is fully integrated in

## Getting the ROCm Source Code

AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.

### Installing the repo tool

The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:

```bash
mkdir -p ~/bin/
curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
chmod a+x ~/bin/repo
```

**Note:** The `~/bin/` folder is used as an example. You can specify a different folder to install the repo tool into if you desire.

AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains submodules that point to the correct versions of each of the ROCm components. They can be found in the `/libs` directory of the ROCm repository.
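If you already have a plain clone of this repository, the component sources under `/libs` can be pulled in afterwards with git's standard submodule commands. A minimal sketch (which branch or tag you check out first depends on the ROCm release you are targeting):

```bash
# Fetch every component submodule referenced by the current checkout
git submodule update --init --recursive
```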
### Installing git-lfs

@@ -45,17 +33,12 @@ sudo apt-get install git-lfs

### Downloading the ROCm source code

The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:
The following example shows how to download the ROCm source from this repository.

```bash
mkdir -p ~/ROCm/
cd ~/ROCm/
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.0.x
~/bin/repo sync
git clone https://github.com/ROCm/ROCm -b amd/dgaliffi/submodules-6-2-0 --recurse-submodules
```

**Note:** Using this sample code will cause the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have ssh-keys configured on your machine for your GitHub ID prior to the download as explained at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).

## Building the ROCm source code

Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.

@@ -76,9 +59,8 @@ The Build time will reduce significantly if we limit the GPU Architecture/s agai

mkdir -p ~/WORKSPACE/ # Or any folder name other than WORKSPACE
cd ~/WORKSPACE/
export ROCM_VERSION=6.1.0 # or 6.1.1 6.1.2
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.1.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
export ROCM_VERSION=6.2.0 # or 6.1.1 6.1.2
git clone https://github.com/ROCm/ROCm -b amd/dgaliffi/submodules-${ROCM_VERSION} --recurse-submodules

# --------------------------------------
# Step 2: Prepare build environment

@@ -155,12 +137,6 @@ Note: [Overview for ROCm.mk](tools/rocm-build/README.md)

## ROCm documentation

This repository contains the [manifest file](https://gerrit.googlesource.com/git-repo/+/HEAD/docs/manifest-format.md)
for ROCm releases, changelogs, and release information.

The `default.xml` file contains information for all repositories and the associated commit used to build
the current ROCm release; `default.xml` uses the [Manifest Format repository](https://gerrit.googlesource.com/git-repo/).

Source code for our documentation is located in the `/docs` folder of most ROCm repositories. The
`develop` branch of our repositories contains content for the next ROCm release.
2084 RELEASE.md

```
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
  <remote name="rocm-org" fetch="https://github.com/ROCm/" />
  <default revision="refs/tags/rocm-6.1.2"
  <default revision="refs/tags/rocm-6.2.0"
           remote="rocm-org"
           sync-c="true"
           sync-j="4" />
@@ -10,7 +10,8 @@
  <project name="ROCR-Runtime" />
  <project name="ROCT-Thunk-Interface" />
  <project name="amdsmi" />
  <project name="clang-ocl" />
  <project name="omniperf" />
  <project name="omnitrace" />
  <project name="rdc" />
  <project name="rocm_bandwidth_test" />
  <project name="rocm_smi_lib" />
@@ -18,6 +19,7 @@
  <project name="rocminfo" />
  <project name="rocprofiler" />
  <project name="rocprofiler-register" />
  <project name="rocprofiler-sdk" />
  <project name="roctracer" />
  <!--HIP Projects-->
  <project name="HIP" />
@@ -51,9 +53,11 @@
  <project groups="mathlibs" name="hipTensor" />
  <project groups="mathlibs" name="hipfort" />
  <project groups="mathlibs" name="rccl" />
  <project groups="mathlibs" name="rocAL" />
  <project groups="mathlibs" name="rocALUTION" />
  <project groups="mathlibs" name="rocBLAS" />
  <project groups="mathlibs" name="rocDecode" />
  <project groups="mathlibs" name="rocPyDecode" />
  <project groups="mathlibs" name="rocFFT" />
  <project groups="mathlibs" name="rocPRIM" />
  <project groups="mathlibs" name="rocRAND" />
```
@@ -1,482 +0,0 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="OpenMP support in ROCm">
|
||||
<meta name="keywords" content="OpenMP, LLVM, OpenMP toolchain">
|
||||
</head>
|
||||
|
||||
# OpenMP support in ROCm
|
||||
|
||||
## Introduction
|
||||
|
||||
The ROCm™ installation includes an LLVM-based implementation that fully supports
|
||||
the OpenMP 4.5 standard and a subset of OpenMP 5.0, 5.1, and 5.2 standards.
|
||||
Fortran, C/C++ compilers, and corresponding runtime libraries are included.
|
||||
Along with host APIs, the OpenMP compilers support offloading code and data onto
|
||||
GPU devices. This document briefly describes the installation location of the
|
||||
OpenMP toolchain, example usage of device offloading, and usage of `rocprof`
|
||||
with OpenMP applications. The GPUs supported are the same as those supported by
|
||||
this ROCm release. See the list of supported GPUs for {doc}`Linux<rocm-install-on-linux:reference/system-requirements>` and
|
||||
{doc}`Windows<rocm-install-on-windows:reference/system-requirements>`.
|
||||
|
||||
The ROCm OpenMP compiler is implemented using LLVM compiler technology.
|
||||
The following image illustrates the internal steps taken to translate a user’s application into an executable that can offload computation to the AMDGPU. The compilation is a two-pass process. Pass 1 compiles the application to generate the CPU code and Pass 2 links the CPU code to the AMDGPU device code.
|
||||
|
||||

|
||||
|
||||
### Installation
|
||||
|
||||
The OpenMP toolchain is automatically installed as part of the standard ROCm
|
||||
installation and is available under `/opt/rocm-{version}/llvm`. The
|
||||
sub-directories are:
|
||||
|
||||
* bin: Compilers (`flang` and `clang`) and other binaries.
|
||||
* examples: The usage section below shows how to compile and run these programs.
|
||||
* include: Header files.
|
||||
* lib: Libraries including those required for target offload.
|
||||
* lib-debug: Debug versions of the above libraries.
|
||||
|
||||
## OpenMP: usage
|
||||
|
||||
The example programs can be compiled and run by pointing the environment
|
||||
variable `ROCM_PATH` to the ROCm install directory.
|
||||
|
||||
**Example:**
|
||||
|
||||
```bash
|
||||
export ROCM_PATH=/opt/rocm-{version}
|
||||
cd $ROCM_PATH/share/openmp-extras/examples/openmp/veccopy
|
||||
sudo make run
|
||||
```
|
||||
|
||||
:::{note}
|
||||
`sudo` is required since we are building inside the `/opt` directory.
|
||||
Alternatively, copy the files to your home directory first.
|
||||
:::
|
||||
|
||||
The above invocation of Make compiles and runs the program. Note the options
|
||||
that are required for target offload from an OpenMP program:
|
||||
|
||||
```bash
|
||||
-fopenmp --offload-arch=<gpu-arch>
|
||||
```
|
||||
|
||||
:::{note}
|
||||
The compiler also accepts the alternative offloading notation:
|
||||
|
||||
```bash
|
||||
-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=<gpu-arch>
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
Obtain the value of `gpu-arch` by running the following command:
|
||||
|
||||
```bash
|
||||
% /opt/rocm-{version}/bin/rocminfo | grep gfx
|
||||
```
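As a minimal end-to-end sketch (assuming `rocminfo` reports `gfx90a`; the file name is illustrative), a small offloaded loop can be compiled and run as follows:

```bash
$ cat veccopy.c
#include <stdio.h>
#define N 1024
int main() {
  int a[N], b[N];
  for (int i = 0; i < N; i++) b[i] = i;
  // Offload the copy loop; a and b are mapped explicitly
  #pragma omp target teams distribute parallel for map(to: b) map(from: a)
  for (int i = 0; i < N; i++) a[i] = b[i];
  printf("a[%d] = %d\n", N - 1, a[N - 1]);
  return 0;
}
$ $ROCM_PATH/llvm/bin/clang -O2 -fopenmp --offload-arch=gfx90a veccopy.c -o veccopy
$ ./veccopy
```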
|
||||
|
||||
[//]: # (dated link below, needs updating)
|
||||
|
||||
See the complete list of [compiler command-line references](https://github.com/ROCm/llvm-project/blob/amd-staging/openmp/docs/CommandLineArgumentReference.rst).
|
||||
|
||||
### Using `rocprof` with OpenMP
|
||||
|
||||
The following steps describe a typical workflow for using `rocprof` with OpenMP
|
||||
code compiled with AOMP:
|
||||
|
||||
1. Run `rocprof` with the program command line:
|
||||
|
||||
```bash
|
||||
% rocprof <application> <args>
|
||||
```
|
||||
|
||||
This produces a `results.csv` file in the user’s current directory that
|
||||
shows basic stats such as kernel names, grid size, number of registers used,
|
||||
etc. The user can choose to specify the preferred output file name using the
`-o` option.
|
||||
|
||||
2. Add options for a detailed result:
|
||||
|
||||
```bash
|
||||
--stats: % rocprof --stats <application> <args>
|
||||
```
|
||||
|
||||
The stats option produces timestamps for the kernels. Look into the output
|
||||
CSV file for the field, `DurationNs`, which is useful in getting an
|
||||
understanding of the critical kernels in the code.
|
||||
|
||||
Apart from `--stats`, the option `--timestamp on` also produces timestamps for
|
||||
the kernels.
|
||||
|
||||
3. After learning about the required kernels, the user can take a detailed look
|
||||
at each one of them. `rocprof` has support for hardware counters: a set of
|
||||
basic counters and a set of derived ones. See the complete list of counters using the
options `--list-basic` and `--list-derived`. `rocprof` accepts either a text or
|
||||
an XML file as an input.
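For example, a plain-text counter input file might look like the following sketch; the counter names are illustrative, so pick names reported by `--list-basic` or `--list-derived` on your system:

```bash
$ cat input.txt
# Collect two hardware counters for every kernel dispatch
pmc : SQ_WAVES GRBM_COUNT
$ rocprof -i input.txt -o counters.csv <application> <args>
```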
|
||||
|
||||
For more details on `rocprof`, refer to the {doc}`ROCProfilerV1 User Manual <rocprofiler:rocprofv1>`.
|
||||
|
||||
### Using tracing options
|
||||
|
||||
**Prerequisite:** When using the `--sys-trace` option, compile the OpenMP
|
||||
program with:
|
||||
|
||||
```bash
|
||||
-Wl,-rpath,/opt/rocm-{version}/lib -lamdhip64
|
||||
```
|
||||
|
||||
The following tracing options are widely used to generate useful information:
|
||||
|
||||
* **`--hsa-trace`**: This option is used to get a JSON output file with the HSA
|
||||
API execution traces and a flat profile in a CSV file.
|
||||
|
||||
* **`--sys-trace`**: This allows programmers to trace both HIP and HSA calls.
|
||||
Since this option results in loading ``libamdhip64.so``, follow the
|
||||
prerequisite as mentioned above.
|
||||
|
||||
A CSV and a JSON file are produced by the above trace options. The CSV file
|
||||
presents the data in a tabular format, and the JSON file can be visualized using
|
||||
Google Chrome at chrome://tracing/ or [Perfetto](https://perfetto.dev/).
|
||||
Navigate to Chrome or Perfetto and load the JSON file to see the timeline of the
|
||||
HSA calls.
|
||||
|
||||
For more details on tracing, refer to the {doc}`ROCProfilerV1 User Manual <rocprofiler:rocprofv1>`.
|
||||
|
||||
### Environment variables
|
||||
|
||||
:::{table}
|
||||
:widths: auto
|
||||
| Environment Variable | Purpose |
|
||||
| --------------------------- | ---------------------------- |
|
||||
| `OMP_NUM_TEAMS` | To set the number of teams for kernel launch, which is otherwise chosen by the implementation by default. You can set this number (subject to implementation limits) for performance tuning. |
|
||||
| `LIBOMPTARGET_KERNEL_TRACE` | To print useful statistics for device operations. Setting it to 1 and running the program emits the name of every kernel launched, the number of teams and threads used, and the corresponding register usage. Setting it to 2 additionally emits timing information for kernel launches and data transfer operations between the host and the device. |
|
||||
| `LIBOMPTARGET_INFO` | To print informational messages from the device runtime as the program executes. Setting it to a value of 1 or higher prints fine-grained information, and setting it to -1 prints complete information. |
|
||||
| `LIBOMPTARGET_DEBUG` | To get detailed debugging information about data transfer operations and kernel launch when using a debug version of the device library. Set this environment variable to 1 to get the detailed information from the library. |
|
||||
| `GPU_MAX_HW_QUEUES` | To set the number of HSA queues in the OpenMP runtime. The HSA queues are created on demand up to the maximum value as supplied here. The queue creation starts with a single initialized queue to avoid unnecessary allocation of resources. The provided value is capped if it exceeds the recommended, device-specific value. |
|
||||
| `LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES` | To set the threshold size up to which data transfers are initiated asynchronously. The default threshold size is 1*1024*1024 bytes (1MB). |
|
||||
| `OMPX_FORCE_SYNC_REGIONS` | To force the runtime to execute all operations synchronously, i.e., wait for an operation to complete immediately. This affects data transfers and kernel execution. While it is mainly designed for debugging, it may have a minor positive effect on performance in certain situations. |
|
||||
:::
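These variables are simply set in the environment of the run. For example, a debugging session might look like the following sketch (the application name and values are illustrative):

```bash
# Print per-kernel launch statistics and cap the runtime at 4 HSA queues
export LIBOMPTARGET_KERNEL_TRACE=1
export GPU_MAX_HW_QUEUES=4
./veccopy
```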
|
||||
|
||||
## OpenMP: features
|
||||
|
||||
The OpenMP programming model is greatly enhanced with the following new features
|
||||
implemented in the past releases.
|
||||
|
||||
(openmp_usm)=
|
||||
|
||||
### Asynchronous behavior in OpenMP target regions
|
||||
|
||||
* Controlling Asynchronous Behavior
|
||||
|
||||
The OpenMP offloading runtime executes in an asynchronous fashion by default, allowing multiple data transfers to start concurrently. However, if the data to be transferred becomes larger than the default threshold of 1MB, the runtime falls back to a synchronous data transfer. Data transfers to or from buffers that have already been locked (pinned) are always performed asynchronously.
|
||||
You can overrule this default behavior by setting `LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES` and `OMPX_FORCE_SYNC_REGIONS`. See the [Environment Variables](#environment-variables) table for details.
|
||||
|
||||
* Multithreaded Offloading on the Same Device
|
||||
|
||||
The `libomptarget` plugin for GPU offloading allows creation of separate configurable HSA queues per chiplet, which enables two or more threads to concurrently offload to the same device.
|
||||
|
||||
* Parallel Memory Copy Invocations
|
||||
|
||||
Implicit asynchronous execution of a single target region enables parallel memory copy invocations.
|
||||
|
||||
### Unified shared memory
|
||||
|
||||
Unified Shared Memory (USM) provides a pointer-based approach to memory
|
||||
management. To implement USM, fulfill the following system requirements along
|
||||
with Xnack capability.
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
* Linux Kernel versions above 5.14
|
||||
* Latest KFD driver packaged in ROCm stack
|
||||
* Xnack, as USM support can only be tested with applications compiled with Xnack
|
||||
capability
|
||||
|
||||
#### Xnack capability
|
||||
|
||||
When enabled, Xnack capability allows GPU threads to access CPU (system) memory,
|
||||
allocated with OS-allocators, such as `malloc`, `new`, and `mmap`. Xnack must be
|
||||
enabled both at compile- and run-time. To enable Xnack support at compile-time,
|
||||
use:
|
||||
|
||||
```bash
|
||||
--offload-arch=gfx908:xnack+
|
||||
```
|
||||
|
||||
Or use the functionally equivalent Xnack-any option:
|
||||
|
||||
```bash
|
||||
--offload-arch=gfx908
|
||||
```
|
||||
|
||||
To enable Xnack functionality at runtime on a per-application basis,
|
||||
use environment variable:
|
||||
|
||||
```bash
|
||||
HSA_XNACK=1
|
||||
```
|
||||
|
||||
When Xnack support is not needed:
|
||||
|
||||
* Build the applications to maximize resource utilization using:
|
||||
|
||||
```bash
|
||||
--offload-arch=gfx908:xnack-
|
||||
```
|
||||
|
||||
* At runtime, set the `HSA_XNACK` environment variable to 0.
|
||||
|
||||
#### Unified shared memory pragma
|
||||
|
||||
This OpenMP pragma is available on MI200 through `xnack+` support.
|
||||
|
||||
```bash
|
||||
#pragma omp requires unified_shared_memory
|
||||
```
|
||||
|
||||
As stated in the OpenMP specifications, this pragma makes the map clause on
|
||||
target constructs optional. By default, on MI200, all memory allocated on the
|
||||
host is fine grain. Using the map clause on a target construct is still allowed, which
|
||||
transforms the access semantics of the associated memory to coarse grain.
|
||||
|
||||
A simple program demonstrating the use of this feature is:

```bash
$ cat parallel_for.cpp
#include <stdlib.h>
#include <stdio.h>

#define N 64
#pragma omp requires unified_shared_memory
int main() {
  int n = N;
  int *a = new int[n];
  int *b = new int[n];

  for(int i = 0; i < n; i++)
    b[i] = i;

  #pragma omp target parallel for map(to:b[:n])
  for(int i = 0; i < n; i++)
    a[i] = b[i];

  for(int i = 0; i < n; i++)
    if(a[i] != i)
      printf("error at %d: expected %d, got %d\n", i, i, a[i]);

  return 0;
}
$ clang++ -O2 -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:xnack+ parallel_for.cpp
$ HSA_XNACK=1 ./a.out
```
|
||||
|
||||
In the above code example, pointer “a” is not mapped in the target region, while
|
||||
pointer “b” is. Both are valid pointers on the GPU device and passed by-value to
|
||||
the kernel implementing the target region. This means the pointer values on the
|
||||
host and the device are the same.
|
||||
|
||||
The difference between the memory pages pointed to by these two variables is
|
||||
that the pages pointed by “a” are in fine-grain memory, while the pages pointed
|
||||
to by “b” are in coarse-grain memory during and after the execution of the
|
||||
target region. This is accomplished in the OpenMP runtime library with calls to
|
||||
the ROCr runtime to set the pages pointed by “b” as coarse grain.
|
||||
|
||||
### OMPT target support
|
||||
|
||||
The OpenMP runtime in ROCm implements a subset of the OMPT device APIs, as
|
||||
described in the OpenMP specification document. These APIs allow first-party
|
||||
tools to examine the profile and kernel traces that execute on a device. A tool
|
||||
can register callbacks for data transfer and kernel dispatch entry points or use
|
||||
APIs to start and stop tracing for device-related activities such as data
|
||||
transfer and kernel dispatch timings and associated metadata. If device tracing
|
||||
is enabled, trace records for device activities are collected during program
|
||||
execution and returned to the tool using the APIs described in the
|
||||
specification.
|
||||
|
||||
The following example demonstrates how a tool uses the supported OMPT target
|
||||
APIs. The `README` in `/opt/rocm/llvm/examples/tools/ompt` outlines the steps to
|
||||
be followed, and the provided example can be run as shown below:
|
||||
|
||||
```bash
|
||||
cd $ROCM_PATH/share/openmp-extras/examples/tools/ompt/veccopy-ompt-target-tracing
|
||||
sudo make run
|
||||
```
|
||||
|
||||
The file `veccopy-ompt-target-tracing.c` simulates how a tool initiates device
|
||||
activity tracing. The file `callbacks.h` shows the callbacks registered and
|
||||
implemented by the tool.
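As a rough sketch of the registration mechanism only (the shipped example demonstrates the complete, supported usage), a first-party tool exports `ompt_start_tool`, obtains `ompt_set_callback` through the lookup function, and registers handlers for target-related events. The handler body and the build and run commands below are illustrative assumptions:

```bash
$ cat minimal_ompt_tool.c
#include <stdio.h>
#include <omp-tools.h>

// Handler for target-region begin/end events.
static void on_target(ompt_target_t kind, ompt_scope_endpoint_t endpoint,
                      int device_num, ompt_data_t *task_data,
                      ompt_id_t target_id, const void *codeptr_ra) {
  printf("target event on device %d\n", device_num);
}

// Called by the OpenMP runtime; 'lookup' retrieves entry points such as ompt_set_callback.
static int tool_initialize(ompt_function_lookup_t lookup, int initial_device_num,
                           ompt_data_t *tool_data) {
  ompt_set_callback_t set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
  set_callback(ompt_callback_target, (ompt_callback_t)on_target);
  return 1; // non-zero keeps the tool active
}

static void tool_finalize(ompt_data_t *tool_data) { printf("OMPT tool finalized\n"); }

// Entry point the OpenMP runtime looks for at startup.
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {tool_initialize, tool_finalize, {0}};
  return &result;
}
$ $ROCM_PATH/llvm/bin/clang -fPIC -shared -fopenmp minimal_ompt_tool.c -o libompt_tool.so
$ OMP_TOOL_LIBRARIES=$PWD/libompt_tool.so ./veccopy
```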
|
||||
|
||||
### Floating point atomic operations
|
||||
|
||||
The MI200-series GPUs support the generation of hardware floating-point atomics
|
||||
using the OpenMP atomic pragma. The support includes single- and
|
||||
double-precision floating-point atomic operations. The programmer must ensure
|
||||
that the memory subjected to the atomic operation is in coarse-grain memory by
|
||||
mapping it explicitly with the help of map clauses when not implicitly mapped by
|
||||
the compiler as per the [OpenMP
|
||||
specifications](https://www.openmp.org/specifications/). This makes these
|
||||
hardware floating-point atomic instructions “fast,” as they are faster than
|
||||
using a default compare-and-swap loop scheme, but at the same time “unsafe,” as
|
||||
they are not supported on fine-grain memory. The operation in
|
||||
`unified_shared_memory` mode also requires programmers to map the memory
|
||||
explicitly when not implicitly mapped by the compiler.
|
||||
|
||||
To request fast floating-point atomic instructions at the file level, use
|
||||
compiler flag `-munsafe-fp-atomics` or a hint clause on a specific pragma:
|
||||
|
||||
```bash
|
||||
double a = 0.0;
|
||||
#pragma omp atomic hint(AMD_fast_fp_atomics)
|
||||
a = a + 1.0;
|
||||
```
|
||||
|
||||
:::{note}
|
||||
`AMD_unsafe_fp_atomics` is an alias for `AMD_fast_fp_atomics`, and
|
||||
`AMD_safe_fp_atomics` is implemented with a compare-and-swap loop.
|
||||
:::
|
||||
|
||||
To disable the generation of fast floating-point atomic instructions at the file
|
||||
level, build using the option `-msafe-fp-atomics` or use a hint clause on a
|
||||
specific pragma:
|
||||
|
||||
```bash
|
||||
double a = 0.0;
|
||||
#pragma omp atomic hint(AMD_safe_fp_atomics)
|
||||
a = a + 1.0;
|
||||
```
|
||||
|
||||
The hint clause value always takes precedence over the compiler flag, which
|
||||
allows programmers to create atomic constructs with a different behavior than
|
||||
the rest of the file.
|
||||
|
||||
See the example below, where the user builds the program using
|
||||
`-msafe-fp-atomics` to select a file-wide “safe atomic” compilation. However,
|
||||
the fast atomics hint clause over variable “a” takes precedence and operates on
|
||||
“a” using a fast/unsafe floating-point atomic, while the variable “b” in the
|
||||
absence of a hint clause is operated upon using safe floating-point atomics as
|
||||
per the compiler flag.
|
||||
|
||||
```bash
|
||||
double a = 0.0;
|
||||
#pragma omp atomic hint(AMD_fast_fp_atomics)
|
||||
a = a + 1.0;
|
||||
|
||||
double b = 0.0;
|
||||
#pragma omp atomic
|
||||
b = b + 1.0;
|
||||
```
|
||||
|
||||
### AddressSanitizer tool
|
||||
|
||||
AddressSanitizer (ASan) is a memory error detector tool utilized by applications to
|
||||
detect various errors ranging from spatial issues such as out-of-bounds access to
|
||||
temporal issues such as use-after-free. The AOMP compiler supports ASan for AMD
|
||||
GPUs with applications written in both HIP and OpenMP.
|
||||
|
||||
**Features supported on host platform (Target x86_64):**
|
||||
|
||||
* Use-after-free
|
||||
* Buffer overflows
|
||||
* Heap buffer overflow
|
||||
* Stack buffer overflow
|
||||
* Global buffer overflow
|
||||
* Use-after-return
|
||||
* Use-after-scope
|
||||
* Initialization order bugs
|
||||
|
||||
**Features supported on AMDGPU platform (`amdgcn-amd-amdhsa`):**
|
||||
|
||||
* Heap buffer overflow
|
||||
* Global buffer overflow
|
||||
|
||||
**Software (kernel/OS) requirements:** Unified Shared Memory support with Xnack
|
||||
capability. See the section on [Unified Shared Memory](#unified-shared-memory)
|
||||
for prerequisites and details on Xnack.
|
||||
|
||||
**Example:**
|
||||
|
||||
* Heap buffer overflow
|
||||
|
||||
```bash
|
||||
int main() {
  .......   // Some program statements
  .......   // Some program statements
  #pragma omp target map(to : A[0:N], B[0:N]) map(from: C[0:N])
  {
    #pragma omp parallel for
    for(int i = 0; i < N; i++){
      C[i+10] = A[i] + B[i];
    } // end of for loop
  }
  .......   // Some program statements
} // end of main
|
||||
```
|
||||
|
||||
See the complete sample code for heap buffer overflow
|
||||
[here](https://github.com/ROCm/aomp/blob/aomp-dev/examples/tools/asan/heap_buffer_overflow/openmp/vecadd-HBO.cpp).
|
||||
|
||||
* Global buffer overflow
|
||||
|
||||
```bash
|
||||
#pragma omp declare target
|
||||
int A[N],B[N],C[N];
|
||||
#pragma omp end declare target
|
||||
int main(){
  ......   // some program statements
  ......   // some program statements
  #pragma omp target data map(to:A[0:N],B[0:N]) map(from: C[0:N])
  {
    #pragma omp target update to(A,B)
    #pragma omp target parallel for
    for(int i = 0; i < N; i++){
      C[i] = A[i*100] + B[i+22];
    } // end of for loop
    #pragma omp target update from(C)
  }
  ........ // some program statements
} // end of main
|
||||
```
|
||||
|
||||
See the complete sample code for global buffer overflow
|
||||
[here](https://github.com/ROCm/aomp/blob/aomp-dev/examples/tools/asan/global_buffer_overflow/openmp/vecadd-GBO.cpp).
|
||||
|
||||
### Clang compiler option for kernel optimization
|
||||
|
||||
You can use the clang compiler option `-fopenmp-target-fast` for kernel optimization if certain constraints implied by its component options are satisfied. `-fopenmp-target-fast` enables the following options:
|
||||
|
||||
* `-fopenmp-target-ignore-env-vars`: It enables code generation of specialized kernels, including no-loop and cross-team reduction kernels.
|
||||
|
||||
* `-fopenmp-assume-no-thread-state`: It enables the compiler to assume that no thread in a parallel region modifies an Internal Control Variable (`ICV`), thus potentially reducing the device runtime code execution.
|
||||
|
||||
* `-fopenmp-assume-no-nested-parallelism`: It enables the compiler to assume that no thread in a parallel region encounters a parallel region, thus potentially reducing the device runtime code execution.
|
||||
|
||||
* `-O3` if no `-O*` is specified by the user.
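A typical invocation is sketched below; the source file name and architecture are illustrative:

```bash
# Implies -fopenmp-target-ignore-env-vars, -fopenmp-assume-no-thread-state,
# -fopenmp-assume-no-nested-parallelism, and -O3 (when no other -O level is given)
clang -fopenmp --offload-arch=gfx90a -fopenmp-target-fast reduction.c -o reduction
```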
|
||||
|
||||
### Specialized kernels
|
||||
|
||||
Clang will attempt to generate specialized kernels based on compiler options and OpenMP constructs. The following specialized kernels are supported:
|
||||
|
||||
* No-loop
|
||||
* Big-jump-loop
|
||||
* Cross-team reductions
|
||||
|
||||
To enable the generation of specialized kernels, follow these guidelines:
|
||||
|
||||
* Do not specify teams, threads, and schedule-related environment variables. The `num_teams` clause in an OpenMP target construct acts as an override and prevents the generation of the no-loop kernel. If specifying the `num_teams` clause is a user requirement, clang tries to generate the big-jump-loop kernel instead of the no-loop kernel.
|
||||
|
||||
* Assert the absence of the teams, threads, and schedule-related environment variables by adding the command-line option `-fopenmp-target-ignore-env-vars`.
|
||||
|
||||
* To automatically enable the specialized kernel generation, use `-Ofast` or `-fopenmp-target-fast` for compilation.
|
||||
|
||||
* To disable specialized kernel generation, use `-fno-openmp-target-ignore-env-vars`.
|
||||
|
||||
#### No-loop kernel generation
|
||||
|
||||
The no-loop kernel generation feature optimizes kernel performance by generating a specialized kernel for certain OpenMP target constructs such as `target teams distribute parallel for`. The specialized kernel generation feature assumes every thread executes a single iteration of the user loop, which leads the runtime to launch a total number of GPU threads equal to or greater than the iteration space size of the target region loop. This allows the compiler to generate code for the loop body without an enclosing loop, resulting in reduced control-flow complexity and potentially better performance.
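A loop of the following shape (a sketch; the array names are illustrative) is a candidate for no-loop code generation when compiled with `-fopenmp-target-fast` and no overriding clauses or environment variables:

```bash
// Each GPU thread executes exactly one iteration of the loop body
#pragma omp target teams distribute parallel for map(to: b[:n]) map(from: a[:n])
for (int i = 0; i < n; i++)
  a[i] = 2.0 * b[i];
```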
|
||||
|
||||
#### Big-jump-loop kernel generation
|
||||
|
||||
A no-loop kernel is not generated if the OpenMP teams construct uses a `num_teams` clause. Instead, the compiler attempts to generate a different specialized kernel called the big-jump-loop kernel. The kernel is launched with a grid size determined by the number of teams specified by the OpenMP `num_teams` clause and the block size chosen either by the compiler or specified by the corresponding OpenMP clause.
|
||||
|
||||
#### Cross-team optimized reduction kernel generation
|
||||
|
||||
If the OpenMP construct has a reduction clause, the compiler attempts to generate optimized code by utilizing efficient cross-team communication. New APIs for cross-team reduction are implemented in the device runtime and are automatically generated by clang.
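For instance, a construct of the following form (a sketch; `sum`, `a`, and `n` are illustrative) can be lowered to a cross-team optimized reduction kernel:

```bash
double sum = 0.0;
// The reduction over 'sum' uses the device runtime's cross-team reduction support
#pragma omp target teams distribute parallel for reduction(+: sum) map(to: a[:n])
for (int i = 0; i < n; i++)
  sum += a[i];
```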
|
||||
@@ -25,66 +25,69 @@ additional licenses. Please review individual repositories for more information.
|
||||
<!-- spellcheck-disable -->
|
||||
| Component | License |
|
||||
|:---------------------|:-------------------------|
|
||||
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
|
||||
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
|
||||
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
|
||||
| [AMDMIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
|
||||
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
|
||||
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
|
||||
| [AMD Common Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/develop/LICENCE) |
|
||||
| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
|
||||
| [hipamd](https://github.com/ROCm/clr/tree/develop/hipamd) | [MIT](https://github.com/ROCm/clr/blob/develop/hipamd/LICENSE.txt) |
|
||||
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/develop/opencl) | [MIT](https://github.com/ROCm/clr/blob/develop/opencl/LICENSE.txt) |
|
||||
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
|
||||
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/develop/LICENSE) |
|
||||
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
|
||||
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
|
||||
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
|
||||
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
|
||||
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
|
||||
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
|
||||
| [clang-ocl](https://github.com/ROCm/clang-ocl/) | [MIT](https://github.com/ROCm/clang-ocl/blob/master/LICENSE) |
|
||||
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
|
||||
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
|
||||
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/master/LICENSE.txt) |
|
||||
| [ROCR Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
|
||||
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
|
||||
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
|
||||
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
|
||||
| [hipamd](https://github.com/ROCm/clr/tree/develop/hipamd) | [MIT](https://github.com/ROCm/clr/blob/develop/hipamd/LICENSE.txt) |
|
||||
| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
|
||||
| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
|
||||
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
|
||||
| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
|
||||
| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
|
||||
| [hipFORT](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
|
||||
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
|
||||
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
|
||||
| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
|
||||
| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
|
||||
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
|
||||
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
|
||||
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
|
||||
| hsa-amd-aqlprofile | [AMD Software EULA](https://www.amd.com/en/legal/eula/amd-software-eula.html) |
|
||||
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
|
||||
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
|
||||
| [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
|
||||
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
|
||||
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
|
||||
| [Omniperf](https://github.com/ROCm/omniperf) | [MIT](https://github.com/ROCm/omniperf/blob/main/LICENSE) |
|
||||
| [Omnitrace](https://github.com/ROCm/omnitrace) | [MIT](https://github.com/ROCm/omnitrace/blob/main/LICENSE) |
|
||||
| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
|
||||
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
|
||||
| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
|
||||
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
|
||||
| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
|
||||
| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
|
||||
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
|
||||
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
|
||||
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
|
||||
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
|
||||
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
|
||||
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
|
||||
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
|
||||
| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
|
||||
| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/develop/LICENSE) |
|
||||
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
|
||||
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/develop/opencl) | [MIT](https://github.com/ROCm/clr/blob/develop/opencl/LICENSE.txt) |
|
||||
| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
|
||||
| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/develop/License.txt) |
|
||||
| [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
|
||||
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
|
||||
| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
|
||||
| [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
|
||||
| [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE) |
|
||||
| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
|
||||
| [ROCr Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
|
||||
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/master/LICENSE.txt) |
|
||||
| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
|
||||
| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
|
||||
| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
|
||||
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
|
||||
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
|
||||
| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/develop/LICENSE) |
|
||||
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
|
||||
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
|
||||
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
|
||||
| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/develop/License.txt) |
|
||||
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/develop/LICENSE) |
|
||||
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
|
||||
| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
|
||||
| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
|
||||
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
|
||||
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
|
||||
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
|
||||
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
|
||||
| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |
|
||||
| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
|
||||
| hsa-amd-aqlprofile | [AMD Software EULA](https://www.amd.com/en/legal/eula/amd-software-eula.html) |
|
||||
|
||||
Open-source ROCm components are released via public GitHub
repositories, packages on [https://repo.radeon.com](https://repo.radeon.com), and other distribution channels.
|
||||
|
||||
@@ -8,121 +8,169 @@ Compatibility matrix
|
||||
|
||||
Use this matrix to view the ROCm compatibility across successive major and minor releases.
|
||||
|
||||
You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-rocm-compatibility-matrix>`.
|
||||
|
||||
.. container:: format-big-table
|
||||
|
||||
.. csv-table::
|
||||
:header: "ROCm Version", "6.1.0", "6.0.0"
|
||||
:header: "ROCm Version", "6.2.0", "6.1.2", "6.0.0"
|
||||
:stub-columns: 1
|
||||
|
||||
:doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`, "Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
|
||||
,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 9.4 [#red-hat94]_, 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,"RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,"SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 8.9 [#oracle89]_"
|
||||
,,
|
||||
:doc:`GFX Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2
|
||||
,CDNA,CDNA
|
||||
,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2
|
||||
,,
|
||||
:doc:`GFX Card <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030
|
||||
,gfx942 [#]_, gfx942 [#]_
|
||||
,gfx90a,gfx90a
|
||||
,gfx908,gfx908
|
||||
,,
|
||||
ECOSYSTEM SUPPORT:,,
|
||||
:doc:`PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`,"2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`,"2.15, 2.14, 2.13","2.14, 2.13, 2.12"
|
||||
:doc:`JAX <rocm-install-on-linux:how-to/3rd-party/jax-install>`,0.4.26,0.4.26
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.14.1
|
||||
,,
|
||||
3RD PARTY COMMUNICATION LIBS:,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.14.1,>=1.14.1
|
||||
,,
|
||||
3RD PARTY ALGORITHM LIBS:,,
|
||||
Thrust,2.1.0,2.0.1
|
||||
CUB,2.1.0,2.0.1
|
||||
,,
|
||||
ML & COMPUTER VISION LIBS:,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.9.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.1.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,2.5.0,2.5.0
|
||||
:doc:`rocDecode <rocdecode:index>`,0.5.0,N/A
|
||||
:doc:`ROCm Performance Primitives (RPP) <rpp:index>`,1.5.0,1.4.0
|
||||
,,
|
||||
COMMUNICATION:,,
|
||||
:doc:`RCCL <rccl:index>`,2.18.6,2.18.3
|
||||
,,
|
||||
MATH LIBS:,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,2.1.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,0.7.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.14,1.0.13
|
||||
:doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,2.1.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,3.0.1,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,3.1.1,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,4.1.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.27,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,3.0.1,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.25.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,3.1.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,1.4.0,1.3.0
|
||||
`Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.39.0
|
||||
,,
|
||||
PRIMITIVES:,,
|
||||
:doc:`hipCUB <hipcub:index>`,3.1.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,1.2.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,3.1.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.0
|
||||
,,
|
||||
SUPPORT LIBS:,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,6.1.40091,6.0.32830
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.12.0,0.11.0
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.1.0,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240125.3.30,20231016.2.245
|
||||
,,
|
||||
TOOLS:,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,24.4.1,23.4.2
|
||||
:doc:`HIPIFY <hipify:index>`,17.0.0,17.0.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.71.0,0.71.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60100,2.0.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.3.0,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.60100,4.1.0
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,14.1.0,13.2.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.0.0,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.1.0,rocm-6.0.0
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3
|
||||
:doc:`TransferBench <transferbench:index>`,1.48,1.46
|
||||
,,
|
||||
COMPILERS:,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,0.5.0,0.5.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,17.0.0.24103,17.0.0.23483
|
||||
`llvm-project <https://github.com/ROCm/llvm-project>`_,17.0.0.24103,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,17.0.0.24103,17.0.0.23483
|
||||
,,
|
||||
RUNTIMES:,,
|
||||
:doc:`HIP <hip:index>`,6.1.40091,6.0.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0
|
||||
:doc:`ROCR-Runtime <rocr-runtime:index>`,1.13.0,1.12.0
|
||||
:doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`, "Ubuntu 24.04","",""
|
||||
,"Ubuntu 22.04.5 [#Ubuntu220405]_, 22.04.4","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
|
||||
,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 9.4, 9.3","RHEL 9.4 [#red-hat94]_, 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,"RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,"SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 8.9 [#oracle89]_","Oracle Linux 8.9 [#oracle89]_",""
|
||||
,".. _architecture-support-compatibility-matrix:",,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA
|
||||
,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2
|
||||
,".. _gpu-support-compatibility-matrix:",,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030
|
||||
,gfx942 [#mi300_620]_, gfx942 [#mi300_612]_, gfx942 [#mi300_600]_
|
||||
,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908
|
||||
,,,
|
||||
FRAMEWORK SUPPORT,".. _framework-support-compatibility-matrix:",,
|
||||
:doc:`PyTorch <rocm-install-on-linux:install/3rd-party/pytorch-install>`,"2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <rocm-install-on-linux:install/3rd-party/tensorflow-install>`,"2.16.1, 2.15.1, 2.14.1","2.15, 2.14, 2.13","2.14, 2.13, 2.12"
|
||||
:doc:`JAX <rocm-install-on-linux:install/3rd-party/jax-install>`,0.4.26,0.4.26,0.4.26
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.17.3,1.14.1
|
||||
,,,
|
||||
THIRD PARTY COMMS,".. _thirdpartycomms-support-compatibility-matrix:",,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.2.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.14.1,>=1.14.1
|
||||
,,,
|
||||
THIRD PARTY ALGORITHM,".. _thirdpartyalgorithm-support-compatibility-matrix:",,
|
||||
Thrust,2.2.0,2.1.0,2.0.1
|
||||
CUB,2.2.0,2.1.0,2.0.1
|
||||
,,,
|
||||
ML & COMPUTER VISION,".. _mllibs-support-compatibility-matrix:",,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.10.0,2.9.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.2.0,3.1.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.0.0,2.5.0,2.5.0
|
||||
:doc:`rocDecode <rocdecode:index>`,0.6.0,0.6.0,N/A
|
||||
:doc:`RPP <rpp:index>`,1.8.0,1.5.0,1.4.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.1.0,N/A,N/A
|
||||
,,,
|
||||
COMMUNICATION,".. _commlibs-support-compatibility-matrix:",,
|
||||
:doc:`RCCL <rccl:index>`,2.20.5,2.18.6,2.18.3
|
||||
,,,
|
||||
MATH LIBS,".. _mathlibs-support-compatibility-matrix:",,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,2.2.0,2.1.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,0.8.0,0.7.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.14,1.0.14,1.0.13
|
||||
:doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,2.11.0,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,2.2.0,2.1.1,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,3.1.1,3.0.1,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.1,0.2.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,3.2.0,3.1.1,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,4.2.0,4.1.2,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.28,1.0.27,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,3.1.0,3.0.1,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.26.0,3.25.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,3.2.0,3.1.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,1.5.0,1.4.0,1.3.0
|
||||
`Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.40.0,4.39.0
|
||||
,,,
|
||||
PRIMITIVES,".. _primitivelibs-support-compatibility-matrix:",,
|
||||
:doc:`hipCUB <hipcub:index>`,3.2.0,3.1.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,1.3.0,1.2.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,3.2.0,3.1.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.1,3.0.0
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,6.2.41133,6.1.40093,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.2.0,6.1.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240607.1.4246,20240125.5.08,20231016.2.245
|
||||
,,,
|
||||
SYSTEM MGMT TOOLS,".. _tools-support-compatibility-matrix:",,
|
||||
:doc:`AMD SMI <amdsmi:index>`,24.6.2,24.5.1,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.0.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.3.0,7.2.0,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.2.0,rocm-6.1.2,rocm-6.0.0
|
||||
,,,
|
||||
PERFORMANCE TOOLS,,,
|
||||
:doc:`Omniperf <omniperf:index>`,2.0.1,N/A,N/A
|
||||
:doc:`Omnitrace <omnitrace:index>`,1.11.2,N/A,N/A
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60200,2.0.60102,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.4.0,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.60200,4.1.60102,4.1.60000
|
||||
,,,
|
||||
DEVELOPMENT TOOLS,,,
|
||||
:doc:`HIPIFY <hipify:index>`,18.0.0.24232,17.0.0.24193,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.13.0,0.12.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.76.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,14.2.0,14.1.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.3.0,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3,2.0.3
|
||||
,,,
|
||||
COMPILERS,".. _compilers-support-compatibility-matrix:",,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,0.5.0,0.5.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,18.0.0.24232,17.0.0.24193,17.0.0.23483
|
||||
`llvm-project <https://github.com/ROCm/llvm-project>`_,18.0.0.24232,17.0.0.24193,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,18.0.0.24232,17.0.0.24193,17.0.0.23483
|
||||
,,,
|
||||
RUNTIMES,".. _runtime-support-compatibility-matrix:",,
|
||||
:doc:`HIP <hip:index>`,6.2.41133,6.1.40093,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCR-Runtime <rocr-runtime:index>`,1.13.0,1.13.0,1.12.0
|
||||
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#red-hat94] **For ROCm 6.1** - RHEL 9.4 is supported only on AMD Instinct MI300A.
|
||||
.. [#oracle89] **For ROCm 6.1.1** - Oracle Linux is supported only on AMD Instinct MI300X.
|
||||
.. [#] **For ROCm 6.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
|
||||
.. [#] **For ROCm 6.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9 and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
|
||||
.. [#Ubuntu220405] Preview support of Ubuntu 22.04.5 only
|
||||
.. [#red-hat94] RHEL 9.4 is supported only on AMD Instinct MI300A.
|
||||
.. [#oracle89] Oracle Linux is supported only on AMD Instinct MI300X.
|
||||
.. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_612] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_600] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
|
||||
|
||||
..
|
||||
Footnotes and ref anchors in the historical tables below should be appended with "-past-60" to differentiate them from the
footnote references in the latest compatibility matrix above. It also allows easy find & replace.
An easy way to work is to download the historical .csv file and open it in Excel. Then, when the content is ready,
delete the columns you don't need to build the current compatibility matrix used in the table above. Find & replace all
instances of "-past-60" to make it ready for the table above.
|
||||
|
||||
|
||||
.. _past-rocm-compatibility-matrix:
|
||||
|
||||
Past versions of ROCm compatibility matrix
|
||||
***************************************************
|
||||
|
||||
Expand for full historical view of:
|
||||
|
||||
.. dropdown:: ROCm 6.0 - Present
|
||||
|
||||
You can `download the entire .csv <../downloads/compatibility-matrix-historical-6.0.csv>`_ for offline reference.
|
||||
|
||||
.. csv-table::
|
||||
:file: ../data/reference/compatibility-matrix-historical-6.0.csv
|
||||
:widths: 20,10,10,10,10,10,10
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#Ubuntu220405-past-60] Preview support of Ubuntu 22.04.5 only
|
||||
.. [#red-hat94-past-60] RHEL 9.4 is supported only on AMD Instinct MI300A.
|
||||
.. [#oracle89-past-60] Oracle Linux is supported only on AMD Instinct MI300X.
|
||||
.. [#mi300_620-past-60] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
|
||||
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
|
||||
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
|
||||
|
||||
|
||||
@@ -416,7 +416,7 @@ description, refer to the corresponding library data type support page.
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- hipRAND (:doc:`details <hiprand:data-type-support>`)
|
||||
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
@@ -428,7 +428,7 @@ description, refer to the corresponding library data type support page.
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- hipCUB (:doc:`details <hipcub:data-type-support>`)
|
||||
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
@@ -474,7 +474,7 @@ description, refer to the corresponding library data type support page.
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- hipRAND (:doc:`details <hiprand:data-type-support>`)
|
||||
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
@@ -492,7 +492,7 @@ description, refer to the corresponding library data type support page.
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- hipCUB (:doc:`details <hipcub:data-type-support>`)
|
||||
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
|
||||
@@ -65,7 +65,7 @@ This example is adapted from the PyTorch research hub page on [Inception V3](htt
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. Run the PyTorch ROCm-based Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
|
||||
1. Run the PyTorch ROCm-based Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:install/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
|
||||
|
||||
```dockerfile
|
||||
docker run -it -v $HOME:/data --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G rocm/pytorch:latest
|
||||
@@ -155,7 +155,7 @@ The previous section focused on downloading and using the Inception V3 model for
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. Run the PyTorch ROCm Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
|
||||
1. Run the PyTorch ROCm Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:install/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
|
||||
|
||||
```dockerfile
|
||||
docker pull rocm/pytorch:latest
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="ROCm compilers disambiguation">
|
||||
<meta name="keywords" content="compilers, compiler naming, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# ROCm compilers disambiguation
|
||||
|
||||
ROCm ships multiple compilers of varying origins and purposes. This article
|
||||
disambiguates compiler naming used throughout the documentation.
|
||||
|
||||
## Compiler terms
|
||||
|
||||
| Term | Description |
|
||||
| - | - |
|
||||
| `amdclang++` | Clang/LLVM-based compiler that is part of `rocm-llvm` package. The source code is available at <a href="https://github.com/ROCm/llvm-project" target="_blank">https://github.com/ROCm/llvm-project</a>. |
|
||||
| AOCC | Closed-source clang-based compiler that includes additional CPU optimizations. Offered as part of ROCm via the `rocm-llvm-alt` package. For details, see <a href="https://developer.amd.com/amd-aocc/" target="_blank">https://developer.amd.com/amd-aocc/</a>. |
|
||||
| HIP-Clang | Informal term for the `amdclang++` compiler |
|
||||
| HIPIFY | Tools including `hipify-clang` and `hipify-perl`, used to automatically translate CUDA source code into portable HIP C++. The source code is available at <a href="https://github.com/ROCm/HIPIFY" target="_blank">https://github.com/ROCm/HIPIFY</a> |
|
||||
| `hipcc` | HIP compiler driver. A utility that invokes `clang` or `nvcc` depending on the target and passes the appropriate include and library options for the target compiler and HIP infrastructure. The source code is available at <a href="https://github.com/ROCm/HIPCC" target="_blank">https://github.com/ROCm/HIPCC</a>. |
|
||||
| ROCmCC | Clang/LLVM-based compiler. ROCmCC in itself is not a binary but refers to the overall compiler. |
|
||||
@@ -9,6 +9,6 @@
|
||||
|
||||
The following topics describe using specific features of the compilation tools:
|
||||
|
||||
* [Using AddressSanitizer](./using-gpu-sanitizer.md)
|
||||
* [Compiler disambiguation](./compiler-disambiguation.md)
|
||||
* [OpenMP support in ROCm](../about/compatibility/openmp.md)
|
||||
* [ROCm compiler infrastructure](https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html)
|
||||
* [Using AddressSanitizer](https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html)
|
||||
* [OpenMP support](https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/openmp.html)
|
||||
|
||||
@@ -10,7 +10,7 @@ GPU computational elements of the processor along with the lower levels of the c
|
||||
|
||||
The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.
|
||||
|
||||
```{figure} ../../data/conceptual/gpu-arch/image007.png
|
||||
```{figure} ../../data/shared/xcd-sys-arch.png
|
||||
---
|
||||
name: mi300-xcd
|
||||
align: center
|
||||
@@ -103,7 +103,7 @@ MI300 series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, w
|
||||
|
||||
## Node-level architecture
|
||||
|
||||
```{figure} ../../data/conceptual/gpu-arch/image009.png
|
||||
```{figure} ../../data/shared/mi300-node-level-arch.png
|
||||
---
|
||||
name: mi300-node
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ In HIP, pinned memory allocations are coherent by default (`hipHostMallocDefault
|
||||
There are additional pinned memory flags (e.g. `hipHostMallocMapped` and `hipHostMallocPortable`).
|
||||
On MI200 these options do not impact performance.
|
||||
<!-- TODO: link to programming_manual#memory-allocation-flags -->
|
||||
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:user_guide/programming_manual`.
|
||||
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:how-to/programming_manual`.
|
||||
:::
|
||||
|
||||
Much like how a process can be locked to a CPU core by setting affinity, a pinned memory allocator does this with the memory storage system.
|
||||
|
||||
@@ -1,427 +0,0 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="Using the LLVM ASan on a GPU">
|
||||
<meta name="keywords" content="LLVM, ASan, address sanitizer, AddressSanitizer, instrumented
|
||||
libraries, instrumented applications, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# Using the AddressSanitizer on a GPU (beta release)
|
||||
|
||||
The LLVM AddressSanitizer (ASan) provides a process that allows developers to detect runtime addressing errors in applications and libraries. The detection is achieved using a combination of compiler-added instrumentation and runtime techniques, including function interception and replacement.
|
||||
Until now, the LLVM ASan process was only available for traditional purely CPU applications. However, ROCm has extended this mechanism to additionally allow the detection of some addressing errors on the GPU in heterogeneous applications. Ideally, developers should treat heterogeneous HIP and OpenMP applications exactly like pure CPU applications. However, this simplicity has not been achieved yet.
|
||||
This document describes how to use ROCm ASan.
|
||||
|
||||
For information about LLVM ASan, see the [LLVM documentation](https://clang.llvm.org/docs/AddressSanitizer.html).
|
||||
|
||||
:::{note}
|
||||
The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
|
||||
:::
|
||||
|
||||
## Compiling for ASan
|
||||
|
||||
The ASan process begins by compiling the application of interest with the ASan instrumentation.
|
||||
|
||||
Recommendations for doing this are as follows (an illustrative compile command appears after them):
|
||||
|
||||
* Compile as many application and dependent library sources as possible using an AMD-built clang-based compiler such as `amdclang++`.
|
||||
* Add the following options to the existing compiler and linker options:
|
||||
|
||||
* `-fsanitize=address` - enables instrumentation
|
||||
|
||||
* `-shared-libsan` - use shared version of runtime
|
||||
|
||||
* `-g` - add debug info for improved reporting
|
||||
|
||||
* Explicitly use `xnack+` in the offload architecture option. For example, `--offload-arch=gfx90a:xnack+`
|
||||
|
||||
Other architectures are allowed, but their device code will not be instrumented and a warning will be emitted.
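As a concrete illustration, a single-file HIP program could be compiled with these options roughly as follows. This is only a sketch: the source file, output name, and offload architecture are placeholders, and `hipcc` is used as a convenience driver, as in the worked example at the end of this article.

```bash
# Illustrative only: build a HIP source with ASan instrumentation
# (my_app.hip, my_app, and gfx90a:xnack+ are placeholders for your project)
hipcc -g -fsanitize=address -shared-libsan \
      --offload-arch=gfx90a:xnack+ \
      my_app.hip -o my_app
```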
|
||||
|
||||
:::{tip}
|
||||
It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
|
||||
:::
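As a sketch, the note above corresponds to shell settings such as the following; `<version>` is a placeholder for the installed ROCm version.

```bash
# Illustrative environment for ASan-instrumented OpenMP builds and runs
# (replace <version> with the installed ROCm version)
export LIBRARY_PATH=/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan
export LD_LIBRARY_PATH=/opt/rocm-<version>/lib/llvm/lib/asan:$LD_LIBRARY_PATH
```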
|
||||
|
||||
### About compilation time
|
||||
|
||||
When `-fsanitize=address` is used, the LLVM compiler adds instrumentation code around every memory operation. This added code must be handled by all downstream components of the compiler toolchain and results in increased overall compilation time. This increase is especially evident in the AMDGPU device compiler and has in a few instances raised the compile time to an unacceptable level.
|
||||
|
||||
There are a few options if the compile time becomes unacceptable:
|
||||
|
||||
* Avoid instrumentation of the files which have the worst compile times. This will reduce the effectiveness of the ASan process.
|
||||
* Add the option `-fsanitize-recover=address` to the compilation of the files with the worst compile times. This option simplifies the added instrumentation, resulting in faster compilation. See below for more information.
|
||||
* Disable instrumentation on a per-function basis by adding `__attribute__((no_sanitize("address")))` to functions found to be responsible for the large compile time. Again, this will reduce the effectiveness of the process.
|
||||
|
||||
## Installing ROCm GPU ASan packages
|
||||
|
||||
For a complete ROCm GPU Sanitizer installation, including packages, instrumented HSA and HIP runtimes, tools, and math libraries, use the following command:
|
||||
|
||||
```bash
|
||||
sudo apt-get install rocm-ml-sdk-asan
|
||||
|
||||
```
|
||||
|
||||
## Using AMD-supplied ASan instrumented libraries
|
||||
|
||||
ROCm releases have optional packages that contain additional ASan instrumented builds of the ROCm libraries (usually found in `/opt/rocm-<version>/lib`). The instrumented libraries have identical names to the regular uninstrumented libraries, and are located in `/opt/rocm-<version>/lib/asan`.
|
||||
These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with `g++`. The preexisting build options are used, with the addition of the options described above: `-fsanitize=address`, `-shared-libsan`, and `-g`.
|
||||
|
||||
These libraries save developers the effort of locating repositories, identifying the correct branch, checking out the correct tags, and otherwise building the libraries from source. They also extend the ability of the process to detect addressing errors into the ROCm libraries themselves.
|
||||
|
||||
When adjusting an application build to add instrumentation, linking against these instrumented libraries is unnecessary. For example, any `-L` `/opt/rocm-<version>/lib` compiler options need not be changed. However, the instrumented libraries should be used when the application is run. It is particularly important that the instrumented language runtimes, like `libamdhip64.so` and `librocm-core.so`, are used; otherwise, device invalid access detections may not be reported.
|
||||
|
||||
## Running ASan instrumented applications
|
||||
|
||||
### Preparing to run an instrumented application
|
||||
|
||||
Here are a few recommendations to consider before running an ASan instrumented heterogeneous application; a consolidated example follows the list.
|
||||
|
||||
* Ensure the Linux kernel running on the system has Heterogeneous Memory Management (HMM) support. A kernel version of 5.6 or higher should be sufficient.
|
||||
* Ensure XNACK is enabled:
|
||||
* For `gfx90a` (MI-2X0) or `gfx940` (MI-3X0), set the environment variable `HSA_XNACK=1`.
|
||||
* For `gfx906` (MI-50) or `gfx908` (MI-100), set `HSA_XNACK=1` but also ensure the `amdgpu` kernel module is loaded with the module argument `noretry=0`.
|
||||
This requirement is due to the fact that the XNACK setting for these GPUs is system-wide.
|
||||
|
||||
* Ensure that the application will use the instrumented libraries when it runs. The output from the shell command `ldd <application name>` can be used to see which libraries will be used.
|
||||
If the instrumented libraries are not listed by `ldd`, the environment variable `LD_LIBRARY_PATH` may need to be adjusted, or in some cases an `RPATH` compiled into the application may need to be changed and the application recompiled.
|
||||
|
||||
* Ensure that the application depends on the ASan runtime. This can be checked by running the command `readelf -d <application name> | grep NEEDED` and verifying that the shared library `libclang_rt.asan-x86_64.so` appears in the output.
|
||||
If it does not appear, the application will quickly output an ASan error like the following when it is executed:
|
||||
|
||||
```bash
|
||||
==3210==ASan runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD.
|
||||
```
|
||||
|
||||
* Ensure that the `llvm-symbolizer` executable can be run and that it is located in `/opt/rocm-<version>/llvm/bin`. This executable is not strictly required, but if found it is used to translate ("symbolize") a host-side instruction address into a more useful function name, file name, and line number (assuming the application has been built to include debug information).
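Taken together, the checks above might look like the following shell session. The ROCm installation path and application name are illustrative placeholders; adjust them for your system.

```bash
# Illustrative pre-run checklist (paths and names are placeholders)
export HSA_XNACK=1                                                 # enable XNACK (gfx90a / gfx940)
export LD_LIBRARY_PATH=/opt/rocm-6.1.0/lib/asan:$LD_LIBRARY_PATH   # prefer instrumented ROCm libraries
export PATH=/opt/rocm-6.1.0/llvm/bin:$PATH                         # make llvm-symbolizer findable

ldd ./my_app | grep -E 'asan|amdhip64'            # instrumented libraries resolved?
readelf -d ./my_app | grep NEEDED | grep asan     # ASan runtime is a direct dependency?
which llvm-symbolizer                             # symbolizer available on PATH?
```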
|
||||
|
||||
There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASan runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. Note that these options only affect the host ASan runtime. The device runtime currently supports only the default settings for the few relevant options.
|
||||
|
||||
There are three `ASAN_OPTIONS` flags of note.
|
||||
|
||||
* `halt_on_error=0/1 default 1`.
|
||||
|
||||
This tells the ASan runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the optional ROCm ASan instrumented libraries are not compiled with this option; if an error is detected within one of them while `halt_on_error` is set to 0, more undefined behavior will occur.
|
||||
|
||||
* `detect_leaks=0/1 default 1`.
|
||||
|
||||
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSan). For heterogeneous applications, this default results in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSan suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
|
||||
|
||||
* `quarantine_size_mb=N default 256`
|
||||
|
||||
This option defines the number of megabytes (MB) `N` of memory that the ASan runtime will hold after it is `freed` to detect use-after-free situations. This memory is unavailable for other purposes. The default of 256 MB may be too small to detect some use-after-free situations, especially given that the large size of many GPU memory allocations may push `freed` allocations out of quarantine before the attempted use.
|
||||
|
||||
:::{note}
|
||||
Setting the value of `quarantine_size_mb` larger may enable more problematic uses to be detected, but at the cost of reducing memory available for other purposes.
|
||||
:::
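For example, the following hypothetical settings keep the application running after the first report, disable the leak sanitizer, and enlarge the quarantine. The values are purely illustrative, and recall that continuing after an error also requires compiling with `-fsanitize-recover=address`.

```bash
# Illustrative ASAN_OPTIONS / LSAN_OPTIONS usage (values are placeholders)
export ASAN_OPTIONS="halt_on_error=0:detect_leaks=0:quarantine_size_mb=1024"
# Alternatively, keep leak detection but suppress known non-leaks with a suppression file:
# export LSAN_OPTIONS="suppressions=/path/to/suppression/file:print_suppressions=0"
./my_app
```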
|
||||
|
||||
## Runtime overhead
|
||||
|
||||
Running an ASan instrumented application incurs
|
||||
overheads which may result in unacceptably long runtimes
|
||||
or failure to run at all.
|
||||
|
||||
### Higher execution time
|
||||
|
||||
ASan detection works by checking each address at runtime
|
||||
before the address is actually accessed by a load, store, or atomic
|
||||
instruction.
|
||||
This checking involves an additional load to "shadow" memory which
|
||||
records whether the address is "poisoned" or not, and additional logic
|
||||
that decides whether or not to produce a detection report.
|
||||
|
||||
This extra runtime work can cause the application to slow down by
|
||||
a factor of three or more, depending on how many memory accesses are
|
||||
executed.
|
||||
For heterogeneous applications, the shadow memory must be accessible by all devices
|
||||
and this can mean that shadow accesses from some devices may be more costly
|
||||
than non-shadow accesses.
|
||||
|
||||
### Higher memory use
|
||||
|
||||
The address checking described above relies on the compiler to surround
|
||||
each program variable with a red zone, and on the ASan
|
||||
runtime to surround each runtime memory allocation with a red zone and
|
||||
fill the shadow corresponding to each red zone with poison.
|
||||
The added memory for the red zones is additional overhead on top
|
||||
of the 13% overhead for the shadow memory itself.
|
||||
|
||||
Applications which consume most of one or more available memory pools when
|
||||
run normally are likely to encounter allocation failures when run with
|
||||
instrumentation.
|
||||
|
||||
## Runtime reporting
|
||||
|
||||
It is not the intention of this document to provide a detailed explanation of all the types of reports that can be output by the ASan runtime. Instead, the focus is on the differences between the standard reports for CPU issues, and reports for GPU issues.
|
||||
|
||||
An invalid address detection report for the CPU always starts with
|
||||
|
||||
```bash
|
||||
==<PID>==ERROR: AddressSanitizer: <problem type> on address <memory address> at pc <pc> bp <bp> sp <sp> <access> of size <N> at <memory address> thread T0
|
||||
```
|
||||
|
||||
and continues with a stack trace for the access, a stack trace for the allocation and deallocation, if relevant, and a dump of the shadow near the `<memory address>`.
|
||||
|
||||
In contrast, an invalid address detection report for the GPU always starts with
|
||||
|
||||
```bash
|
||||
==<PID>==ERROR: AddressSanitizer: <problem type> on amdgpu device <device> at pc <pc> <access> of size <n> in workgroup id (<X>,<Y>,<Z>)
|
||||
```
|
||||
|
||||
Above, `<device>` is the integer device ID, and `(<X>, <Y>, <Z>)` is the ID of the workgroup or block where the invalid address was detected.
|
||||
|
||||
While the CPU report includes a call stack for the thread attempting the invalid access, the GPU report is currently limited to a call stack of size one, that is, the (symbolized) location of the invalid access, for example:
|
||||
|
||||
```bash
|
||||
#0 <pc> in <function signature> at /path/to/file.hip:<line>:<column>
|
||||
```
|
||||
|
||||
This short call stack is followed by a GPU-specific section that looks like
|
||||
|
||||
```bash
|
||||
Thread ids and accessed addresses:
|
||||
<lid0> <maddr 0> : <lid1> <maddr1> : ...
|
||||
```
|
||||
|
||||
where each `<lid j> <maddr j>` indicates the lane ID and the invalid memory address held by lane `j` of the wavefront attempting the invalid access.
|
||||
|
||||
Additionally, reports for invalid GPU accesses to memory allocated by GPU code via `malloc` or `new`, which start with, for example,
|
||||
|
||||
```bash
|
||||
==1234==ERROR: AddressSanitizer: heap-buffer-overflow on amdgpu device 0 at pc 0x7fa9f5c92dcc
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
==5678==ERROR: AddressSanitizer: heap-use-after-free on amdgpu device 3 at pc 0x7f4c10062d74
|
||||
```
|
||||
|
||||
may currently include one or two surprising CPU-side tracebacks mentioning "`hostcall`". This is due to how `malloc` and `free` are implemented for GPU code, and these call stacks can be ignored.
|
||||
|
||||
## Running ASan with `rocgdb`
|
||||
|
||||
`rocgdb` can be used to further investigate ASan detected errors, with some preparation.
|
||||
|
||||
Currently, the ASan runtime complains when starting `rocgdb` without preparation.
|
||||
|
||||
```bash
|
||||
$ rocgdb my_app
|
||||
==1122==ASan runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD.
|
||||
```
|
||||
|
||||
This is solved by setting the environment variable `LD_PRELOAD` to the path of the ASan runtime, which can be obtained using the following command:
|
||||
|
||||
```bash
|
||||
amdclang++ -print-file-name=libclang_rt.asan-x86_64.so
|
||||
```
|
||||
|
||||
You should also set the environment variable `HIP_ENABLE_DEFERRED_LOADING=0` before debugging HIP applications.
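Putting these two settings together, a debugging session might be prepared as sketched below; the application name is a placeholder.

```bash
# Illustrative: preload the ASan runtime and disable deferred loading before debugging
export LD_PRELOAD=$(amdclang++ -print-file-name=libclang_rt.asan-x86_64.so)
export HIP_ENABLE_DEFERRED_LOADING=0
rocgdb ./my_app
```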
|
||||
|
||||
After starting `rocgdb`, breakpoints can be set on the ASan runtime error-reporting entry points of interest. For example, if an ASan error report includes
|
||||
|
||||
```bash
|
||||
WRITE of size 4 in workgroup id (10,0,0)
|
||||
```
|
||||
|
||||
the `rocgdb` command needed to stop the program before the report is printed is
|
||||
|
||||
```bash
|
||||
(gdb) break __asan_report_store4
|
||||
```
|
||||
|
||||
Similarly, the appropriate command for a report including
|
||||
|
||||
```bash
|
||||
READ of size <N> in workgroup ID (1,2,3)
|
||||
```
|
||||
|
||||
is
|
||||
|
||||
```bash
|
||||
(gdb) break __asan_report_load<N>
|
||||
```
|
||||
|
||||
It is possible to set breakpoints on all ASan report functions using these commands:
|
||||
|
||||
```bash
|
||||
$ rocgdb <path to application>
|
||||
(gdb) start <command line arguments>
|
||||
(gdb) rbreak ^__asan_report
|
||||
(gdb) c
|
||||
```
|
||||
|
||||
## Using ASan with a short HIP application
|
||||
|
||||
Consider the following simple and short demo of using the Address Sanitizer with a HIP application:
|
||||
|
||||
```C++
|
||||
|
||||
#include <cstdlib>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
__global__ void
|
||||
set1(int *p)
|
||||
{
|
||||
int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
p[i] = 1;
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int m = std::atoi(argv[1]);
|
||||
int n1 = std::atoi(argv[2]);
|
||||
int n2 = std::atoi(argv[3]);
|
||||
int c = std::atoi(argv[4]);
|
||||
int *dp;
|
||||
hipMalloc(&dp, m*sizeof(int));
|
||||
hipLaunchKernelGGL(set1, dim3(n1), dim3(n2), 0, 0, dp);
|
||||
int *hp = (int*)malloc(c * sizeof(int));
|
||||
hipMemcpy(hp, dp, m*sizeof(int), hipMemcpyDeviceToHost);
|
||||
hipDeviceSynchronize();
|
||||
hipFree(dp);
|
||||
free(hp);
|
||||
std::puts("Done.");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
This application will attempt to access invalid addresses for certain command line arguments. In particular, if `m < n1 * n2` some device threads will attempt to access
|
||||
unallocated device memory.
|
||||
|
||||
Or, if `c < m`, the `hipMemcpy` function will copy past the end of the `malloc` allocated memory.
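For instance, assuming the program is built as `./mini` (as in the compile command shown below), the two failure modes can be triggered like this:

```bash
# Illustrative invocations of the demo: ./mini <m> <n1> <n2> <c>
./mini 100 11 10 100   # m < n1*n2: extra device threads write past the hipMalloc'd buffer
./mini 100 10 10 99    # c < m: hipMemcpy copies past the end of the malloc'd host buffer
```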
|
||||
|
||||
**Note**: The `hipcc` compiler is used here for simplicity.
|
||||
|
||||
Compiling without XNACK results in a warning.
|
||||
|
||||
```bash
|
||||
$ hipcc -g --offload-arch=gfx90a:xnack- -fsanitize=address -shared-libsan mini.hip -o mini
|
||||
clang++: warning: ignoring '-fsanitize=address' option for offload arch 'gfx90a:xnack-', as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored]
|
||||
```
|
||||
|
||||
The binary compiled above will run, but the GPU code will not be instrumented and the `m < n1 * n2` error will not be detected. Switching to `--offload-arch=gfx90a:xnack+` in the command above results in a warning-free compilation and an instrumented application. After setting `PATH`, `LD_LIBRARY_PATH` and `HSA_XNACK` as described earlier, a check of the binary with `ldd` yields the following,
|
||||
|
||||
```bash
|
||||
$ ldd mini
|
||||
linux-vdso.so.1 (0x00007ffd1a5ae000)
|
||||
libclang_rt.asan-x86_64.so => /opt/rocm-6.1.0-99999/llvm/lib/clang/17.0.0/lib/linux/libclang_rt.asan-x86_64.so (0x00007fb9c14b6000)
|
||||
libamdhip64.so.5 => /opt/rocm-6.1.0-99999/lib/asan/libamdhip64.so.5 (0x00007fb9bedd3000)
|
||||
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fb9beba8000)
|
||||
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fb9bea59000)
|
||||
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fb9bea3e000)
|
||||
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fb9be84a000)
|
||||
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fb9be844000)
|
||||
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fb9be821000)
|
||||
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fb9be817000)
|
||||
libamd_comgr.so.2 => /opt/rocm-6.1.0-99999/lib/asan/libamd_comgr.so.2 (0x00007fb9b4382000)
|
||||
libhsa-runtime64.so.1 => /opt/rocm-6.1.0-99999/lib/asan/libhsa-runtime64.so.1 (0x00007fb9b3b00000)
|
||||
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007fb9b3af3000)
|
||||
/lib64/ld-linux-x86-64.so.2 (0x00007fb9c2027000)
|
||||
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fb9b3ad7000)
|
||||
libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fb9b3aa7000)
|
||||
libelf.so.1 => /lib/x86_64-linux-gnu/libelf.so.1 (0x00007fb9b3a89000)
|
||||
libdrm.so.2 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm.so.2 (0x00007fb9b3a70000)
|
||||
libdrm_amdgpu.so.1 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1 (0x00007fb9b3a62000)
|
||||
|
||||
```
|
||||
|
||||
This confirms that the AddressSanitizer runtime is linked in and that the ASan instrumented versions of the runtime libraries are used.
|
||||
Checking the `PATH` yields
|
||||
|
||||
```bash
|
||||
$ which llvm-symbolizer
|
||||
/opt/rocm-6.1.0-99999/llvm/bin/llvm-symbolizer
|
||||
```
|
||||
|
||||
Lastly, a check of the OS kernel version yields
|
||||
|
||||
```bash
|
||||
$ uname -rv
|
||||
5.15.0-73-generic #80~20.04.1-Ubuntu SMP Wed May 17 14:58:14 UTC 2023
|
||||
```
|
||||
|
||||
which indicates that the required HMM support (kernel version 5.6 or later) is available. This completes the necessary setup. Running with `m = 100`, `n1 = 11`, `n2 = 10` and `c = 100` should produce
|
||||
a report for an invalid access by the last 10 threads.
|
||||
|
||||
```bash
|
||||
=================================================================
|
||||
==3141==ERROR: AddressSanitizer: heap-buffer-overflow on amdgpu device 0 at pc 0x7fb1410d2cc4
|
||||
WRITE of size 4 in workgroup id (10,0,0)
|
||||
#0 0x7fb1410d2cc4 in set1(int*) at /home/dave/mini/mini.cpp:0:10
|
||||
|
||||
Thread ids and accessed addresses:
|
||||
00 : 0x7fb14371d190 01 : 0x7fb14371d194 02 : 0x7fb14371d198 03 : 0x7fb14371d19c 04 : 0x7fb14371d1a0 05 : 0x7fb14371d1a4 06 : 0x7fb14371d1a8 07 : 0x7fb14371d1ac
|
||||
08 : 0x7fb14371d1b0 09 : 0x7fb14371d1b4
|
||||
|
||||
0x7fb14371d190 is located 0 bytes after 400-byte region [0x7fb14371d000,0x7fb14371d190)
|
||||
allocated by thread T0 here:
|
||||
#0 0x7fb151c76828 in hsa_amd_memory_pool_allocate /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:692:3
|
||||
#1 ...
|
||||
|
||||
#12 0x7fb14fb99ec4 in hipMalloc /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:568:3
|
||||
#13 0x226630 in hipError_t hipMalloc<int>(int**, unsigned long) /opt/rocm-6.1.0-99999/include/hip/hip_runtime_api.h:8367:12
|
||||
#14 0x226630 in main /home/dave/mini/mini.cpp:19:5
|
||||
#15 0x7fb14ef02082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
|
||||
|
||||
Shadow bytes around the buggy address:
|
||||
0x7fb14371cf00: ...
|
||||
|
||||
=>0x7fb14371d180: 00 00[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa
|
||||
0x7fb14371d200: ...
|
||||
|
||||
Shadow byte legend (one shadow byte represents 8 application bytes):
|
||||
Addressable: 00
|
||||
Partially addressable: 01 02 03 04 05 06 07
|
||||
Heap left redzone: fa
|
||||
...
|
||||
==3141==ABORTING
|
||||
```
|
||||
|
||||
Running with `m = 100`, `n1 = 10`, `n2 = 10` and `c = 99` should produce a report for an invalid copy.
|
||||
|
||||
```shell
|
||||
=================================================================
|
||||
==2817==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x514000150dcc at pc 0x7f5509551aca bp 0x7ffc90a7ae50 sp 0x7ffc90a7a610
|
||||
WRITE of size 400 at 0x514000150dcc thread T0
|
||||
#0 0x7f5509551ac9 in __asan_memcpy /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3
|
||||
#1 ...
|
||||
|
||||
#9 0x7f5507462a28 in hipMemcpy_common(void*, void const*, unsigned long, hipMemcpyKind, ihipStream_t*) /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:637:10
|
||||
#10 0x7f5507464205 in hipMemcpy /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:642:3
|
||||
#11 0x226844 in main /home/dave/mini/mini.cpp:22:5
|
||||
#12 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
|
||||
#13 0x22605d in _start (/home/dave/mini/mini+0x22605d)
|
||||
|
||||
0x514000150dcc is located 0 bytes after 396-byte region [0x514000150c40,0x514000150dcc)
|
||||
allocated by thread T0 here:
|
||||
#0 0x7f5509553dcf in malloc /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
|
||||
#1 0x226817 in main /home/dave/mini/mini.cpp:21:21
|
||||
#2 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
|
||||
|
||||
SUMMARY: AddressSanitizer: heap-buffer-overflow /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3 in __asan_memcpy
|
||||
Shadow bytes around the buggy address:
|
||||
0x514000150b00: ...
|
||||
|
||||
=>0x514000150d80: 00 00 00 00 00 00 00 00 00[04]fa fa fa fa fa fa
|
||||
0x514000150e00: ...
|
||||
|
||||
Shadow byte legend (one shadow byte represents 8 application bytes):
|
||||
Addressable: 00
|
||||
Partially addressable: 01 02 03 04 05 06 07
|
||||
Heap left redzone: fa
|
||||
...
|
||||
==2817==ABORTING
|
||||
```
|
||||
|
||||
## Known issues with using GPU sanitizer
|
||||
|
||||
* Red zones must have limited size. It is possible for an invalid access to completely miss a red zone and not be detected.
|
||||
|
||||
* Lack of detection or false reports can be caused by the runtime not properly maintaining red zone shadows.
|
||||
|
||||
* Lack of detection on the GPU might also be due to the implementation not instrumenting accesses to all GPU specific address spaces. For example, in the current implementation accesses to "private" or "stack" variables on the GPU are not instrumented, and accesses to HIP shared variables (also known as "local data store" or "LDS") are also not instrumented.
|
||||
|
||||
* It can also be the case that a memory fault is hit for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
|
||||
110
docs/conf.py
@@ -4,26 +4,13 @@
|
||||
# list see the documentation:
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||
|
||||
import shutil
|
||||
import jinja2
|
||||
import os
|
||||
import shutil
|
||||
|
||||
# Environment to process Jinja templates.
|
||||
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
|
||||
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
|
||||
|
||||
# Jinja templates to render out.
|
||||
templates = []
|
||||
|
||||
# Render templates and output files without the last extension.
|
||||
# For example: 'install.md.jinja' becomes 'install.md'.
|
||||
for template in templates:
|
||||
rendered = jinja_env.get_template(template).render()
|
||||
with open(os.path.splitext(template)[0], 'w') as file:
|
||||
file.write(rendered)
|
||||
|
||||
shutil.copy2('../RELEASE.md','./about/release-notes.md')
|
||||
# Keep capitalization due to similar linking on GitHub's markdown preview.
|
||||
shutil.copy2('../CHANGELOG.md','./about/changelog.md')
|
||||
os.system("mkdir -p ../_readthedocs/html/downloads")
|
||||
os.system("cp data/reference/compatibility-matrix-historical-6.0.csv ../_readthedocs/html/downloads/compatibility-matrix-historical-6.0.csv")
|
||||
|
||||
latex_engine = "xelatex"
|
||||
latex_elements = {
|
||||
@@ -34,55 +21,100 @@ latex_elements = {
|
||||
"""
|
||||
}
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "rocm.docs.amd.com")
|
||||
html_context = {}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
# configurations for PDF output by Read the Docs
|
||||
project = "ROCm Documentation"
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "6.1.2"
|
||||
release = "6.1.2"
|
||||
version = "6.2.0"
|
||||
release = "6.2.0"
|
||||
setting_all_article_info = True
|
||||
all_article_info_os = ["linux", "windows"]
|
||||
all_article_info_author = ""
|
||||
|
||||
# pages with specific settings
|
||||
article_pages = [
|
||||
{"file": "about/release-notes", "os": ["linux", "windows"], "date": "2024-08-02"},
|
||||
{"file": "about/changelog", "os": ["linux", "windows"], "date": "2024-08-02"},
|
||||
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/train-a-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/deploy-your-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/hugging-face-models", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-hpc/index", "os": ["linux"]},
|
||||
{"file": "how-to/llm-fine-tuning-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/llm-fine-tuning-optimization/overview", "os": ["linux"]},
|
||||
{
|
||||
"file":"about/release-notes",
|
||||
"os":["linux", "windows"],
|
||||
"date":"2024-06-04"
|
||||
"file": "how-to/llm-fine-tuning-optimization/fine-tuning-and-inference",
|
||||
"os": ["linux"],
|
||||
},
|
||||
{
|
||||
"file":"about/changelog",
|
||||
"os":["linux", "windows"],
|
||||
"date":"2024-06-04"
|
||||
"file": "how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference",
|
||||
"os": ["linux"],
|
||||
},
|
||||
|
||||
{"file":"how-to/deep-learning-rocm", "os":["linux"]},
|
||||
{"file":"how-to/gpu-enabled-mpi", "os":["linux"]},
|
||||
{"file":"how-to/system-debugging", "os":["linux"]},
|
||||
{"file":"how-to/tuning-guides", "os":["linux", "windows"]},
|
||||
{
|
||||
"file": "how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference",
|
||||
"os": ["linux"],
|
||||
},
|
||||
{
|
||||
"file": "how-to/llm-fine-tuning-optimization/llm-inference-frameworks",
|
||||
"os": ["linux"],
|
||||
},
|
||||
{
|
||||
"file": "how-to/llm-fine-tuning-optimization/model-acceleration-libraries",
|
||||
"os": ["linux"],
|
||||
},
|
||||
{"file": "how-to/llm-fine-tuning-optimization/model-quantization", "os": ["linux"]},
|
||||
{
|
||||
"file": "how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel",
|
||||
"os": ["linux"],
|
||||
},
|
||||
{
|
||||
"file": "how-to/llm-fine-tuning-optimization/optimizing-triton-kernel",
|
||||
"os": ["linux"],
|
||||
},
|
||||
{
|
||||
"file": "how-to/llm-fine-tuning-optimization/profiling-and-debugging",
|
||||
"os": ["linux"],
|
||||
},
|
||||
{"file": "how-to/system-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
|
||||
{"file": "how-to/system-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
|
||||
]
|
||||
|
||||
exclude_patterns = ['temp']
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects"]
|
||||
|
||||
external_projects_current_project = "rocm"
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "rocm-stg.amd.com")
|
||||
html_context = {}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
html_theme = "rocm_docs_theme"
|
||||
html_theme_options = {"flavor": "rocm-docs-home"}
|
||||
|
||||
html_static_path = ["sphinx/static/css"]
|
||||
html_css_files = ["rocm_custom.css"]
|
||||
html_css_files = ["rocm_custom.css", "rocm_rn.css"]
|
||||
|
||||
html_title = "ROCm Documentation"
|
||||
|
||||
html_theme_options = {
|
||||
"link_main_doc": False
|
||||
}
|
||||
html_theme_options = {"link_main_doc": False}
|
||||
|
||||
redirects = {
|
||||
"reference/openmp/openmp": "../../about/compatibility/openmp.html"
|
||||
}
|
||||
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
|
||||
|
||||
numfig = False
|
||||
|
||||
@@ -56,6 +56,10 @@ To make edits to our documentation via PR, follow these steps:
|
||||
6. Change directory into the `./docs` folder and make any documentation changes locally using your preferred code editor. Follow the guidelines listed on the
|
||||
[documentation structure](./doc-structure.md) page.
|
||||
|
||||
```{note}
|
||||
Spell checking is performed for pull requests by {doc}`ROCm Docs Core<rocm-docs-core:index>`. To ensure your PR passes spell checking, you might need to add new words or acronyms to the `.wordlist.txt` file as described in {doc}`Spell Check<rocm-docs-core:user_guide/spellcheck>`.
|
||||
```
|
||||
|
||||
7. Optionally run a local test build of the documentation to ensure the content builds and looks as expected. In your terminal, run the following commands from within the `./docs` folder of your cloned repository:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -12,8 +12,7 @@ There are four standard ways to provide feedback on this repository.
|
||||
|
||||
All contributions to ROCm documentation should arrive via the
|
||||
[GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)
|
||||
targeting the develop branch of the repository. If you are unable to contribute
|
||||
via the GitHub Flow, feel free to email us at [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
|
||||
targeting the develop branch of the repository.
|
||||
|
||||
For more in-depth information on creating a pull request (PR), see
|
||||
[Contributing](./contributing.md).
|
||||
@@ -30,7 +29,3 @@ and follow along on via public announcements.
|
||||
|
||||
Issues on existing or absent documentation can be filed in
|
||||
[GitHub Issues](https://github.com/ROCm/ROCm/issues).
|
||||
|
||||
## Email
|
||||
|
||||
Send other feedback or questions to [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
|
||||
|
||||
|
Before Width: | Height: | Size: 95 KiB |
BIN
docs/data/how-to/framework_install_2024_07_04.png
Normal file
|
After Width: | Height: | Size: 98 KiB |
|
After Width: | Height: | Size: 28 KiB |
|
After Width: | Height: | Size: 54 KiB |
BIN
docs/data/how-to/tuning-guides/mi300a-rocm-smi-output.png
Normal file
|
After Width: | Height: | Size: 28 KiB |
BIN
docs/data/how-to/tuning-guides/mi300a-rocm-smi-showhw-output.png
Normal file
|
After Width: | Height: | Size: 103 KiB |
|
After Width: | Height: | Size: 113 KiB |
|
Before Width: | Height: | Size: 153 KiB After Width: | Height: | Size: 153 KiB |
|
Before Width: | Height: | Size: 219 KiB After Width: | Height: | Size: 219 KiB |
|
Before Width: | Height: | Size: 80 KiB After Width: | Height: | Size: 80 KiB |
|
Before Width: | Height: | Size: 73 KiB After Width: | Height: | Size: 73 KiB |
BIN
docs/data/how-to/tuning-guides/rbt-bidirectional-bandwidth.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
BIN
docs/data/how-to/tuning-guides/rbt-inter-device-access.png
Normal file
|
After Width: | Height: | Size: 31 KiB |
|
After Width: | Height: | Size: 53 KiB |
BIN
docs/data/how-to/tuning-guides/rbt-unidirectional-bandwidth.png
Normal file
|
After Width: | Height: | Size: 92 KiB |
BIN
docs/data/how-to/tuning-guides/rocm-bandwidth-test.png
Normal file
|
After Width: | Height: | Size: 8.0 KiB |
BIN
docs/data/how-to/tuning-guides/rocm-smi-showhw.png
Normal file
|
After Width: | Height: | Size: 124 KiB |
BIN
docs/data/how-to/tuning-guides/rocm-smi-showtopo.png
Normal file
|
After Width: | Height: | Size: 244 KiB |
BIN
docs/data/how-to/tuning-guides/rocminfo.png
Normal file
|
After Width: | Height: | Size: 30 KiB |
BIN
docs/data/how-to/tuning-guides/tensilelite-config-yaml.png
Normal file
|
After Width: | Height: | Size: 310 KiB |
BIN
docs/data/how-to/tuning-guides/tensilelite-tuning-flow.png
Normal file
|
After Width: | Height: | Size: 342 KiB |
BIN
docs/data/reference/banner-compilers.jpg
Normal file
|
After Width: | Height: | Size: 61 KiB |
BIN
docs/data/reference/banner-runtimes.jpg
Normal file
|
After Width: | Height: | Size: 73 KiB |
111
docs/data/reference/compatibility-matrix-historical-6.0.csv
Normal file
@@ -0,0 +1,111 @@
|
||||
ROCm Version,6.2.0, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`,Ubuntu 24.04,,,,,
|
||||
,"Ubuntu 22.04.5 [#Ubuntu220405-past-60]_, 22.04.4","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
|
||||
,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 9.4, 9.3","RHEL 9.4 [#red-hat94-past-60]_, 9.3, 9.2","RHEL 9.4 [#red-hat94-past-60]_, 9.3, 9.2","RHEL 9.4 [#red-hat94-past-60]_, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,"RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,"SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,Oracle Linux 8.9 [#oracle89-past-60]_,Oracle Linux 8.9 [#oracle89-past-60]_,Oracle Linux 8.9 [#oracle89-past-60]_,,,
|
||||
,".. _architecture-support-compatibility-matrix-past-60:",,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,".. _gpu-support-compatibility-matrix-past-60:",,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
|
||||
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,
|
||||
FRAMEWORK SUPPORT,".. _framework-support-compatibility-matrix-past-60:",,,,,
|
||||
:doc:`PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`,"2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`,"2.16.1, 2.15.1, 2.14.1","2.15, 2.14, 2.13","2.15, 2.14, 2.13","2.15, 2.14, 2.13","2.14, 2.13, 2.12","2.14, 2.13, 2.12"
|
||||
:doc:`JAX <rocm-install-on-linux:how-to/3rd-party/jax-install>`,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,
|
||||
THIRD PARTY COMMS,".. _thirdpartycomms-support-compatibility-matrix-past-60:",,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.2.0,>=1.2.0,>=1.2.0,>=1.2.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,
|
||||
THIRD PARTY ALGORITHM,".. _thirdpartyalgorithm-support-compatibility-matrix-past-60:",,,,,
|
||||
Thrust,2.2.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.2.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,
|
||||
ML & COMPUTER VISION,".. _mllibs-support-compatibility-matrix-past-60:",,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.10.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.2.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocDecode <rocdecode:index>`,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,1.8.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.1.0,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,
|
||||
COMMUNICATION,".. _commlibs-support-compatibility-matrix-past-60:",,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.20.5,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
,,,,,,
|
||||
MATH LIBS,".. _mathlibs-support-compatibility-matrix-past-60:",,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,2.2.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,0.8.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,2.2.0,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,3.1.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.1,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,3.2.0,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,4.2.0,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.28,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,3.1.0,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.26.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,3.2.0,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,1.5.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
`Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,
|
||||
PRIMITIVES,".. _primitivelibs-support-compatibility-matrix-past-60:",,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,3.2.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,1.3.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,3.2.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,
|
||||
SUPPORT LIBS,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,6.2.41133,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.2.0,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,
|
||||
SYSTEM MGMT TOOLS,".. _tools-support-compatibility-matrix-past-60:",,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,24.6.2,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.0.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.3.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.2.0,rocm-6.1.2,rocm-6.1.1,rocm-6.1.0,rocm-6.0.2,rocm-6.0.0
|
||||
,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,
|
||||
:doc:`Omniperf <omniperf:index>`,2.0.1,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Omnitrace <omnitrace:index>`,1.11.2,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60200,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.4.0,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.60200,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,18.0.0.24232,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.13.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,14.2.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,
|
||||
COMPILERS,".. _compilers-support-compatibility-matrix-past-60:",,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,18.0.0.24232,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`llvm-project <https://github.com/ROCm/llvm-project>`_,18.0.0.24232,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,18.0.0.24232,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,
|
||||
RUNTIMES,".. _runtime-support-compatibility-matrix-past-60:",,,,,
|
||||
:doc:`HIP <hip:index>`,6.2.41133,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCR-Runtime <rocr-runtime:index>`,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
|
|
Before Width: | Height: | Size: 250 KiB |
BIN
docs/data/rocm-software-stack-6_2_0.jpg
Normal file
|
After Width: | Height: | Size: 288 KiB |
|
Before Width: | Height: | Size: 45 KiB After Width: | Height: | Size: 45 KiB |
|
Before Width: | Height: | Size: 83 KiB After Width: | Height: | Size: 83 KiB |
|
Before Width: | Height: | Size: 288 KiB After Width: | Height: | Size: 288 KiB |
BIN
docs/data/shared/xcd-sys-arch.png
Normal file
|
After Width: | Height: | Size: 200 KiB |
BIN
docs/data/unused-images/banner-optimization.jpg
Normal file
|
After Width: | Height: | Size: 59 KiB |
23
docs/how-to/build-rocm.rst
Normal file
@@ -0,0 +1,23 @@
|
||||
.. meta::
|
||||
:description: Build ROCm from source
|
||||
:keywords: build ROCm, source, ROCm source, ROCm, repo, make, makefile
|
||||
|
||||
|
||||
.. _building-rocm:
|
||||
|
||||
*************************************************************
|
||||
Build ROCm from source
|
||||
*************************************************************
|
||||
|
||||
ROCm is an open-source software stack that you can build from source code. The source code is available from `<https://github.com/ROCm/ROCm>`__.
|
||||
|
||||
|
||||
The general steps to build ROCm are as follows (a minimal sketch of the first step appears at the end of this section):
|
||||
|
||||
#. Clone the ROCm source code
|
||||
#. Prepare the build environment
|
||||
#. Run the build command
|
||||
|
||||
Because the ROCm stack is constantly evolving, the most current instructions are stored with the source code in GitHub.
|
||||
For detailed build instructions, see `Build ROCm from source <https://github.com/ROCm/ROCm?tab=readme-ov-file#build-rocm-from-source>`_.
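As a minimal sketch of the first step, under the assumption that the repository above is the starting point, the source can be obtained as follows; consult the linked GitHub instructions for preparing the environment and running the actual build.

.. code-block:: shell

   # Illustrative only: obtain the ROCm source; see the GitHub README for the full build steps
   git clone https://github.com/ROCm/ROCm.git
   cd ROCm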
|
||||
|
||||
@@ -13,13 +13,13 @@ frameworks to ensure that framework-specific optimizations take advantage of AMD
|
||||
|
||||
The following guides cover installation processes for ROCm-aware deep learning frameworks.
|
||||
|
||||
* :doc:`PyTorch for ROCm <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
|
||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
|
||||
* :doc:`JAX for ROCm <rocm-install-on-linux:how-to/3rd-party/jax-install>`
|
||||
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
|
||||
The following chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
|
||||
|
||||
.. image:: ../data/how-to/framework_install_2024_05_23.png
|
||||
.. image:: ../data/how-to/framework_install_2024_07_04.png
|
||||
:alt: Flowchart for installing ROCm-aware machine learning frameworks
|
||||
:align: center
|
||||
|
||||
|
||||
@@ -135,6 +135,8 @@ Installing vLLM
|
||||
|
||||
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
|
||||
|
||||
Refer to :ref:`mi300x-vllm-optimization` for performance optimization tips.
|
||||
|
||||
.. _fine-tuning-llms-tgi:
|
||||
|
||||
Hugging Face TGI
|
||||
|
||||
@@ -8,6 +8,8 @@ Model acceleration libraries
|
||||
|
||||
This section discusses model acceleration techniques and libraries to improve memory efficiency and performance.
|
||||
|
||||
.. _acceleration-flash-attention:
|
||||
|
||||
Flash Attention 2
|
||||
=================
|
||||
|
||||
@@ -248,4 +250,4 @@ page describes the options.
|
||||
:align: center
|
||||
|
||||
Learn more about optimizing kernels with TunableOp in
|
||||
:ref:`Optimizing Triton kernels <fine-tuning-llms-triton-tunableop>`.
|
||||
:ref:`Optimizing Triton kernels <mi300x-tunableop>`.
|
||||
|
||||
@@ -161,6 +161,7 @@ kernels by configuring the ``exllama_config`` parameter as the following.
|
||||
base_model_name,
|
||||
device_map="auto",
|
||||
quantization_config=gptq_config)
|
||||
|
||||
bitsandbytes
|
||||
============
|
||||
|
||||
|
||||
@@ -37,14 +37,14 @@ Setting up the base implementation environment
|
||||
----------------------------------------------
|
||||
|
||||
#. Install PyTorch for ROCm. Refer to the
|
||||
:doc:`PyTorch installation guide <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`. For consistent
|
||||
:doc:`PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`. For consistent
|
||||
installation, it’s recommended to use official ROCm prebuilt Docker images with the framework pre-installed.
|
||||
|
||||
#. In the Docker container, check the availability of ROCM-capable accelerators using the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi -showproductname
|
||||
rocm-smi --showproductname
|
||||
|
||||
#. Check that your accelerators are available to PyTorch.
|
||||
|
||||
@@ -95,7 +95,7 @@ Now, it's important to adjust how you load the model. Add the ``device_map`` par
|
||||
# Load base model to GPU memory
|
||||
base_model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model_name,
|
||||
device_map = "auto"
|
||||
device_map = "auto",
|
||||
trust_remote_code = True)
|
||||
...
|
||||
# Run training
|
||||
|
||||
@@ -6,378 +6,24 @@
|
||||
Optimizing Triton kernels
|
||||
*************************
|
||||
|
||||
This section introduces the general steps for `Triton <https://openai.com/index/triton/>`_ kernel optimization. Broadly,
|
||||
Triton kernel optimization is similar to HIP and CUDA kernel optimization.
|
||||
This section introduces the general steps for
|
||||
`Triton <https://openai.com/index/triton/>`_ kernel optimization. Broadly,
|
||||
Triton kernel optimization is similar to :doc:`HIP <hip:how-to/performance_guidelines>`
|
||||
and CUDA kernel optimization.
|
||||
|
||||
.. _fine-tuning-llms-triton-memory-access-efficiency:
|
||||
Refer to the
|
||||
:ref:`Triton kernel performance optimization <mi300x-triton-kernel-performance-optimization>`
|
||||
section of the :doc:`/how-to/tuning-guides/mi300x/workload` guide
|
||||
for detailed information.
|
||||
|
||||
Memory access efficiency
|
||||
========================
|
||||
Triton kernel performance optimization includes the following topics.
|
||||
|
||||
The accelerator or GPU contains global memory, local data share (LDS), and registers. Global memory has high access
|
||||
latency, but is large. LDS access has much lower latency, but is smaller. Register access is the fastest yet smallest
|
||||
among the three.
|
||||
* :ref:`mi300x-autotunable-kernel-config`
|
||||
|
||||
So, the data in global memory should be loaded and stored as few times as possible. If different threads in a block
|
||||
need to access the same data, these data should be first transferred from global memory to LDS, then accessed by
|
||||
different threads in a workgroup.
|
||||
* :ref:`mi300x-mlir-analysis`
|
||||
|
||||
.. _fine-tuning-llms-triton-hardware-resource-utilization:
|
||||
* :ref:`mi300x-assembly-analysis`
|
||||
|
||||
Hardware resource utilization
|
||||
=============================
|
||||
* :ref:`mi300x-torchinductor-tuning`
|
||||
|
||||
Each accelerator or GPU has multiple Compute Units (CUs) and various CUs do computation in parallel. So, how many CUs
|
||||
can a compute kernel allocate its task to? For the :doc:`AMD MI300X accelerator <../../reference/gpu-arch-specs>`, the
|
||||
grid should have at least 1024 thread blocks or workgroups.
|
||||
|
||||
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/compute-unit.png
|
||||
|
||||
Schematic representation of a CU in the CDNA2 or CDNA3 architecture.
|
||||
|
||||
To increase hardware utilization and maximize parallelism, it is necessary to design algorithms that can exploit more
|
||||
parallelism. One approach to achieving this is by using larger split-K techniques for General Matrix Multiply (GEMM)
|
||||
operations, which can further distribute the computation across more CUs, thereby enhancing performance.
|
||||
|
||||
.. tip::
|
||||
|
||||
You can query hardware resources with the command ``rocminfo`` (in the ``/opt/rocm/bin`` directory). For instance,
|
||||
query the number of CUs, number of SIMD, and wavefront size using the following commands.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocminfo | grep "Compute Unit"
|
||||
|
||||
rocminfo | grep "SIMD"
|
||||
|
||||
rocminfo | grep "Wavefront Size"
|
||||
|
||||
On an MI300X device, there are 304 CUs, 4 SIMD per CU, and the wavefront size (warp size) is 64. See :doc:`Hardware
|
||||
specifications <../../reference/gpu-arch-specs>` for a full list of AMD accelerators and GPUs.
|
||||
|
||||
.. _fine-tuning-llms-triton-ir-analysis:
|
||||
|
||||
IR analysis
|
||||
===========
|
||||
|
||||
In Triton, there are several layouts including *blocked*, *shared*, *sliced*, and *MFMA*.
|
||||
|
||||
From the Triton GPU IR (intermediate representation), you can know in which memory each computation is
|
||||
performed. The following is a snippet of IR from the Flash Attention decode ``int4`` key-value program. It is to
|
||||
de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``.
|
||||
|
||||
.. code-block::
|
||||
|
||||
%190 = tt.load %189 {cache = 1 : i32, evict = 1 : i32, isVolatile =
|
||||
false} : tensor<1x64xi32, #blocked6> loc(#loc159)
|
||||
|
||||
%266 = arith.andi %190, %cst_28 : tensor<1x64xi32, #blocked6>
|
||||
loc(#loc250)
|
||||
|
||||
%267 = arith.trunci %266 : tensor<1x64xi32, #blocked6> to
|
||||
tensor<1x64xi16, #blocked6> loc(#loc251)
|
||||
|
||||
%268 = tt.bitcast %267 : tensor<1x64xi16, #blocked6> -> tensor<1x64xf16,
|
||||
#blocked6> loc(#loc252)
|
||||
|
||||
%269 = triton_gpu.convert_layout %268 : (tensor<1x64xf16, #blocked6>) ->
|
||||
tensor<1x64xf16, #shared1> loc(#loc252)
|
||||
|
||||
%270 = tt.trans %269 : (tensor<1x64xf16, #shared1>) -> tensor<64x1xf16,
|
||||
#shared2> loc(#loc194)
|
||||
|
||||
%276 = triton_gpu.convert_layout %270 : (tensor<64x1xf16, #shared2>) ->
|
||||
tensor<64x1xf16, #blocked5> loc(#loc254)
|
||||
|
||||
%293 = arith.mulf %276, %cst_30 : tensor<64x1xf16, #blocked5>
|
||||
loc(#loc254)
|
||||
|
||||
%295 = arith.mulf %292, %294 : tensor<64x32xf16, #blocked5> loc(#loc264)
|
||||
|
||||
%297 = arith.addf %295, %296 : tensor<64x32xf16, #blocked5> loc(#loc255)
|
||||
|
||||
%298 = triton_gpu.convert_layout %297 : (tensor<64x32xf16, #blocked5>)
|
||||
-> tensor<64x32xf16, #shared1> loc(#loc255)
|
||||
|
||||
%299 = tt.trans %298 : (tensor<64x32xf16, #shared1>) ->
|
||||
tensor<32x64xf16, #shared2> loc(#loc196)
|
||||
|
||||
%300 = triton_gpu.convert_layout %299 : (tensor<32x64xf16, #shared2>) ->
|
||||
tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth
|
||||
= 4}>> loc(#loc197)
|
||||
|
||||
From the IR, you can see that ``i32`` data is loaded from global memory to registers. After a few element-wise operations
in registers, it is stored in shared memory for the transpose operation, which requires data movement across different
threads. Once the transpose is done, the data is loaded from LDS into registers again and, after a few more element-wise
operations, stored back to LDS. The last step loads from LDS into registers and converts to the dot-operand layout.

From the IR, you can see that the kernel uses the LDS twice: once for the transpose, and once to convert the blocked
layout to a dot-operand layout.
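
To inspect these layouts for your own kernel, one option (besides setting ``MLIR_ENABLE_DUMP=1``, described later in
this guide) is to read the compilation artifacts cached on the kernel handle. The following is a minimal sketch with a
toy kernel; whether the launch returns a handle exposing an ``.asm`` dictionary, and the exact keys it contains, depend
on the Triton version.

.. code-block:: python

   import torch
   import triton
   import triton.language as tl

   @triton.jit
   def scale_kernel(x_ptr, y_ptr, n, BLOCK: tl.constexpr):
       offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
       mask = offs < n
       tl.store(y_ptr + offs, tl.load(x_ptr + offs, mask=mask) * 2.0, mask=mask)

   x = torch.randn(4096, device="cuda")  # ROCm PyTorch exposes the GPU through the "cuda" device
   y = torch.empty_like(x)
   handle = scale_kernel[(triton.cdiv(x.numel(), 1024),)](x, y, x.numel(), BLOCK=1024)

   # The handle caches the compilation artifacts, so the IRs can be inspected
   # without re-running the compiler. Keys typically include 'ttir', 'ttgir',
   # 'llir', and 'amdgcn' on AMD backends.
   print(handle.asm.keys())
   print(handle.asm["ttgir"][:2000])
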
Assembly analysis
|
||||
=================
|
||||
|
||||
In the ISA, ensure ``global_load_dwordx4`` is used, especially when the
|
||||
load happens in a loop.
|
||||
|
||||
In most cases, LDS loads and stores should use ``_b128`` as well to
minimize the number of LDS access instructions. Note that the upstream compiler (or backend) might not have ``_b128``
LDS reads and writes, in which case it uses ``_b64``. In most cases, whether you use the ROCm fork or upstream Triton,
LDS accesses should have at least a ``_b64`` vector width.
|
||||
|
||||
The AMD ISA has the ``s_waitcnt`` instruction to synchronize dependencies
between memory accesses and computation. In the context of Triton, the
``s_waitcnt`` instruction typically carries two counters:
|
||||
|
||||
* ``lgkmcnt(n):`` `lgkm` stands for LDS, GDS, Constant and Message.

In this context, it is often related to LDS access. The number ``n`` is how many such accesses can still be
outstanding when execution continues. For example, ``0`` means all ``lgkm`` accesses must finish before continuing, and
``1`` means only one ``lgkm`` access can still be running asynchronously before proceeding.

* ``vmcnt(n):`` `vm` means vector memory.

This counter applies when vector memory is accessed, for example, when a global load moves data from global memory
into registers. Again, the number ``n`` is how many such accesses can still be outstanding when execution continues.
|
||||
|
||||
Generally recommended guidelines are as follows.
|
||||
|
||||
* Vectorize memory access as much as possible.
|
||||
|
||||
* Ensure synchronization is done efficiently.
|
||||
|
||||
* Overlap instructions to hide latency; this requires thoughtful
analysis of the algorithm.

* If you find inefficiencies, you can trace them back to LLVM IR, TTGIR,
and even TTIR to see where the problem comes from. If you find it was
introduced during compiler optimization, activate the MLIR dump and check which
optimization pass caused the problem.
|
||||
|
||||
.. _fine-tuning-llms-triton-kernel-occupancy:
|
||||
|
||||
Kernel occupancy
|
||||
================
|
||||
|
||||
1. Get the VGPR count: search for ``.vgpr_count`` in the ISA (call this value ``N``).

2. Get the LDS allocated to the kernel by following these steps (call this value ``L``).
|
||||
|
||||
a. ``export MLIR_ENABLE_DUMP=1``
|
||||
|
||||
b. ``rm -rf ~/.triton/cache``
|
||||
|
||||
c. ``python kernel.py | grep "triton_gpu.shared = " | tail -n 1``
|
||||
|
||||
d. You should see something like ``triton_gpu.shared = 65536``, indicating 65536 bytes of LDS are allocated for the
|
||||
kernel.
|
||||
|
||||
3. Get the number of waves per workgroup using the following steps (call this value ``nW``).
|
||||
|
||||
a. ``export MLIR_ENABLE_DUMP=1``
|
||||
|
||||
b. ``rm -rf ~/.triton/cache``
|
||||
|
||||
c. ``python kernel.py | grep "triton_gpu.num-warps " | tail -n 1``

d. You should see something like ``"triton_gpu.num-warps" = 8``, indicating 8 waves per workgroup.
|
||||
|
||||
4. Compute the occupancy limited by VGPR usage based on ``N`` according to the following table. Call the resulting
number of waves per EU ``occ_vgpr``.
|
||||
|
||||
.. _fine-tuning-llms-occupancy-vgpr-table:
|
||||
|
||||
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/occupancy-vgpr.png
|
||||
:alt: Occupancy related to VGPR usage in an Instinct MI300X accelerator.
|
||||
:align: center
|
||||
|
||||
5. Compute occupancy limited by LDS based on L by: ``occ_lds = floor(65536 / L)``.
|
||||
|
||||
6. Then the occupancy is ``occ = min(floor(occ_vgpr * 4 / nW), occ_lds) * nW / 4``
|
||||
|
||||
a. ``occ_vgpr * 4`` gives the total number of waves on all 4 execution units (SIMDs)
per CU.

b. ``floor(occ_vgpr * 4 / nW)`` gives the occupancy of workgroups per CU
regarding VGPR usage.
|
||||
|
||||
c. The true ``occ`` is the minimum of the two.
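
As a worked illustration of these steps, the following sketch plugs in example values (``N = 176``, ``L = 65536``, and
``nW = 8`` are assumptions) and uses a simplified stand-in for the VGPR table above: 512 VGPRs per EU allocated in units
of 16, ignoring the hardware cap on waves per EU.

.. code-block:: python

   from math import floor

   def occ_waves_per_eu(N: int, L: int, nW: int) -> int:
       """Estimate occupancy (waves per EU) from VGPR usage, LDS usage, and
       waves per workgroup, following the steps in this section."""
       granule = 16                                 # VGPRs are allocated in units of 16
       vgprs_per_wave = -(-N // granule) * granule  # round N up to the allocation granule
       occ_vgpr = 512 // vgprs_per_wave             # waves per EU limited by VGPR usage
       occ_lds = floor(65536 / L)                   # workgroups per CU limited by LDS usage
       return min(floor(occ_vgpr * 4 / nW), occ_lds) * nW // 4

   print(occ_waves_per_eu(N=176, L=65536, nW=8))    # prints 2
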
.. _fine-tuning-llms-triton-kernel-configs-env-vars:
|
||||
|
||||
Auto-tunable kernel configurations and environment variables
|
||||
============================================================
|
||||
|
||||
This section relates to the amount of :ref:`memory access <fine-tuning-llms-triton-memory-access-efficiency>` and
|
||||
computation assigned to each CU. It is related to the usage of LDS, registers and the scheduling of different tasks on
|
||||
a CU.
|
||||
|
||||
The following is a list of kernel arguments used for tuning; an example of passing them through Triton's autotuner follows the list.
|
||||
|
||||
``num_stages=n``
|
||||
Adjusts the number of pipeline stages for different types of kernels. On AMD accelerators, set ``num_stages``
|
||||
according to the following rules:
|
||||
|
||||
* For kernels with a single GEMM, set to ``0``.
|
||||
|
||||
* For kernels with two GEMMs fused (Flash Attention, or any other kernel
|
||||
that fuses 2 GEMMs), set to ``1``.
|
||||
|
||||
* For kernels that fuse a single GEMM with another non-GEMM operator
|
||||
(for example ReLU activation), set to ``0``.
|
||||
|
||||
* For kernels that have no GEMMs, set to ``1``.
|
||||
|
||||
``waves_per_eu=n``
|
||||
Helps to manage Vector General Purpose Registers (VGPR) usage to achieve desired occupancy levels. This argument
|
||||
hints to the compiler to reduce VGPR to achieve ``n`` occupancy. See
|
||||
:ref:`Kernel occupancy <fine-tuning-llms-triton-kernel-occupancy>` for more information about how to compute
|
||||
occupancy.
|
||||
|
||||
This argument is useful if:
|
||||
|
||||
* The occupancy of the kernel is limited by VGPR usage.
|
||||
|
||||
* The current VGPR usage is only a few above a boundary in
|
||||
:ref:`Occupancy related to VGPR usage in an Instinct MI300X accelerator <fine-tuning-llms-occupancy-vgpr-table>`.
|
||||
|
||||
For example, according to the table, the available VGPR count is 512 per Execution Unit (EU), and VGPRs are allocated in
units of 16. If the current VGPR usage is 170, the actual requested VGPR count will be 176, so the
occupancy is only 2 waves per EU since :math:`176 \times 3 > 512`. So, if you set
``waves_per_eu`` to 3, the LLVM backend tries to bring VGPR usage down so
that it might fit 3 waves per EU.
|
||||
|
||||
``BLOCK_M``, ``BLOCK_N``, ``BLOCK_K``
|
||||
Tile sizes to be tuned to balance the memory-to-computation ratio. You want tile sizes large enough to
maximize compute efficiency relative to memory traffic, but small enough to parallelize across the greatest number of
workgroups at the grid level.
|
||||
|
||||
``matrix_instr_nonkdim``
|
||||
Experimental feature for Flash Attention-like kernels that determines the size of the Matrix Fused Multiply-Add
|
||||
(MFMA) instruction used.
|
||||
|
||||
- ``matrix_instr_nonkdim = 16``: ``mfma_16x16`` is used.

- ``matrix_instr_nonkdim = 32``: ``mfma_32x32`` is used.
|
||||
|
||||
For GEMM kernels on an AMD MI300X accelerator, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
|
||||
tile/GEMM sizes.
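
The following sketch shows one way these arguments can be fed to Triton's autotuner for a GEMM kernel. The kernel,
tile sizes, and configuration values are placeholders rather than recommendations, and the AMD-specific options
(``waves_per_eu``, ``matrix_instr_nonkdim``) passed through ``triton.Config`` are recognized by the ROCm Triton
backend; support can vary by Triton version.

.. code-block:: python

   import triton
   import triton.language as tl

   # Hypothetical tuning space; the values below are placeholders.
   gemm_configs = [
       triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64,
                      "waves_per_eu": 2, "matrix_instr_nonkdim": 16},
                     num_warps=8, num_stages=0),
       triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64,
                      "waves_per_eu": 3, "matrix_instr_nonkdim": 16},
                     num_warps=4, num_stages=0),
   ]

   @triton.autotune(configs=gemm_configs, key=["M", "N", "K"])
   @triton.jit
   def gemm_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                   stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
                   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
       pid_m = tl.program_id(0)
       pid_n = tl.program_id(1)
       rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
       rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
       rk = tl.arange(0, BLOCK_K)
       acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
       for k in range(0, K, BLOCK_K):
           ks = k + rk
           a = tl.load(a_ptr + rm[:, None] * stride_am + ks[None, :] * stride_ak,
                       mask=(rm[:, None] < M) & (ks[None, :] < K), other=0.0)
           b = tl.load(b_ptr + ks[:, None] * stride_bk + rn[None, :] * stride_bn,
                       mask=(ks[:, None] < K) & (rn[None, :] < N), other=0.0)
           acc += tl.dot(a, b)
       c_ptrs = c_ptr + rm[:, None] * stride_cm + rn[None, :] * stride_cn
       tl.store(c_ptrs, acc, mask=(rm[:, None] < M) & (rn[None, :] < N))

The autotuner benchmarks each configuration the first time a new ``(M, N, K)`` shape is seen and caches the fastest one.
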
The following is an environment variable used for tuning.
|
||||
|
||||
``OPTIMIZE_EPILOGUE``
|
||||
Setting this variable to ``1`` can improve performance by removing the ``convert_layout`` operation in the epilogue.
|
||||
It should be turned on (set to ``1``) in most cases. Setting ``OPTIMIZE_EPILOGUE=1`` stores the MFMA instruction
|
||||
results in the MFMA layout directly; this comes at the cost of reduced global store efficiency, but the impact on
|
||||
kernel execution time is usually minimal.
|
||||
|
||||
By default (``0``), the results of the MFMA instruction are converted to the blocked layout, which leads to ``global_store``
with the maximum vector length, that is, ``global_store_dwordx4``.
|
||||
|
||||
This is done implicitly with LDS as the intermediate buffer to achieve
|
||||
data exchange between threads. Padding is used in LDS to avoid bank
|
||||
conflicts. This usually leads to extra LDS usage, which might reduce
|
||||
occupancy.
|
||||
|
||||
.. note::
|
||||
|
||||
This variable is not turned on by default because it only
|
||||
works with ``tt.store`` but not ``tt.atomic_add``, which is used in split-k and
|
||||
stream-k GEMM kernels. In the future, it might be enabled with
|
||||
``tt.atomic_add`` and turned on by default.
|
||||
|
||||
See :ref:`IR analysis <fine-tuning-llms-triton-ir-analysis>`.
|
||||
|
||||
TorchInductor with Triton tuning knobs
|
||||
===========================================
|
||||
|
||||
The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (``conv``) operations in PyTorch
|
||||
using ``inductor``, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better
|
||||
performance.
|
||||
|
||||
Learn more about TorchInductor environment variables and usage in
|
||||
`PyTorch documentation <https://pytorch.org/docs/2.3/torch.compiler_inductor_profiling.html>`_.
|
||||
|
||||
Enabling ``gemm``/``conv`` lowering to Triton requires use of ``inductor``'s ``max_autotune`` mode. This benchmarks a
static list of Triton configurations (``conv`` configurations for max auto-tune + ``matmul`` configurations for max
auto-tune) and uses the fastest for each shape. Note that Triton is not used if regular :doc:`MIOpen <miopen:index>`
or :doc:`rocBLAS <rocblas:index>` is faster for a specific operation. A combined example follows the list below.
|
||||
|
||||
* Set ``torch._inductor.config.max_autotune = True`` or ``TORCHINDUCTOR_MAX_AUTOTUNE=1``.
|
||||
|
||||
* Or, for more fine-grained control:
|
||||
|
||||
``torch._inductor.config.max_autotune_pointwise = True``
To enable tuning for ``pointwise``/``reduction`` ops.
|
||||
|
||||
``torch._inductor.config.max_autotune_gemm = True``
|
||||
To enable tuning or lowering of ``mm``/``conv``\s.
|
||||
|
||||
``torch._inductor.config.max_autotune_gemm_backends``/``TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS``
|
||||
To select the candidate backends for ``mm`` auto-tuning. Defaults to
|
||||
``TRITON,ATEN,NV``. This also includes the ``CUTLASS`` tuning option. Limiting this to
|
||||
``TRITON`` might improve performance by enabling more fused ``mm`` kernels
|
||||
instead of going to rocBLAS.
|
||||
|
||||
* For ``mm`` tuning, enabling ``coordinate_descent`` tuning might improve performance.
|
||||
|
||||
``torch._inductor.config.coordinate_descent_tuning = True`` or ``TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1``
|
||||
|
||||
* Inference can see large improvements on AMD GPUs by utilizing
|
||||
``torch._inductor.config.freezing=True`` or the ``TORCHINDUCTOR_FREEZING=1`` variable, which
|
||||
in-lines weights as constants and enables constant folding optimizations.
|
||||
|
||||
* Enabling ``inductor``'s ``cpp_wrapper`` might reduce overhead. This generates
C++ code which launches Triton binaries directly with
``hipModuleLaunchKernel`` and relies on `hipification`.
|
||||
|
||||
* For NHWC convolution workloads,
``torch._inductor.config.layout_optimization=True`` or ``TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1``
can help by enforcing the channels_last format throughout the graph, avoiding
any additional transposes added by ``inductor``. Note that
``PYTORCH_MIOPEN_SUGGEST_NHWC=1`` is recommended if using this.
|
||||
|
||||
* To extract the generated Triton kernels, set ``TORCH_COMPILE_DEBUG=1``. This creates a
``torch_compile_debug/`` directory at the current path; the ``output_code.py`` files
contain the code strings for the Triton kernels that are defined. Manual work is
then required to strip out the kernel and set up kernel
compilation and launch via Triton.
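
The following minimal sketch shows how these knobs might be combined in a script (as noted above, the same settings can
be applied through the corresponding ``TORCHINDUCTOR_*`` environment variables). It assumes a ROCm-enabled PyTorch
build and uses a toy model purely for illustration.

.. code-block:: python

   import torch
   import torch._inductor.config as inductor_config

   # Tuning knobs discussed above.
   inductor_config.max_autotune = True               # benchmark Triton configs against rocBLAS/MIOpen per shape
   inductor_config.coordinate_descent_tuning = True  # refine mm tile sizes further
   inductor_config.freezing = True                   # inference only: inline weights and constant-fold

   model = torch.nn.Sequential(torch.nn.Linear(4096, 4096), torch.nn.ReLU()).half().cuda().eval()
   compiled = torch.compile(model, mode="max-autotune")

   with torch.no_grad():
       out = compiled(torch.randn(8, 4096, dtype=torch.float16, device="cuda"))
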
Other guidelines
|
||||
================
|
||||
|
||||
* For performance-critical kernels, HIP provides an environment variable, ``export HIP_FORCE_DEV_KERNARG=1``,
that can put HIP kernel arguments directly in
device memory to reduce the latency of accessing kernel arguments. It
can save 2 to 3 μs for some kernels. Setting this variable for the Flash Attention
decode case, which contains ``splitK`` and reduce kernels, can reduce the total time
by around 6 μs in the benchmark test.
|
||||
|
||||
* Set the clock to deterministic. Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed to
1900 MHz instead of the default 2100 MHz. This can reduce the chance of the clock speed dropping due to high chip
temperature by setting a lower cap. You can restore this setting to its default value with ``rocm-smi -r``.
|
||||
|
||||
* Set Non-Uniform Memory Access (NUMA) auto-balance. Run the command ``cat /proc/sys/kernel/numa_balancing`` to check the
current setting. An output of ``0`` indicates this setting is available. If the output is ``1``, run the command
``sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'`` to set it.
|
||||
|
||||
For these settings, the ``env_check.sh`` script automates setting, resetting, and checking such
environment settings. Find the script at `<https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh>`__.
|
||||
|
||||
.. _fine-tuning-llms-triton-tunableop:
|
||||
|
||||
TunableOp
|
||||
---------
|
||||
`TunableOp <https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md>`_
|
||||
is a feature used to define and optimize kernels that can have tunable parameters. This is useful in
|
||||
optimizing the performance of custom kernels by exploring different parameter configurations to find the most efficient
|
||||
setup. See more about PyTorch TunableOp in :ref:`Model acceleration libraries <fine-tuning-llms-pytorch-tunableop>`.
|
||||
|
||||
You can easily manipulate the behavior of TunableOp through environment variables, though you could also use the C++ interface
``at::cuda::tunable::getTuningContext()``. A Python interface to the ``TuningContext`` does not yet exist.
|
||||
|
||||
The default value is ``0``, which means only 1 iteration is attempted. Remember: there’s an overhead to tuning. To try
|
||||
and minimize the overhead, only a limited number of iterations of a given operation are attempted. If you set this to
|
||||
``10``, each solution for a given operation can run as many iterations as possible within 10ms. There is a hard-coded
|
||||
upper limit of 100 iterations attempted per solution. This is a tuning parameter; if you want the tunings to be chosen
|
||||
based on an average over multiple iterations, increase the allowed tuning duration.
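
A minimal sketch of driving TunableOp through environment variables follows. The variable names are taken from the
TunableOp README linked above; the values shown (including the 30 ms tuning duration) are assumptions to adapt to your
workload.

.. code-block:: python

   import os

   # Set these before the first tuned GEMM runs (ideally before importing torch).
   os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"                  # turn the feature on
   os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"                   # tune each new shape on first encounter
   os.environ["PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS"] = "30"  # allow averaging over more iterations
   os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "tunableop_results.csv"

   import torch

   a = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
   b = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
   c = a @ b   # the first matmul of this shape is tuned; the chosen solution is written to the CSV file
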
* :ref:`mi300x-compute-kernel-occ`
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
# Optimizing with Composable Kernel
|
||||
|
||||
The AMD ROCm™ Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
|
||||
The AMD ROCm Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
|
||||
|
||||
This article gives a high-level overview of CK General Matrix Multiplication (GEMM) kernel based on the design example of `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X accelerators using CK.
|
||||
|
||||
|
||||
@@ -6,212 +6,24 @@
|
||||
Profiling and debugging
|
||||
***********************
|
||||
|
||||
This section discusses profiling and debugging tools and some of their common usage patterns with ROCm applications.
|
||||
This section provides an index for further documentation on profiling and
|
||||
debugging tools and their common usage patterns.
|
||||
|
||||
PyTorch Profiler
|
||||
================
|
||||
See :ref:`AMD Instinct MI300X™ workload optimization <mi300x-profiling-start>`
|
||||
for a conceptual summary of the workload profiling workflow for ROCm applications
|
||||
on AMD hardware -- including fine-tuning LLMs.
|
||||
|
||||
`PyTorch Profiler <https://pytorch.org/docs/stable/profiler.html>`_ can be invoked inside Python scripts, letting you
|
||||
collect CPU and GPU performance metrics while the script is running. See the `PyTorch Profiler tutorial
|
||||
<https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html>`_ for more information.
|
||||
There, you'll find information on higher-level and kernel-level profiling tools
|
||||
as well as other profiling and debugging suggestions.
|
||||
|
||||
You can then visualize and view these metrics using an open-source profile visualization tool like
|
||||
`Perfetto UI <https://ui.perfetto.dev>`_.
|
||||
* :ref:`PyTorch Profiler <mi300x-pytorch-profiler>`
|
||||
|
||||
#. Use the following snippet to invoke PyTorch Profiler in your code.
|
||||
* :ref:`ROCm profiling tools <mi300x-profiling-tools>`
|
||||
|
||||
.. code-block:: python
|
||||
* :ref:`ROCProfiler <mi300x-rocprof>`
|
||||
|
||||
import torch
|
||||
import torchvision.models as models
|
||||
from torch.profiler import profile, record_function, ProfilerActivity
|
||||
model = models.resnet18().cuda()
|
||||
inputs = torch.randn(2000, 3, 224, 224).cuda()
|
||||
|
||||
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
|
||||
with record_function("model_inference"):
|
||||
model(inputs)
|
||||
prof.export_chrome_trace("resnet18_profile.json")
|
||||
* :ref:`Omniperf <mi300x-omniperf>`
|
||||
|
||||
#. Profile results in ``resnet18_profile.json`` can be viewed by the Perfetto visualization tool. Go to
|
||||
`<https://ui.perfetto.dev>`__ and import the file. In your Perfetto visualization, you'll see that the upper section
|
||||
shows transactions denoting the CPU activities that launch GPU kernels while the lower section shows the actual GPU
|
||||
activities where it processes the ``resnet18`` inferences layer by layer.
|
||||
|
||||
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/perfetto-trace.svg
|
||||
|
||||
Perfetto trace visualization example.
|
||||
|
||||
ROCm profiling tools
|
||||
====================
|
||||
|
||||
Heterogeneous systems, where programs run on both CPUs and GPUs, introduce additional complexities. Understanding the
|
||||
critical path and kernel execution is all the more important; so, performance tuning is a necessary component in the
|
||||
benchmarking process.
|
||||
|
||||
With AMD's profiling tools, developers are able to gain important insight into how efficiently their application is
|
||||
using hardware resources and effectively diagnose potential bottlenecks contributing to poor performance. Developers
|
||||
working with AMD Instinct accelerators have multiple tools depending on their specific profiling needs; these are:
|
||||
|
||||
* :ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>`
|
||||
* :ref:`Omniperf <fine-tuning-llms-profiling-omniperf>`
|
||||
* :ref:`Omnitrace <fine-tuning-llms-profiling-omnitrace>`
|
||||
|
||||
.. _fine-tuning-llms-profiling-rocprof:
|
||||
|
||||
ROCProfiler
|
||||
-----------
|
||||
:doc:`ROCProfiler <rocprofiler:index>` is primarily a low-level API for accessing and extracting GPU hardware performance
|
||||
metrics, commonly called *performance counters*. These counters quantify the performance of the underlying architecture
|
||||
showcasing which pieces of the computational pipeline and memory hierarchy are being utilized.
|
||||
|
||||
Your ROCm installation contains a script or executable command called ``rocprof`` which provides the ability to list all
|
||||
available hardware counters for your specific accelerator or GPU, and run applications while collecting counters during
|
||||
their execution.
|
||||
|
||||
This ``rocprof`` utility also depends on the :doc:`ROCTracer and ROC-TX libraries <roctracer:index>`, giving it the
|
||||
ability to collect timeline traces of the accelerator software stack as well as user-annotated code regions.
|
||||
|
||||
.. note::
|
||||
|
||||
``rocprof`` is a CLI-only utility so input and output takes the format of ``.txt`` and CSV files. These
|
||||
formats provide a raw view of the data and puts the onus on the user to parse and analyze. Therefore, ``rocprof``
|
||||
gives the user full access and control of raw performance profiling data, but requires extra effort to analyze the
|
||||
collected data.
|
||||
|
||||
.. _fine-tuning-llms-profiling-omniperf:
|
||||
|
||||
Omniperf
|
||||
--------
|
||||
`Omniperf <https://rocm.github.io/omniperf>`_ is a system performance profiler for high-performance computing (HPC) and
|
||||
machine learning (ML) workloads using Instinct accelerators. Under the hood, Omniperf uses
|
||||
:ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>` to collect hardware performance counters. The Omniperf tool performs
|
||||
system profiling based on all approved hardware counters for Instinct
|
||||
accelerator architectures. It provides high level performance analysis features including System Speed-of-Light, IP
|
||||
block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more.
|
||||
|
||||
Omniperf takes the guesswork out of profiling by removing the need to provide text input files with lists of counters
|
||||
to collect and analyze raw CSV output files as is the case with ROC-profiler. Instead, Omniperf automates the collection
|
||||
of all available hardware counters in one command and provides a graphical interface to help users understand and
|
||||
analyze bottlenecks and stressors for their computational workloads on AMD Instinct accelerators.
|
||||
|
||||
.. note::
|
||||
|
||||
Omniperf collects hardware counters in multiple passes, and will therefore re-run the application during each pass
|
||||
to collect different sets of metrics.
|
||||
|
||||
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/omniperf-analysis.png
|
||||
|
||||
Omniperf memory chart analysis panel.
|
||||
|
||||
In brief, Omniperf provides details about hardware activity for a particular GPU kernel. It also supports both
a web-based GUI and a command-line analyzer, depending on your preference.
|
||||
|
||||
.. _fine-tuning-llms-profiling-omnitrace:
|
||||
|
||||
Omnitrace
|
||||
---------
|
||||
|
||||
`Omnitrace <https://rocm.github.io/omnitrace>`_ is a comprehensive profiling and tracing tool for parallel applications,
|
||||
including HPC and ML packages, written in C, C++, Fortran, HIP, OpenCL, and Python which execute on the CPU or CPU and
|
||||
GPU. It is capable of gathering the performance information of functions through any combination of binary
|
||||
instrumentation, call-stack sampling, user-defined regions, and Python interpreter hooks.
|
||||
|
||||
Omnitrace supports interactive visualization of comprehensive traces in the web browser in addition to high-level
|
||||
summary profiles with ``mean/min/max/stddev`` statistics. Beyond runtime
|
||||
information, Omnitrace supports the collection of system-level metrics such as CPU frequency, GPU temperature, and GPU
|
||||
utilization. Process and thread level metrics such as memory usage, page faults, context switches, and numerous other
|
||||
hardware counters are also included.
|
||||
|
||||
.. tip::
|
||||
|
||||
When analyzing the performance of an application, it is best not to assume you know where the performance
|
||||
bottlenecks are and why they are happening. Omnitrace is the ideal tool for characterizing where optimization would
|
||||
have the greatest impact on the end-to-end execution of the application and to discover what else is happening on the
|
||||
system during a performance bottleneck.
|
||||
|
||||
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/omnitrace-timeline.png
|
||||
|
||||
Omnitrace timeline trace example.
|
||||
|
||||
For detailed usage and examples of these tools, refer to the
|
||||
`Introduction to profiling tools for AMD hardware <https://rocm.blogs.amd.com/software-tools-optimization/profilers/README.html>`_
|
||||
developer blog.
|
||||
|
||||
Debugging with ROCr Debug Agent
|
||||
===============================
|
||||
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>` is a library that can be loaded by the ROCm platform
|
||||
runtime (:doc:`ROCr <rocr-runtime:index>`) to provide the following functionalities for all AMD accelerators and GPUs
|
||||
supported by the ROCm Debugger API (:doc:`ROCdbgapi <rocdbgapi:index>`).
|
||||
|
||||
* Print the state of all AMD accelerator or GPU wavefronts that caused a queue error; for example, causing a memory
|
||||
violation, executing an ``s_trap2``, or executing an illegal instruction.
|
||||
|
||||
* Print the state of all AMD accelerator or GPU wavefronts by sending a ``SIGQUIT`` signal to the process in question;
|
||||
for example, by pressing ``Ctrl + \`` while the process is executing.
|
||||
|
||||
Debugging memory access faults
|
||||
------------------------------
|
||||
|
||||
Identifying a faulting kernel is often enough to triage a memory access fault. To that end, the
|
||||
`ROCr Debug Agent <https://github.com/ROCm/rocr_debug_agent/>`_ can trap a memory access fault and provide a dump of all
|
||||
active wavefronts that caused the error as well as the name of the kernel. The
|
||||
`ROCr Debug Agent Library README <https://github.com/ROCm/rocr_debug_agent/blob/master/README.md>`_ provides full
|
||||
instructions, but in brief:
|
||||
|
||||
* Compiling with ``-ggdb -O0`` is recommended but not required.
|
||||
|
||||
* ``HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2 HSA_ENABLE_DEBUG=1 ./my_program``
|
||||
|
||||
When the debug agent traps the fault, it will produce an extremely
|
||||
verbose output of all wavefront registers and memory content.
|
||||
Importantly, it also prints something like:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Disassembly for function vector_add_assert_trap(int*, int*, int*):
|
||||
|
||||
code object:
|
||||
file:////rocm-debug-agent/build/test/rocm-debug-agent-test#offset=14309&size=31336
|
||||
|
||||
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
|
||||
|
||||
The kernel name and the code object file should be listed. In the
|
||||
example above, the kernel name is ``vector_add_assert_trap``, but this might
|
||||
also look like:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
Disassembly for function memory:///path/to/codeobject#offset=1234&size=567:
|
||||
|
||||
In this case, it is an in-memory kernel that was generated at runtime.
|
||||
|
||||
Using the following environment variable, the debug agent will save all code objects to the current directory (use
|
||||
``--save-code-objects=[DIR]`` to place them in another location). The code objects will be renamed from the URI format
|
||||
with special characters replaced by ``_``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
ROCM_DEBUG_AGENT_OPTIONS="--all --save-code-objects"
|
||||
|
||||
Use the ``llvm-objdump`` command to disassemble the indicated in-memory
|
||||
code object that has now been saved to disk. The name of the kernel is
|
||||
often found inside the disassembled code object.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
llvm-objdump --disassemble-all path/to/code-object.co
|
||||
|
||||
Consider turning off memory caching strategies both within the ROCm
|
||||
stack and PyTorch where possible. This will give the debug agent the
|
||||
best chance at finding the memory fault where it originates. Otherwise,
|
||||
it could be masked by writing past the end of a cached block within a
|
||||
larger allocation.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_NO_HIP_MEMORY_CACHING=1
|
||||
|
||||
HSA_DISABLE_FRAGMENT_ALLOCATOR=1
|
||||
* :ref:`Omnitrace <mi300x-omnitrace>`
|
||||
|
||||
* :ref:`ROCr Debug Agent <mi300x-rocr-debug-agent>`
|
||||
|
||||
@@ -38,7 +38,7 @@ Setting up the base implementation environment
|
||||
----------------------------------------------
|
||||
|
||||
#. Install PyTorch for ROCm. Refer to the
|
||||
:doc:`PyTorch installation guide <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`. For a consistent
|
||||
:doc:`PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`. For a consistent
|
||||
installation, it’s recommended to use official ROCm prebuilt Docker images with the framework pre-installed.
|
||||
|
||||
#. In the Docker container, check the availability of ROCm-capable accelerators using the following command.
|
||||
@@ -103,7 +103,7 @@ Setting up the base implementation environment
|
||||
pip install peft
|
||||
|
||||
# Install the other dependencies.
|
||||
pip install transformers, datasets, huggingface-hub, scipy
|
||||
pip install transformers datasets huggingface-hub scipy
|
||||
|
||||
#. Check that the required packages can be imported.
|
||||
|
||||
|
||||
@@ -16,10 +16,10 @@ Before getting started, install ROCm and supported machine learning frameworks.
|
||||
|
||||
Each release of ROCm supports specific hardware and software configurations. Before installing, consult the
|
||||
:doc:`System requirements <rocm-install-on-linux:reference/system-requirements>` and
|
||||
:doc:`Installation prerequisites <rocm-install-on-linux:how-to/prerequisites>` guides.
|
||||
:doc:`Installation prerequisites <rocm-install-on-linux:install/prerequisites>` guides.
|
||||
|
||||
If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for Linux
|
||||
<rocm-install-on-linux:tutorial/quick-start>`.
|
||||
<rocm-install-on-linux:install/quick-start>`.
|
||||
|
||||
If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
|
||||
:doc:`Radeon installation instructions <radeon:docs/install/install-radeon>`.
|
||||
@@ -53,8 +53,10 @@ ROCm supports popular machine learning frameworks and libraries including `PyTor
|
||||
Review the framework installation documentation. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
|
||||
images with the framework pre-installed.
|
||||
|
||||
* :doc:`PyTorch for ROCm <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
|
||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
|
||||
* :doc:`JAX for ROCm <rocm-install-on-linux:how-to/3rd-party/jax-install>`
|
||||
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
|
||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
|
||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
|
||||
The sections that follow in :doc:`Training a model <train-a-model>` are geared for a ROCm with PyTorch installation.
|
||||
|
||||
@@ -137,4 +137,4 @@ The following developer blogs showcase examples of how to fine-tune a model on a
|
||||
* Recipes for fine-tuning Llama2 and 3 with ``llama-recipes``
|
||||
|
||||
* `meta-llama/llama-recipes: Scripts for fine-tuning Meta Llama3 with composable FSDP & PEFT methods to cover
|
||||
single/multi-node GPUs <https://github.com/meta-llama/llama-recipes/tree/main/recipes/finetuning>`_
|
||||
single/multi-node GPUs <https://github.com/meta-llama/llama-recipes/tree/main/recipes/quickstart/finetuning>`_
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
ROCm">
|
||||
</head>
|
||||
|
||||
# System debugging guide
|
||||
# System debugging
|
||||
|
||||
## ROCm language and system-level debug, flags, and environment variables
|
||||
|
||||
@@ -65,4 +65,4 @@ Debug messages when developing/debugging base ROCm driver. You could enable the
|
||||
|
||||
## PCIe-debug
|
||||
|
||||
For information on how to debug and profile HIP applications, see {doc}`hip:how_to_guides/debugging`
|
||||
For information on how to debug and profile HIP applications, see {doc}`hip:how-to/debugging`
|
||||
|
||||
115
docs/how-to/system-optimization/index.rst
Normal file
@@ -0,0 +1,115 @@
|
||||
.. meta::
|
||||
:description: AMD hardware optimization for specific workloads
|
||||
:keywords: high-performance computing, HPC, Instinct accelerators, Radeon,
|
||||
tuning, tuning guide, AMD, ROCm
|
||||
|
||||
*******************
|
||||
System optimization
|
||||
*******************
|
||||
|
||||
This guide outlines system setup and tuning suggestions for AMD hardware to
|
||||
optimize performance for specific types of workloads or use-cases.
|
||||
|
||||
High-performance computing workloads
|
||||
====================================
|
||||
|
||||
High-performance computing (HPC) workloads have unique requirements. The default
|
||||
hardware and BIOS configurations for OEM platforms may not provide optimal
|
||||
performance for HPC workloads. To enable optimal HPC settings on a per-platform
|
||||
and per-workload level, this chapter describes:
|
||||
|
||||
* BIOS settings that can impact performance
|
||||
* Hardware configuration best practices
|
||||
* Supported versions of operating systems
|
||||
* Workload-specific recommendations for optimal BIOS and operating system
|
||||
settings
|
||||
|
||||
There is also a discussion on the AMD Instinct™ software development
|
||||
environment, including information on how to install and run the DGEMM, STREAM,
|
||||
HPCG, and HPL benchmarks. This guide provides a good starting point but is
|
||||
not tested exhaustively across all compilers.
|
||||
|
||||
Knowledge prerequisites to better understand this document and to perform tuning
|
||||
for HPC applications include:
|
||||
|
||||
* Experience in configuring servers
|
||||
* Administrative access to the server's Management Interface (BMC)
|
||||
* Administrative access to the operating system
|
||||
* Familiarity with the OEM server's BMC (strongly recommended)
|
||||
* Familiarity with the OS specific tools for configuration, monitoring, and
|
||||
troubleshooting (strongly recommended)
|
||||
|
||||
This document provides guidance on tuning systems with various AMD Instinct
|
||||
accelerators for HPC workloads. The following sections don't comprise an
|
||||
all-inclusive guide, and some items referred to may have similar, but different,
|
||||
names in various OEM systems (for example, OEM-specific BIOS settings). The
following sections also provide suggestions on items that should be the initial
|
||||
focus of additional, application-specific tuning.
|
||||
|
||||
While this guide is a good starting point, developers are encouraged to perform
|
||||
their own performance testing for additional tuning.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - System optimization guide
|
||||
|
||||
- Architecture reference
|
||||
|
||||
- White papers
|
||||
|
||||
* - :doc:`AMD Instinct MI300X <mi300x>`
|
||||
|
||||
- `AMD Instinct MI300 instruction set architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf>`_
|
||||
|
||||
- `CDNA 3 architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf>`_
|
||||
|
||||
* - :doc:`AMD Instinct MI300A <mi300a>`
|
||||
|
||||
- `AMD Instinct MI300 instruction set architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf>`_
|
||||
|
||||
- `CDNA 3 architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf>`_
|
||||
|
||||
* - :doc:`AMD Instinct MI200 <mi200>`
|
||||
|
||||
- `AMD Instinct MI200 instruction set architecture <https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf>`_
|
||||
|
||||
- `CDNA 2 architecture <https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf>`_
|
||||
|
||||
* - :doc:`AMD Instinct MI100 <mi100>`
|
||||
|
||||
- `AMD Instinct MI100 instruction set architecture <https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf>`_
|
||||
|
||||
- `CDNA architecture <https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf>`_
|
||||
|
||||
Workstation workloads
|
||||
=====================
|
||||
|
||||
Workstation workloads, much like those for HPC, have a unique set of
|
||||
requirements: a blend of both graphics and compute, certification, stability and
|
||||
others.
|
||||
|
||||
The document covers specific software requirements and processes needed to use
|
||||
these GPUs for Single Root I/O Virtualization (SR-IOV) and machine learning
|
||||
tasks.
|
||||
|
||||
The main purpose of this document is to help users utilize the RDNA™ 2 GPUs to
|
||||
their full potential.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - System optimization guide
|
||||
|
||||
- Architecture reference
|
||||
|
||||
- White papers
|
||||
|
||||
* - :doc:`AMD Radeon PRO W6000 and V620 <w6000-v620>`
|
||||
|
||||
- `AMD RDNA 2 instruction set architecture <https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf>`_
|
||||
|
||||
- `RDNA 2 architecture <https://www.amd.com/system/files/documents/rdna2-explained-radeon-pro-W6000.pdf>`_
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="MI100 high-performance computing and tuning guide">
|
||||
<meta name="keywords" content="MI100, high-performance computing, HPC, tuning, BIOS
|
||||
<meta name="keywords" content="MI100, high-performance computing, HPC, BIOS
|
||||
settings, NBIO, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# MI100 high-performance computing and tuning guide
|
||||
# AMD Instinct MI100 system optimization
|
||||
|
||||
## System settings
|
||||
|
||||