Compare commits


4 Commits

Author          SHA1        Message                                        Date
Wang, Yanyao    66e8ada46c  Test ROCm 6.1.2 build                          2024-06-05 14:48:12 -07:00
Wang, Yanyao    673f1eed48  Fix Markdown formate for the linter check      2024-06-05 10:41:25 -07:00
Wang, Yanyao    5642d4f7b0  Update the branch of ROCm repo after testing   2024-06-05 09:50:20 -07:00
Wang, Yanyao    7f93d1635d  Build ROCm from source                         2024-06-04 21:20:04 -07:00
212 changed files with 11273 additions and 5577 deletions

View File

@@ -17,7 +17,11 @@ resources:
pipelines:
- pipeline: rocr-runtime_pipeline
source: \ROCR-Runtime
trigger: true
trigger:
branches:
include:
- master
# this job will only be triggered after successful build sequence of llvm-project and ROCR-Runtime
trigger: none
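
For reference, a minimal sketch of the pipeline-resource trigger pattern the hunk above appears to move to: the `ROCR-Runtime` resource trigger is filtered to `master`, while the pipeline's own CI trigger is disabled so it only runs after the upstream build. Names mirror the hunk; the rest of the file is not shown here.

```yaml
# Sketch only: resource trigger filtered to master, own CI trigger disabled.
resources:
  pipelines:
    - pipeline: rocr-runtime_pipeline      # local alias used by this pipeline
      source: \ROCR-Runtime                # upstream pipeline definition name
      trigger:
        branches:
          include:
            - master                       # only master builds of ROCR-Runtime trigger this
trigger: none                              # no CI trigger of its own; runs only via the resource trigger
```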

View File

@@ -84,10 +84,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
-DMIGRAPHX_USE_COMPOSABLEKERNEL=OFF
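
A minimal sketch of how these flags reach CMake: the shared build-cmake step takes a single `extraBuildFlags` string, and the `>-` folded scalar joins the lines into one space-separated argument list. Paths and flag values below are taken from the hunk; other parameters of the template are omitted.

```yaml
# Sketch only: folded CMake flag list handed to the shared build-cmake template.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
  parameters:
    extraBuildFlags: >-
      -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
      -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
      -DCMAKE_BUILD_TYPE=Release
      -DAMDGPU_TARGETS=gfx1030;gfx1100
      -GNinja
```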

View File

@@ -16,15 +16,12 @@ parameters:
- libbz2-dev
- nlohmann-json3-dev
- libgtest-dev
- libdrm-dev
- name: rocmDependencies
type: object
default:
- rocMLIR
- rocRAND
- rocBLAS
- hipBLAS
- hipBLASLt
- half
- composable_kernel
- rocm-cmake
@@ -33,14 +30,13 @@ parameters:
- rocprofiler-register
- clr
- rocminfo
- roctracer
jobs:
- job: MIOpen
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.LARGE_DISK_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -13,7 +13,7 @@ parameters:
- libyaml-cpp-dev
- libpci-dev
- libpci3
- libgtest-dev
- googletest
- git
- name: rocmDependencies
type: object
@@ -35,10 +35,6 @@ jobs:
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
- name: HIP_INC_DIR
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
@@ -63,17 +59,10 @@ jobs:
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
# Set link to redirect llvm folder
- task: Bash@3
displayName: create symlink
inputs:
targetType: inline
script: ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCPACK_PACKAGING_INSTALL_PREFIX=$(Build.BinariesDirectory)
-GNinja

View File

@@ -12,7 +12,6 @@ parameters:
- ninja-build
- git
- python3-pip
- libdrm-dev
- name: rocmDependencies
type: object
default:
@@ -25,11 +24,10 @@ parameters:
jobs:
- job: composable_kernel
timeoutInMinutes: 100
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.ULTRA_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
@@ -59,6 +57,6 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=gfx942
-DGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -77,6 +77,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DHIP_PLATFORM=amd
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF

View File

@@ -8,13 +8,12 @@ parameters:
- name: aptPackages
type: object
default:
- gfortran
- git
- libdrm-dev
- libmsgpack-dev
- ninja-build
- python3-pip
- python3-venv
- libmsgpack-dev
- git
- python3-pip
- libdrm-dev
- name: pipModules
type: object
default:
@@ -22,16 +21,15 @@ parameters:
- name: rocmDependencies
type: object
default:
- clr
- hipBLAS
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- hipBLAS
jobs:
- job: hipBLASLt
timeoutInMinutes: 100
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -60,7 +58,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# CI case: download latest default branch build
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
@@ -74,42 +72,17 @@ jobs:
dependencySource: tag-builds
- script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
displayName: ROCm symbolic link
# Build and install gtest, lapack, hipBLAS-common
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
- script: mkdir $(Pipeline.Workspace)/deps
# hipBLASLt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
- script: cmake $(Pipeline.Workspace)/s/deps
displayName: Configure hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: make
displayName: Build hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: sudo make install
displayName: Install hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
# Set link to redirect llvm folder
- task: Bash@3
displayName: Symlink to rocm/lib/llvm
inputs:
targetType: inline
script: ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- script: sudo chmod 777 /mnt
displayName: 'Set permissions for /mnt'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeBuildDir: /mnt/build
cmakeSourceDir: $(Pipeline.Workspace)/s
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx90a
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LIBRARY_FORMAT=msgpack
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
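
A minimal sketch of the "build on /mnt" pattern used above, assuming the agent exposes a large scratch volume at /mnt: the workspace checkout stays in place, while the out-of-tree build directory is redirected via the template's `cmakeBuildDir` and `cmakeSourceDir` parameters.

```yaml
# Sketch only: out-of-tree build on the /mnt scratch disk, then artifact upload.
- script: sudo chmod 777 /mnt
  displayName: Set permissions for /mnt
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
  parameters:
    cmakeBuildDir: /mnt/build                    # build directory on the large disk
    cmakeSourceDir: $(Pipeline.Workspace)/s      # checked-out sources
    extraBuildFlags: >-
      -DCMAKE_BUILD_TYPE=Release
      -GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
```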

View File

@@ -57,6 +57,6 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TEST=ON
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -66,7 +66,7 @@ jobs:
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DUSE_HIP_CLANG=ON
-DHIP_COMPILER=clang
-DBUILD_CLIENTS_TESTS=ON

View File

@@ -61,6 +61,6 @@ jobs:
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -74,6 +74,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_TESTS=ON
-DUSE_CUDA=OFF
-GNinja

View File

@@ -65,13 +65,3 @@ jobs:
-DBUILD_CLIENTS_SAMPLES=OFF
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: hipSPARSE
publish: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/build/clients
contentsString: matrices/**
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: testMatrices

View File

@@ -75,7 +75,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=all
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_CODE_OBJECT_VERSION=default

View File

@@ -55,9 +55,9 @@ jobs:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/llvm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH="$(Agent.BuildDirectory)/rocm"
-DCMAKE_BUILD_TYPE=Release
-DHIPTENSOR_BUILD_TESTS=ON
-DAMDGPU_TARGETS=gfx942
multithreadFlag: -- -j32
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -61,7 +61,6 @@ jobs:
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
- script: chmod +x $(Agent.BuildDirectory)/rocm/bin/hipify-perl
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -72,6 +71,6 @@ jobs:
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TESTS=ON
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -1,138 +0,0 @@
parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: aptPackages
type: object
default:
- python3-pip
- python3-protobuf
- cmake
- ninja-build
- libprotobuf-dev
- libprotoc-dev
- protobuf-compiler
- liblmdb-dev
- pkg-config
- ffmpeg
- libavcodec-dev
- libavformat-dev
- libavutil-dev
- libswscale-dev
- libturbojpeg-dev
- libjpeg-turbo-official=3.0.2-20240124
- libopencv-dev
- name: pipModules
type: object
default:
- numpy
- opencv-python
- torch
- pillow
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- rocDecode
- half
- rpp
- MIVisionX
- aomp
jobs:
- job: rocAL
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- task: Bash@3
displayName: 'Register libjpeg-turbo packages'
inputs:
targetType: inline
script: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget -q -O- https://packagecloud.io/dcommander/libjpeg-turbo/gpgkey | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/libjpeg-turbo.gpg > /dev/null
echo "deb [signed-by=/etc/apt/trusted.gpg.d/libjpeg-turbo.gpg] https://packagecloud.io/dcommander/libjpeg-turbo/any/ any main" | sudo tee /etc/apt/sources.list.d/libjpeg-turbo.list
sudo apt update
apt-cache show libjpeg-turbo-official | grep Version
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- task: Bash@3
displayName: 'Clone PyBind11'
inputs:
targetType: inline
script: git clone --depth 1 -b v2.11.1 https://github.com/pybind/pybind11
workingDirectory: '$(Build.SourcesDirectory)'
- task: Bash@3
displayName: 'Clone RapidJSON'
inputs:
targetType: inline
script: git clone --depth 1 https://github.com/Tencent/rapidjson.git
workingDirectory: '$(Build.SourcesDirectory)'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: PyBind11
cmakeBuildDir: '$(Build.SourcesDirectory)/pybind11/build'
customInstallPath: false
installEnabled: false
extraBuildFlags: >-
-DDOWNLOAD_CATCH=ON
-DDOWNLOAD_EIGEN=ON
-GNinja
- task: Bash@3
displayName: 'Install PyBind11'
inputs:
targetType: inline
script: sudo cmake --build . --target install
workingDirectory: '$(Build.SourcesDirectory)/pybind11/build'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: RapidJSON
cmakeBuildDir: '$(Build.SourcesDirectory)/rapidjson/build'
customInstallPath: false
installEnabled: false
extraBuildFlags: >-
-GNinja
- task: Bash@3
displayName: 'Install RapidJSON'
inputs:
targetType: inline
script: sudo cmake --build . --target install
workingDirectory: '$(Build.SourcesDirectory)/rapidjson/build'
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
# manual build case: triggered by ROCm/ROCm repo
- ${{ if ne(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;/opt/libjpeg-turbo
-DCMAKE_INSTALL_PREFIX_PYTHON=$Python3_STDARCH
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -67,7 +67,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF

View File

@@ -108,7 +108,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LOGIC=asm_full
-DTensile_SEPARATE_ARCHITECTURES=ON

View File

@@ -64,7 +64,7 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DUSE_HIP_CLANG=ON
-DHIP_COMPILER=clang
-DBUILD_CLIENTS_TESTS=ON

View File

@@ -10,13 +10,6 @@ parameters:
default:
- cmake
- ninja-build
- git
- python3-pip
- name: rocmDependencies
type: object
default:
- llvm-project
- rocm-cmake
jobs:
- job: rocMLIR
@@ -24,6 +17,8 @@ jobs:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
container:
image: ${{ variables.DOCKER_IMAGE_NAME }}:${{ variables.LATEST_DOCKER_VERSION }}
workspace:
clean: all
steps:
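
A minimal sketch of the container job introduced above: the job's steps run inside a prebuilt Docker image rather than directly on the agent, with the image name and tag taken from pipeline variables. The checkout step is illustrative; the real job uses the shared checkout template.

```yaml
# Sketch only: running a job inside a container image named by pipeline variables.
jobs:
  - job: rocMLIR
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    container:
      image: ${{ variables.DOCKER_IMAGE_NAME }}:${{ variables.LATEST_DOCKER_VERSION }}
    workspace:
      clean: all
    steps:
      - checkout: self        # illustrative; the component templates use their own checkout step
```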
@@ -34,25 +29,13 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
# manual build case: triggered by ROCm/ROCm repo
- ${{ if ne(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=/opt/rocm
-DBUILD_FAT_LIBROCKCOMPILER=1
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -59,7 +59,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_BENCHMARK=ON
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_TEST=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -59,6 +59,6 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TEST=ON
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -82,7 +82,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF

View File

@@ -68,20 +68,10 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_SAMPLES=OFF
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/hip/cmake
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: rocSPARSE
publish: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/build/clients
contentsString: matrices/**
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: testMatrices

View File

@@ -60,7 +60,7 @@ jobs:
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_TEST=ON
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -65,7 +65,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DROCWMMA_BUILD_TESTS=ON
-DROCWMMA_BUILD_SAMPLES=OFF
-DAMDGPU_TARGETS=gfx942
-DGPU_TARGETS=gfx1100
-GNinja
# gfx1030 not supported in documentation
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -5,30 +5,6 @@ parameters:
- name: checkoutRef
type: string
default: ''
- name: aptPackages
type: object
default:
- libglfw3-dev
- name: rocmDependencies
type: object
default:
- AMDMIGraphX
- clr
- hipBLAS
- hipCUB
- HIPIFY
- hipRAND
- hipSOLVER
- hipSPARSE
- llvm-project
- rocBLAS
- rocPRIM
- rocprofiler-register
- ROCR-Runtime
- rocRAND
- rocSOLVER
- rocSPARSE
- rocThrust
jobs:
- job: rocm_examples
@@ -44,28 +20,5 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
# manual build case: triggered by ROCm/ROCm repo
- ${{ if ne(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: tag-builds
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
# https://github.com/ROCm/HIP/issues/2203
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_ROOT=$(Agent.BuildDirectory)/rocm
-DCMAKE_HIP_ARCHITECTURES=gfx942
-DCMAKE_EXE_LINKER_FLAGS=-fgpu-rdc
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -21,17 +21,4 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocprofiler-register
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocprofiler-register-tests
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Build.BinariesDirectory)
cmakeBuildDir: 'tests/build'
installEnabled: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofiler-register
testDir: 'tests/build'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -47,7 +47,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
- name: HIP_ROCCLR_HOME
value: $(Agent.BuildDirectory)/rocm
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
@@ -68,7 +68,7 @@ jobs:
displayName: 'Download aqlprofile'
inputs:
targetType: inline
script: wget -nv https://repo.radeon.com/rocm/misc/aqlprofile/ubuntu-22.04/hsa-amd-aqlprofile_1.0.0.60200.60200-crdnnh.14213~22.04_amd64.deb
script: wget -nv https://repo.radeon.com/rocm/apt/6.1/pool/main/h/hsa-amd-aqlprofile/hsa-amd-aqlprofile_1.0.0.60100.60100-82~22.04_amd64.deb
workingDirectory: '$(Pipeline.Workspace)'
- task: Bash@3
displayName: 'Extract aqlprofile'
@@ -76,7 +76,7 @@ jobs:
targetType: inline
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R hsa-amd-aqlprofile_1.0.0.60200.60200-crdnnh.14213~22.04_amd64.deb hsa-amd-aqlprofile
dpkg-deb -R hsa-amd-aqlprofile_1.0.0.60100.60100-82~22.04_amd64.deb hsa-amd-aqlprofile
workingDirectory: '$(Pipeline.Workspace)'
- task: Bash@3
displayName: 'Move aqlprofile'
@@ -84,7 +84,7 @@ jobs:
targetType: inline
script: |
mkdir -p $(Agent.BuildDirectory)/rocm
cp -R hsa-amd-aqlprofile/opt/rocm-6.2.0-14213/* $(Agent.BuildDirectory)/rocm
cp -R hsa-amd-aqlprofile/opt/rocm-6.1.0/* $(Agent.BuildDirectory)/rocm
workingDirectory: '$(Pipeline.Workspace)'
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
@@ -105,5 +105,5 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DENABLE_LDCONFIG=OFF
-DUSE_PROF_API=1
-DGPU_TARGETS=gfx942
-DGPU_TARGETS=gfx1030;gfx1100
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -15,13 +15,11 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocm-cmake
- clr
- llvm-project
- ROCdbgapi
- rocminfo
- ROCR-Runtime
- rocprofiler-register
jobs:
- job: rocr_debug_agent
@@ -58,6 +56,5 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -65,6 +65,6 @@ jobs:
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DGPU_TARGETS=gfx942
-DGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -60,6 +60,6 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DHALF_INCLUDE_DIRS=$(Agent.BuildDirectory)/rocm/include
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx942
-DAMDGPU_TARGETS=gfx1030;gfx1100
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -1,115 +0,0 @@
parameters:
# currently excludes clr and rocm-examples
- name: rocmDependencies
type: object
default:
- AMDMIGraphX
- amdsmi
- aomp-extras
- aomp
- composable_kernel
- half
- HIP
- hipBLAS
- hipBLASLt
- hipCUB
- hipFFT
- hipfort
- HIPIFY
- hipRAND
- hipSOLVER
- hipSPARSE
- hipSPARSELt
- hipTensor
- llvm-project
- MIOpen
- MIVisionX
- rccl
- rdc
- rocAL
- rocALUTION
- rocBLAS
- ROCdbgapi
- rocDecode
- rocFFT
- ROCgdb
- rocm-cmake
- rocm-core
- rocminfo
- rocMLIR
- ROCmValidationSuite
- rocm_bandwidth_test
- rocm_smi_lib
- rocPRIM
- rocprofiler-register
- rocprofiler
- ROCR-Runtime
- rocRAND
- rocr_debug_agent
- rocSOLVER
- rocSPARSE
- ROCT-Thunk-Interface
- rocThrust
- roctracer
- rocWMMA
- rpp
trigger: none
pr: none
schedules:
- cron: '30 7 * * *'
displayName: Nightly build
branches:
include:
- develop
always: true
jobs:
- job: rocm_nightly
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
- task: DeleteFiles@1
displayName: 'Cleanup checkout space'
inputs:
SourceFolder: '$(Agent.BuildDirectory)/s'
Contents: '**/*'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
SourceFolder: '$(Build.ArtifactStagingDirectory)'
Contents: '/**/*'
RemoveDotFiles: true
- script: sudo chmod 777 /mnt
displayName: 'Set permissions for /mnt'
- script: df -h
displayName: System disk space before ROCm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
extractToMnt: true
skipLibraryLinking: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh /mnt/rocm
displayName: Uncompressed ROCm size
- task: ArchiveFiles@2
displayName: Compress rocm-nightly
inputs:
rootFolderOrFile: /mnt/rocm
includeRootFolder: false
archiveType: tar
tarCompression: gz
archiveFile: $(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildNumber)_ubuntu2204.tar.gz
- script: du -sh $(Build.ArtifactStagingDirectory)
displayName: Compressed ROCm size
- task: PublishPipelineArtifact@1
displayName: 'Public ROCm Nightly Artifact'
retryCountOnTaskFailure: 3
inputs:
targetPath: '$(Build.ArtifactStagingDirectory)'

View File

@@ -1,29 +0,0 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml
parameters:
- name: checkoutRef
type: string
default: refs/tags/$(LATEST_RELEASE_TAG)
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
- repository: release_repo
type: github
endpoint: ROCm
name: ROCm/rocAL
ref: ${{ parameters.checkoutRef }}
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/rocAL.yml
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}

View File

@@ -1,29 +0,0 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml
parameters:
- name: checkoutRef
type: string
default: refs/tags/$(LATEST_RELEASE_TAG)
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
- repository: release_repo
type: github
endpoint: ROCm
name: ROCm/rocm-examples
ref: ${{ parameters.checkoutRef }}
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/rocm-examples.yml
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}

View File

@@ -9,63 +9,38 @@ parameters:
- name: useDefaultBranch
type: boolean
default: true
- name: extractToMnt
type: boolean
default: false
- name: defaultBranchList
type: object
default:
AMDMIGraphX: develop
amdsmi: develop
aomp-extras: aomp-dev
aomp: aomp-dev
aomp-extras: aomp-dev
AMDMIGraphX: develop
clr: develop
composable_kernel: develop
half: master
HIP: develop
hipBLAS: develop
hipBLASLt: develop
hipCUB: develop
hipFFT: develop
hipfort: develop
HIPIFY: amd-staging
hipRAND: develop
hipSOLVER: develop
hipSPARSE: develop
hipSPARSELt: develop
hipTensor: develop
llvm-project: amd-staging
MIOpen: develop
MIVisionX: develop
rccl: develop
rdc: develop
rocAL: develop
rocALUTION: develop
rocBLAS: develop
ROCdbgapi : amd-master
rocDecode: develop
rocFFT: develop
rocgdb: amd-staging
rocm-cmake: develop
rocm-core: master
rocm-examples: develop
rocminfo: amd-staging
rocMLIR: develop
ROCmValidationSuite: master
rocm_bandwidth_test: master
rocm_smi_lib: develop
rocminfo: master
rocMLIR: develop
rocPRIM: develop
rocprofiler-register: amd-mainline
rocprofiler: amd-master
ROCR-Runtime: master
rocRAND: develop
rocr_debug_agent: amd-staging
rocSOLVER: develop
rocSPARSE: develop
ROCT-Thunk-Interface: master
rocThrust: develop
roctracer: amd-master
rocWMMA: develop
rpp: master
- name: componentsFailureOkay
type: object
@@ -95,10 +70,7 @@ steps:
displayName: Extract ${{ parameters.componentName }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
${{ if parameters.extractToMnt }}:
destinationFolder: '/mnt/rocm'
${{ else }}:
destinationFolder: '$(Agent.BuildDirectory)/rocm'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
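
A minimal sketch of the conditional extract step shown in the hunk: `${{ if }}` / `${{ else }}` template expressions are resolved at compile time, so the task receives exactly one `destinationFolder` depending on the `extractToMnt` parameter.

```yaml
# Sketch only: compile-time conditional selecting the extract destination.
- task: ExtractFiles@1
  displayName: Extract ${{ parameters.componentName }}
  inputs:
    archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
    ${{ if parameters.extractToMnt }}:
      destinationFolder: '/mnt/rocm'
    ${{ else }}:
      destinationFolder: '$(Agent.BuildDirectory)/rocm'
    cleanDestinationFolder: false
    overwriteExistingFiles: true
```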

View File

@@ -5,15 +5,9 @@ parameters:
- name: extraBuildFlags
type: string
default: ''
- name: multithreadFlag
type: string
default: ''
- name: cmakeBuildDir
type: string
default: 'build'
- name: cmakeSourceDir
type: string
default: '..'
- name: cmakeTarget
type: string
default: 'install'
@@ -23,12 +17,6 @@ parameters:
- name: installDir
type: string
default: '$(Build.BinariesDirectory)'
- name: customInstallPath
type: boolean
default: true
- name: installEnabled
type: boolean
default: true
steps:
# create workingDirectory if it does not exist and change into it
@@ -37,27 +25,19 @@ steps:
displayName: '${{parameters.componentName }} CMake Flags'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
${{ if eq(parameters.customInstallPath, true) }}:
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
${{ else }}:
cmakeArgs: ${{ parameters.extraBuildFlags }} ..
- script: df -h
displayName: Disk space before build
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ..
# equivalent to running make $cmakeTargetDir from $cmakeBuildDir
# i.e., cd $cmakeBuildDir; make $cmakeTargetDir
- task: CMake@1
displayName: '${{parameters.componentName }} Build'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} ${{ parameters.multithreadFlag }}'
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }}'
retryCountOnTaskFailure: 10
- script: df -h
displayName: Disk space after build
# equivalent to running make $cmakeTarget from $cmakeBuildDir
# e.g., make install
- ${{ if eq(parameters.installEnabled, true) }}:
- task: CMake@1
displayName: '${{parameters.componentName }} ${{ parameters.cmakeTarget }}'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} --target ${{ parameters.cmakeTarget }}'
- task: CMake@1
displayName: '${{parameters.componentName }} ${{ parameters.cmakeTarget }}'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} --target ${{ parameters.cmakeTarget }}'
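
A minimal sketch of how a component job might call this template with the parameters introduced above. The parameter names come from the hunk; `componentName` and the flag values are illustrative only.

```yaml
# Sketch only: invoking the build-cmake template with the extended parameter set.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
  parameters:
    componentName: exampleComponent      # hypothetical name, used for display only
    cmakeBuildDir: build
    cmakeSourceDir: ..
    installEnabled: true                 # run the install target after the build
    customInstallPath: true              # prepend -DCMAKE_INSTALL_PREFIX=<installDir>
    installDir: $(Build.BinariesDirectory)
    multithreadFlag: -- -j32             # forwarded to `cmake --build` for parallel builds
    extraBuildFlags: >-
      -DCMAKE_BUILD_TYPE=Release
      -GNinja
```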

View File

@@ -12,31 +12,23 @@ steps:
displayName: 'sudo apt-get update'
inputs:
targetType: inline
script: sudo apt-get --yes update
env:
DEBIAN_FRONTEND: noninteractive
script: sudo apt-get update
- task: Bash@3
displayName: 'sudo apt-get upgrade'
inputs:
targetType: inline
script: sudo apt-get --yes upgrade
env:
DEBIAN_FRONTEND: noninteractive
script: sudo apt-get update
- task: Bash@3
displayName: 'sudo apt-get fix'
inputs:
targetType: inline
script: sudo apt --yes --fix-broken install
env:
DEBIAN_FRONTEND: noninteractive
- ${{ if gt(length(parameters.aptPackages), 0) }}:
- task: Bash@3
displayName: 'sudo apt-get install ...'
inputs:
targetType: inline
script: sudo apt-get --yes install ${{ join(' ', parameters.aptPackages) }}
env:
DEBIAN_FRONTEND: noninteractive
- ${{ if gt(length(parameters.pipModules), 0) }}:
- task: Bash@3
displayName: 'pip install ...'

View File

@@ -11,9 +11,6 @@ parameters:
- staging
- tag-builds
- fixed
- name: extractToMnt
type: boolean
default: false
# required values for fixed selection
- name: fixedPipelineIdentifier
type: string
@@ -26,112 +23,70 @@ parameters:
- name: stagingPipelineIdentifiers
type: object
default:
AMDMIGraphX: $(amdmigraphx-pipeline-id)
amdsmi: $(amdsmi-pipeline-id)
aomp-extras: $(aomp-extras-pipeline-id)
aomp: $(aomp-pipeline-id)
aomp-extras: $(aomp-extras-pipeline-id)
AMDMIGraphX: $(amdmigraphx-pipeline-id)
clr: $(clr-pipeline-id)
composable_kernel: $(composable-kernel-pipeline-id)
half: $(half-pipeline-id)
HIP: $(hip-pipeline-id)
hipBLAS: $(hipblas-pipeline-id)
hipBLASLt: $(hipblaslt-pipeline-id)
hipCUB: $(hipcub-pipeline-id)
hipFFT: $(hipfft-pipeline-id)
hipfort: $(hipfort-pipeline-id)
HIPIFY: $(hipify-pipeline-id)
hipRAND: $(hiprand-pipeline-id)
hipSOLVER: $(hipsolver-pipeline-id)
hipSPARSE: $(hipsparse-pipeline-id)
hipSPARSELt: $(hipsparselt-pipeline-id)
hipTensor: $(hiptensor-pipeline-id)
llvm-project: $(llvm-project-pipeline-id)
MIOpen: $(miopen-pipeline-id)
MIVisionX: $(mivisionx-pipeline-id)
rccl: $(rccl-pipeline-id)
rdc: $(rdc-pipeline-id)
rocAL: $(rocal-pipeline-id)
rocALUTION: $(rocalution-pipeline-id)
rocBLAS: $(rocblas-pipeline-id)
ROCdbgapi : $(rocdbgapi-pipeline-id)
rocDecode: $(rocdecode-pipeline-id)
rocFFT: $(rocfft-pipeline-id)
ROCgdb: $(rocgdb-pipeline-id)
rocm-cmake: $(rocm-cmake-pipeline-id)
rocm-core: $(rocm-core-pipeline-id)
rocm-examples: $(rocm-examples-pipeline-id)
rocm_smi_lib: $(rocm-smi-lib-pipeline-id)
rocminfo: $(rocminfo-pipeline-id)
rocMLIR: $(rocmlir-pipeline-id)
ROCmValidationSuite: $(rocmvalidationsuite-pipeline-id)
rocm_bandwidth_test: $(rocm-bandwidth-test-pipeline-id)
rocm_smi_lib: $(rocm-smi-lib-pipeline-id)
rocPRIM: $(rocprim-pipeline-id)
rocprofiler-register: $(rocprofiler-register-pipeline-id)
rocprofiler: $(rocprofiler-pipeline-id)
ROCR-Runtime: $(rocr-runtime-pipeline-id)
rocRAND: $(rocrand-pipeline-id)
rocr_debug_agent: $(rocr-debug-agent-pipeline-id)
rocSOLVER: $(rocsolver-pipeline-id)
rocSPARSE: $(rocsparse-pipeline-id)
ROCT-Thunk-Interface: $(roct-thunk-interface-pipeline-id)
rocThrust: $(rocthrust-pipeline-id)
roctracer: $(roctracer-pipeline-id)
rocWMMA: $(rocwmma-pipeline-id)
rpp: $(rpp-pipeline-id)
- name: taggedPipelineIdentifiers
type: object
default:
AMDMIGraphX: $(amdmigraphx-tagged-pipeline-id)
amdsmi: $(amdsmi-tagged-pipeline-id)
aomp-extras: $(aomp-extras-tagged-pipeline-id)
aomp: $(aomp-tagged-pipeline-id)
aomp-extras: $(aomp-extras-tagged-pipeline-id)
AMDMIGraphX: $(amdmigraphx-tagged-pipeline-id)
clr: $(clr-tagged-pipeline-id)
composable_kernel: $(composable-kernel-tagged-pipeline-id)
half: $(half-tagged-pipeline-id)
HIP: $(hip-tagged-pipeline-id)
hipBLAS: $(hipblas-tagged-pipeline-id)
hipBLASLt: $(hipblaslt-tagged-pipeline-id)
hipCUB: $(hipcub-tagged-pipeline-id)
hipFFT: $(hipfft-tagged-pipeline-id)
hipfort: $(hipfort-tagged-pipeline-id)
HIPIFY: $(hipify-tagged-pipeline-id)
hipRAND: $(hiprand-tagged-pipeline-id)
hipSOLVER: $(hipsolver-tagged-pipeline-id)
hipSPARSE: $(hipsparse-tagged-pipeline-id)
hipSPARSELt: $(hipsparselt-tagged-pipeline-id)
hipTensor: $(hiptensor-tagged-pipeline-id)
llvm-project: $(llvm-project-tagged-pipeline-id)
MIOpen: $(miopen-tagged-pipeline-id)
MIVisionX: $(mivisionx-tagged-pipeline-id)
rccl: $(rccl-tagged-pipeline-id)
rdc: $(rdc-tagged-pipeline-id)
rocAL: $(rocal-tagged-pipeline-id)
rocALUTION: $(rocalution-tagged-pipeline-id)
rocBLAS: $(rocblas-tagged-pipeline-id)
ROCdbgapi : $(rocdbgapi-tagged-pipeline-id)
rocDecode: $(rocdecode-tagged-pipeline-id)
rocFFT: $(rocfft-tagged-pipeline-id)
ROCgdb: $(rocgdb-tagged-pipeline-id)
rocm-cmake: $(rocm-cmake-tagged-pipeline-id)
rocm-core: $(rocm-core-tagged-pipeline-id)
rocm-examples: $(rocm-examples-tagged-pipeline-id)
rocm_smi_lib: $(rocm-smi-lib-tagged-pipeline-id)
rocminfo: $(rocminfo-tagged-pipeline-id)
rocMLIR: $(rocmlir-tagged-pipeline-id)
ROCmValidationSuite: $(rocmvalidationsuite-tagged-pipeline-id)
rocm_bandwidth_test: $(rocm-bandwidth-test-tagged-pipeline-id)
rocm_smi_lib: $(rocm-smi-lib-tagged-pipeline-id)
rocPRIM: $(rocprim-tagged-pipeline-id)
rocprofiler-register: $(rocprofiler-register-tagged-pipeline-id)
rocprofiler: $(rocprofiler-tagged-pipeline-id)
ROCR-Runtime: $(rocr-runtime-tagged-pipeline-id)
rocRAND: $(rocrand-tagged-pipeline-id)
rocr_debug_agent: $(rocr-debug-agent-tagged-pipeline-id)
rocSOLVER: $(rocsolver-tagged-pipeline-id)
rocSPARSE: $(rocsparse-tagged-pipeline-id)
ROCT-Thunk-Interface: $(roct-thunk-interface-tagged-pipeline-id)
rocThrust: $(rocthrust-tagged-pipeline-id)
roctracer: $(roctracer-tagged-pipeline-id)
rocWMMA: $(rocwmma-tagged-pipeline-id)
rpp: $(rpp-tagged-pipeline-id)
# set to true if you're calling this template file multiple files in same pipeline
# only leave last call false to optimize sequence
@@ -147,45 +102,31 @@ steps:
parameters:
componentName: ${{ dependency }}
pipelineId: ${{ parameters.stagingPipelineIdentifiers[dependency] }}
extractToMnt: ${{ parameters.extractToMnt }}
- ${{ if eq(parameters.dependencySource, 'tag-builds') }}:
- template: artifact-download.yml
parameters:
componentName: ${{ dependency }}
pipelineId: ${{ parameters.taggedPipelineIdentifiers[dependency] }}
extractToMnt: ${{ parameters.extractToMnt }}
# fixed case only accepts one component at a time, so no array input
- ${{ if eq(parameters.dependencySource, 'fixed') }}:
- template: artifact-download.yml
parameters:
componentName: ${{ parameters.fixedComponentName }}
pipelineId: ${{ parameters.fixedPipelineIdentifier }}
extractToMnt: ${{ parameters.extractToMnt }}
- task: Bash@3
displayName: 'list downloaded ROCm files'
inputs:
targetType: inline
${{ if eq(parameters.extractToMnt, true) }}:
script: ls -1R /mnt/rocm
${{ else }}:
script: ls -1R $(Agent.BuildDirectory)/rocm
script: ls -1R $(Agent.BuildDirectory)/rocm
- ${{ if eq(parameters.skipLibraryLinking, false) }}:
- task: Bash@3
displayName: 'link ROCm shared libraries'
inputs:
targetType: inline
# OS ignores if the ROCm lib folder shows up more than once
${{ if eq(parameters.extractToMnt, true) }}:
script: |
echo /mnt/rocm/lib | sudo tee -a /etc/ld.so.conf
echo /mnt/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf
sudo cat /etc/ld.so.conf
sudo ldconfig -v
ldconfig -p
${{ else }}:
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf
sudo cat /etc/ld.so.conf
sudo ldconfig -v
ldconfig -p
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf
sudo cat /etc/ld.so.conf
sudo ldconfig -v
ldconfig -p

View File

@@ -21,8 +21,6 @@ variables:
value: rocm-ci_ultra_build_pool
- name: ON_PREM_BUILD_POOL
value: rocm-ci_build_pool
- name: LARGE_DISK_BUILD_POOL
value: rocm-ci_larger_base_disk_pool
- name: LATEST_RELEASE_TAG
value: rocm-6.1.0
- name: DOCKER_IMAGE_NAME

View File

@@ -3,19 +3,20 @@
version: 2
sphinx:
configuration: docs/conf.py
formats: [htmlzip]
python:
install:
- requirements: docs/sphinx/requirements.txt
build:
os: ubuntu-22.04
tools:
python: "3.10"
apt_packages:
- "doxygen"
- "gfortran" # For pre-processing fortran sources
- "graphviz" # For dot graphs in doxygen
python:
install:
- requirements: docs/sphinx/requirements.txt
sphinx:
configuration: docs/conf.py
formats: []
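
For orientation, a sketch of the general shape of a Read the Docs v2 config assembled from the keys that appear in this hunk. The key order and the `formats` value differ between the two sides of the diff, so this is one plausible reading rather than the exact post-change file.

```yaml
# Sketch only: Read the Docs v2 config built from the keys shown above.
version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.10"
  apt_packages:
    - "doxygen"
    - "gfortran"   # for pre-processing Fortran sources
    - "graphviz"   # for dot graphs in doxygen

python:
  install:
    - requirements: docs/sphinx/requirements.txt

sphinx:
  configuration: docs/conf.py

formats: []        # the hunk shows both "[htmlzip]" and "[]"; only one applies after the change
```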

View File

@@ -2,7 +2,6 @@ AAC
ABI
ACE
ACEs
ACS
AccVGPR
AccVGPRs
ALU
@@ -13,7 +12,6 @@ AMDMIGraphX
AMI
AOCC
AOMP
APBDIS
APIC
APIs
APU
@@ -26,13 +24,11 @@ ATI
AddressSanitizer
AlexNet
Arb
BARs
BLAS
BMC
BitCode
Blit
Bluefield
Bootloader
CCD
CDNA
CIFAR
@@ -47,7 +43,6 @@ CPF
CPP
CPU
CPUs
Cron
CSC
CSE
CSV
@@ -67,10 +62,7 @@ CommonMark
Concretized
Conda
ConnectX
DDR
DF
DGEMM
DIMM
DKMS
DL
DMA
@@ -99,9 +91,7 @@ FFmpeg
FHS
FMA
FP
FX
Filesystem
FindDb
Flang
Fortran
Fuyu
@@ -134,7 +124,6 @@ GitHub
Gitpod
HBM
HCA
HGX
HIPCC
HIPExtension
HIPIFY
@@ -144,14 +133,12 @@ HPE
HPL
HSA
HWE
HWS
Haswell
Higgs
Hyperparameters
ICV
IDE
IDEs
IFWI
IMDb
IOMMU
IOP
@@ -161,7 +148,6 @@ IRQ
ISA
ISV
ISVs
ITL
ImageNet
InfiniBand
Inlines
@@ -173,7 +159,6 @@ JSON
Jupyter
KFD
KiB
KV
KVM
Keras
Khronos
@@ -208,7 +193,6 @@ MVFFR
Makefile
Makefiles
Matplotlib
Megatrends
Megatron
Mellanox
Mellanox's
@@ -224,7 +208,6 @@ NIC
NICs
NLI
NLP
NPKit
NPS
NSP
NUMA
@@ -254,22 +237,18 @@ OpenCV
OpenFabrics
OpenGL
OpenMP
OpenMPI
OpenSSL
OpenVX
PCC
PCI
PCIe
PEFT
PIL
PILImage
POR
PRNG
PRs
PaLM
Pageable
PeerDirect
PerfDb
Perfetto
PipelineParallel
PnP
@@ -308,7 +287,6 @@ SBIOS
SCA
SDK
SDMA
SDPA
SDRAM
SENDMSG
SGPR
@@ -330,12 +308,10 @@ SRAMECC
SVD
SWE
SerDes
ShareGPT
Shlens
Skylake
Softmax
Spack
SplitK
Supermicro
Szegedy
TCA
@@ -346,12 +322,8 @@ TCP
TCR
TF
TFLOPS
TP
TPU
TPUs
TSME
Tagram
TensileLite
TensorBoard
TensorFlow
TensorParallel
@@ -372,7 +344,6 @@ USM
UTCL
UTIL
Uncached
Unittests
Unhandled
VALU
VBIOS
@@ -461,7 +432,6 @@ cuLIB
cuRAND
cuSOLVER
cuSPARSE
cTDP
dataset
datasets
dataspace
@@ -496,7 +466,6 @@ executables
ffmpeg
filesystem
fortran
fp
galb
gcc
gdb
@@ -510,7 +479,6 @@ gzip
heterogenous
hipBLAS
hipBLASLt
hipBLASLt's
hipCUB
hipFFT
hipLIB
@@ -527,8 +495,6 @@ hipfort
hipify
hipsolver
hipsparse
hotspotting
hpc
hpp
hsa
hsakmt
@@ -536,7 +502,6 @@ hyperparameter
ib_core
inband
incrementing
inductor
inferencing
inflight
init
@@ -594,8 +559,6 @@ prebuilt
precompiled
prefetch
prefetchable
prefill
prefills
preprocess
preprocessed
preprocessing
@@ -668,7 +631,6 @@ subexpression
subfolder
subfolders
supercomputing
td
tensorfloat
th
tokenization
@@ -720,8 +682,7 @@ writebacks
wrreq
wzo
xargs
xGMI
xz
yaml
ysvmadyb
zypper
zypper

CHANGELOG.md (new file, 9101 lines): file diff suppressed because it is too large.

View File

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@@ -76,8 +76,8 @@ The Build time will reduce significantly if we limit the GPU Architecture/s agai
mkdir -p ~/WORKSPACE/ # Or any folder name other than WORKSPACE
cd ~/WORKSPACE/
export ROCM_VERSION=6.1.0 # or 6.1.1 6.1.2
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.1.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
export ROCM_VERSION=6.1.0 # or 6.1.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.1.x -m rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
# --------------------------------------
@@ -86,9 +86,9 @@ export ROCM_VERSION=6.1.0 # or 6.1.1 6.1.2
# Option 1: Start a docker container
# Pulling required base docker images:
# Ubuntu20.04 built from ROCm/tools/rocm-build/docker/ubuntu20/Dockerfile
# Ubuntu20.04 built from ROCm/rocm-build/docker/ubuntu20/Dockerfile
docker pull rocm/rocm-build-ubuntu-20.04:6.1
# Ubuntu22.04 built from ROCm/tools/rocm-build/docker/ubuntu22/Dockerfile
# Ubuntu22.04 built from ROCm/rocm-build/docker/ubuntu22/Dockerfile
docker pull rocm/rocm-build-ubuntu-22.04:6.1
# Start docker container and mount the source code folder:
@@ -107,10 +107,10 @@ docker run -ti \
# Option 2: Install required packages into the host machine
# For ubuntu20.04 system
cd ROCm/tools/rocm-build/docker/ubuntu20
cd ROCm/rocm-build/docker/ubuntu20
bash install-prerequisites.sh
# For ubuntu22.04 system
cd ROCm/tools/rocm-build/docker/ubuntu22
cd ROCm/rocm-build/docker/ubuntu22
bash install-prerequisities.sh
# --------------------------------------
@@ -126,13 +126,13 @@ export GPU_ARCHS="gfx940;gfx941;gfx942" # Example
# Pick and run build commands in the docker container:
# Build rocm-dev packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
make -f ROCm/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
# Build all ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
make -f ROCm/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
# list all ROCm components to find required components
make -f ROCm/tools/rocm-build/ROCm.mk list_components
make -f ROCm/rocm-build/ROCm.mk list_components
# Build a single ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk T_rocblas
make -f ROCm/rocm-build/ROCm.mk T_rocblas
# Find built packages in ubuntu20.04:
out/ubuntu-20.04/20.04/deb/
@@ -151,7 +151,7 @@ out/ubuntu-22.04/22.04/logs/rocblas.inprogress # Example
out/ubuntu-22.04/22.04/logs/rocblas # Example
```
Note: [Overview for ROCm.mk](tools/rocm-build/README.md)
Note: [Overview for ROCm.mk](rocm-build/README.md)
## ROCm documentation

View File

@@ -1,4 +1,4 @@
# ROCm 6.1.5 release notes
# ROCm 6.1.2 release notes
<!-- Do not edit this file! This file is autogenerated with -->
<!-- tools/autotag/tag_script.py -->
@@ -11,369 +11,136 @@
<!-- spellcheck-disable -->
The release notes provide a summary of notable changes since the previous ROCm release.
ROCm 6.1.2 includes enhancements to SMI tools and improvements to some libraries.
- [Release highlights](#release-highlights)
### OS support
- [Operating system support](#operating-system-support)
ROCm 6.1.2 has been tested against a pre-release version of Ubuntu 22.04.5 (kernel: 5.15 [GA], 6.8 [HWE]).
- [ROCm components versioning](#rocm-components)
### AMD SMI
- [ROCm known issues](#rocm-known-issues)
AMD SMI for ROCm 6.1.2
- [ROCm upcoming changes](#rocm-upcoming-changes)
#### Additions
* Added process isolation and clean shader APIs and CLI commands.
* `amdsmi_get_gpu_process_isolation()`
* `amdsmi_set_gpu_process_isolation()`
* `amdsmi_set_gpu_clear_sram_data()`
* Added the `MIN_POWER` metric to output provided by `amd-smi static --limit`.
#### Optimizations
* Updated the `amd-smi monitor --pcie` output to prevent delays with the `monitor` command.
#### Changes
* Updated `amismi_get_power_cap_info` to return values in uW instead of W.
* Updated Python library return types for `amdsmi_get_gpu_memory_reserved_pages` and `amdsmi_get_gpu_bad_page_info`.
* Updated the output of `amd-smi metric --ecc-blocks` to show counters available from blocks.
#### Fixes
* `amdsmi_get_gpu_board_info()` no longer returns junk character strings.
* `amd-smi metric --power` now correctly details power output for RDNA3, RDNA2, and MI1x devices.
* Fixed the `amdsmitstReadWrite.TestPowerCapReadWrite` test for RDNA3, RDNA2, and MI100 devices.
* Fixed an issue with the `amdsmi_get_gpu_memory_reserved_pages` and `amdsmi_get_gpu_bad_page_info` Python interface calls.
#### Removals
* Removed the `amdsmi_get_gpu_process_info` API from the Python library. It was removed from the C library in an earlier release.
```{note}
If youre using Radeon™ PRO or Radeon GPUs in a workstation setting with a display connected, see the [Use ROCm on Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/compatibility/native_linux/native_linux_compatibility.html)
documentation to verify compatibility and system requirements.
See the AMD SMI [detailed changelog](https://github.com/ROCm/amdsmi/blob/rocm-6.1.x/CHANGELOG.md) with code samples for more information.
```
## Release highlights
### ROCm SMI
The following is the notable improvement in ROCm 6.1.5.
ROCm SMI for ROCm 6.1.2
### Fixed compatibility issue with third-party profiling tools using ROCprofiler-SDK backend
#### Additions
[rocprofiler-register](https://github.com/ROCm/rocprofiler-register) library has resolved the profiling tools compatibility issue where applications potentially failed with the error message `rocprofiler_configure not found. Tried to dlopen`. This prevents a failure when profiling ROCm 6.1 applications using third-party profiling tools upgraded to use [ROCprofiler-SDK](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/) in ROCm 6.2.0 or later.
* Added the ring hang event to the `amdsmi_evt_notification_type_t` enum.
## Operating system support
#### Fixes
ROCm 6.1.5 no longer supports CentOS 7.9. All other operating system support from ROCm 6.1.2 remains unchanged in this release.
* Fixed an issue causing ROCm SMI to incorrectly report GPU utilization for RDNA3 GPUs. See the issue on [GitHub](https://github.com/ROCm/ROCm/issues/3112).
* Fixed the parsing of `pp_od_clk_voltage` in `get_od_clk_volt_info` to work better with MI-series hardware.
ROCm 6.1.5 requires the [native package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.1.5/how-to/native-install/index.html).
## Library changes in ROCm 6.1.2
See the [Compatibility
matrix](../../docs/compatibility/compatibility-matrix.rst)
for more information about operating system compatibility.
| Library | Version |
|---------|---------|
| AMDMIGraphX | [2.9](https://github.com/ROCm/AMDMIGraphX/releases/tag/rocm-6.1.2) |
| composable_kernel | [0.2.0](https://github.com/ROCm/composable_kernel/releases/tag/rocm-6.1.2) |
| hipBLAS | [2.1.0](https://github.com/ROCm/hipBLAS/releases/tag/rocm-6.1.2) |
| hipBLASLt | [0.7.0](https://github.com/ROCm/hipBLASLt/releases/tag/rocm-6.1.2) |
| hipCUB | [3.1.0](https://github.com/ROCm/hipCUB/releases/tag/rocm-6.1.2) |
| hipFFT | [1.0.14](https://github.com/ROCm/hipFFT/releases/tag/rocm-6.1.2) |
| hipRAND | [2.10.17](https://github.com/ROCm/hipRAND/releases/tag/rocm-6.1.2) |
| hipSOLVER | [2.1.1](https://github.com/ROCm/hipSOLVER/releases/tag/rocm-6.1.2) |
| hipSPARSE | [3.0.1](https://github.com/ROCm/hipSPARSE/releases/tag/rocm-6.1.2) |
| hipSPARSELt | [0.2.0](https://github.com/ROCm/hipSPARSELt/releases/tag/rocm-6.1.2) |
| hipTensor | [1.2.0](https://github.com/ROCm/hipTensor/releases/tag/rocm-6.1.2) |
| MIOpen | [3.1.0](https://github.com/ROCm/MIOpen/releases/tag/rocm-6.1.2) |
| MIVisionX | [2.5.0](https://github.com/ROCm/MIVisionX/releases/tag/rocm-6.1.2) |
| rccl | [2.18.6](https://github.com/ROCm/rccl/releases/tag/rocm-6.1.2) |
| rocALUTION | [3.1.1](https://github.com/ROCm/rocALUTION/releases/tag/rocm-6.1.2) |
| rocBLAS | 4.1.0 ⇒ [4.1.2](https://github.com/ROCm/rocBLAS/releases/tag/rocm-6.1.2) |
| rocDecode | 0.5.0 ⇒ [0.6.0](https://github.com/ROCm/rocDecode/releases/tag/rocm-6.1.2) |
| rocFFT | [1.0.27](https://github.com/ROCm/rocFFT/releases/tag/rocm-6.1.2) |
| rocm-cmake | [0.12.0](https://github.com/ROCm/rocm-cmake/releases/tag/rocm-6.1.2) |
| rocPRIM | [3.1.0](https://github.com/ROCm/rocPRIM/releases/tag/rocm-6.1.2) |
| rocRAND | [3.0.1](https://github.com/ROCm/rocRAND/releases/tag/rocm-6.1.2) |
| rocSOLVER | [3.25.0](https://github.com/ROCm/rocSOLVER/releases/tag/rocm-6.1.2) |
| rocSPARSE | [3.1.2](https://github.com/ROCm/rocSPARSE/releases/tag/rocm-6.1.2) |
| rocThrust | [3.0.1](https://github.com/ROCm/rocThrust/releases/tag/rocm-6.1.2) |
| rocWMMA | [1.4.0](https://github.com/ROCm/rocWMMA/releases/tag/rocm-6.1.2) |
| rpp | [1.5.0](https://github.com/ROCm/rpp/releases/tag/rocm-6.1.2) |
| Tensile | [4.40.0](https://github.com/ROCm/Tensile/releases/tag/rocm-6.1.2) |
## ROCm components
### RCCL
The following table lists the versions of ROCm components for ROCm 6.1.5.
Click {fab}`github` to go to the component's source code on GitHub.
RCCL 2.18.6 for ROCm 6.1.2
<div class="pst-scrollable-table-container">
<table id="rocm-rn-components" class="table">
<thead>
<tr>
<th>Category</th>
<th>Group</th>
<th>Name</th>
<th>Version</th>
<th></th>
</tr>
</thead>
<colgroup>
<col span="1">
<col span="1">
</colgroup>
<tbody class="rocm-components-libs rocm-components-ml">
<tr>
<th rowspan="6">Libraries</th>
<th rowspan="6">Machine learning and computer vision</th>
<td><a href="https://rocm.docs.amd.com/projects/composable_kernel/en/docs-6.1.5">Composable Kernel</a>
</td>
<td>1.1.0</td>
<td><a href="https://github.com/ROCm/composable_kernel/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/AMDMIGraphX/en/docs-6.1.5">MIGraphX</a></td>
<td>2.9</td>
<td><a href="https://github.com/ROCm/AMDMIGraphX/releases/tag/rocm-6.1.5"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/MIOpen/en/docs-6.1.5">MIOpen</a></td>
<td>3.1.0</td>
<td><a href="https://github.com/ROCm/MIOpen/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/MIVisionX/en/docs-6.1.5">MIVisionX</a></td>
<td>2.5.0</td>
<td><a href="https://github.com/ROCm/MIVisionX/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocDecode/en/docs-6.1.5">rocDecode</a></td>
<td>0.6.0</td>
<td><a href="https://github.com/ROCm/rocDecode/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rpp/en/docs-6.1.5">RPP</a></td>
<td>1.5.0</td>
<td><a href="https://github.com/ROCm/rpp/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-libs rocm-components-communication">
<tr>
<th rowspan="1"></th>
<th rowspan="1">Communication</th>
<td><a href="https://rocm.docs.amd.com/projects/rccl/en/docs-6.1.5">RCCL</a></td>
<td>2.18.6</td>
<td><a href="https://github.com/ROCm/rccl/releases/tag/rocm-6.1.5"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-libs rocm-components-math tbody-reverse-zebra">
<tr>
<th rowspan="16"></th>
<th rowspan="16">Math</th>
<td><a href="https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.1.5">hipBLAS</a></td>
<td>2.1.0</td>
<td><a href="https://github.com/ROCm/hipBLAS/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipBLASLt/en/docs-6.1.5">hipBLASLt</a></td>
<td>0.7.0</td>
<td><a href="https://github.com/ROCm/hipBLASLt/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-6.1.5">hipFFT</a></td>
<td>1.0.14</td>
<td><a href="https://github.com/ROCm/hipFFT/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipfort/en/docs-6.1.5">hipfort</a></td>
<td>0.4.0</td>
<td><a href="https://github.com/ROCm/hipfort/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipRAND/en/docs-6.1.5">hipRAND</a></td>
<td>2.10.16</td>
<td><a href="https://github.com/ROCm/hipRAND/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-6.1.5">hipSOLVER</a></td>
<td>2.1.1</td>
<td><a href="https://github.com/ROCm/hipSOLVER/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-6.1.5">hipSPARSE</a></td>
<td>3.0.1</td>
<td><a href="https://github.com/ROCm/hipSPARSE/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSELt/en/docs-6.1.5">hipSPARSELt</a></td>
<td>0.2.0</td>
<td><a href="https://github.com/ROCm/hipSPARSELt/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocALUTION/en/docs-6.1.5">rocALUTION</a></td>
<td>3.1.1</td>
<td><a href="https://github.com/ROCm/rocALUTION/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocBLAS/en/docs-6.1.5">rocBLAS</a></td>
<td>4.1.2</td>
<td><a href="https://github.com/ROCm/rocBLAS/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocFFT/en/docs-6.1.5">rocFFT</a></td>
<td>1.0.27</td>
<td><a href="https://github.com/ROCm/rocFFT/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocRAND/en/docs-6.1.5">rocRAND</a></td>
<td>3.0.1</td>
<td><a href="https://github.com/ROCm/rocRAND/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-6.1.5">rocSOLVER</a></td>
<td>3.25.0</td>
<td><a href="https://github.com/ROCm/rocSOLVER/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-6.1.5">rocSPARSE</a></td>
<td>3.1.2</td>
<td><a href="https://github.com/ROCm/rocSPARSE/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocWMMA/en/docs-6.1.5">rocWMMA</a></td>
<td>1.4.0</td>
<td><a href="https://github.com/ROCm/rocWMMA/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://github.com/ROCm/tensile/">Tensile</a></td>
<td>4.40.0</td>
<td><a href="https://github.com/ROCm/tensile/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-libs rocm-components-primitives tbody-reverse-zebra">
<tr>
<th rowspan="4"></th>
<th rowspan="4">Primitives</th>
<td><a href="https://rocm.docs.amd.com/projects/hipCUB/en/docs-6.1.5">hipCUB</a></td>
<td>3.1.0</td>
<td><a href="https://github.com/ROCm/hipCUB/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipTensor/en/docs-6.1.5">hipTensor</a></td>
<td>1.2.0</td>
<td><a href="https://github.com/ROCm/hipTensor/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocPRIM/en/docs-6.1.5">rocPRIM</a></td>
<td>3.1.0</td>
<td><a href="https://github.com/ROCm/rocPRIM/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocThrust/en/docs-6.1.5">rocThrust</a></td>
<td>3.0.1</td>
<td><a href="https://github.com/ROCm/rocThrust/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-tools rocm-components-system tbody-reverse-zebra">
<tr>
<th rowspan="5">Tools</th>
<th rowspan="5">System management</th>
<td><a href="https://rocm.docs.amd.com/projects/amdsmi/en/docs-6.1.5">AMD SMI</a></td>
<td>24.5.1</td>
<td><a href="https://github.com/ROCm/amdsmi/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocminfo/en/docs-6.1.5">rocminfo</a></td>
<td>1.0.0</td>
<td><a href="https://github.com/ROCm/rocminfo/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rdc/en/docs-6.1.5">ROCm Data Center Tool</a></td>
<td>0.3.0</td>
<td><a href="https://github.com/ROCm/rdc/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocm_smi_lib/en/docs-6.1.5">ROCm SMI</a></td>
<td>7.2.0</td>
<td><a href="https://github.com/ROCm/rocm_smi_lib/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/docs-6.1.5">ROCm Validation Suite</a></td>
<td>1.0.0</td>
<td><a href="https://github.com/ROCm/ROCmValidationSuite/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-tools rocm-components-perf">
<tr>
<th rowspan="3"></th>
<th rowspan="3">Performance</th>
<td><a href="https://rocm.docs.amd.com/projects/rocm_bandwidth_test/en/docs-6.1.5">ROCm Bandwidth
Test</a></td>
<td>1.4.0</td>
<td><a href="https://github.com/ROCm/rocm_bandwidth_test/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.1.5/">ROCProfiler</a></td>
<td>2.0.0</td>
<td><a href="https://github.com/ROCm/ROCProfiler/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/roctracer/en/docs-6.1.5/">ROCTracer</a></td>
<td>4.1.0</td>
<td><a href="https://github.com/ROCm/ROCTracer/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-tools rocm-components-dev tbody-reverse-zebra">
<tr>
<th rowspan="5"></th>
<th rowspan="5">Development</th>
<td><a href="https://rocm.docs.amd.com/projects/HIPIFY/en/docs-6.1.5/">HIPIFY</a></td>
<td>17.0.0</td>
<td><a href="https://github.com/ROCm/HIPIFY/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCdbgapi/en/docs-6.1.5">ROCdbgapi</a></td>
<td>0.71.0</td>
<td><a href="https://github.com/ROCm/ROCdbgapi/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCmCMakeBuildTools/en/docs-6.1.5/">ROCm CMake</a></td>
<td>0.12.0</td>
<td><a href="https://github.com/ROCm/rocm-cmake/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCgdb/en/docs-6.1.5">ROCm Debugger (ROCgdb)</a>
</td>
<td>14.1</td>
<td><a href="https://github.com/ROCm/ROCgdb/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocr_debug_agent/en/docs-6.1.5">ROCr Debug Agent</a>
</td>
<td>2.0.3</td>
<td><a href="https://github.com/ROCm/rocr_debug_agent/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-compilers">
<tr>
<th rowspan="2" colspan="2">Compilers</th>
<td><a href="https://rocm.docs.amd.com/projects/HIPCC/en/docs-6.1.5">HIPCC</a></td>
<td>1.0.0</td>
<td><a href="https://github.com/ROCm/llvm-project/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://github.com/ROCm/llvm-project/">llvm-project</a></td>
<td>17.0.0</td>
<td><a href="https://github.com/ROCm/llvm-project/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tbody class="rocm-components-runtimes">
<tr>
<th rowspan="2" colspan="2">Runtimes</th>
<td><a href="https://rocm.docs.amd.com/projects/HIP/en/docs-6.1.5">HIP</a></td>
<td>6.1</td>
<td><a href="https://github.com/ROCm/HIP/releases/tag/rocm-6.1.5"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCR-Runtime/en/docs-6.1.5">ROCr Runtime</a></td>
<td>1.13.0</td>
<td><a href="https://github.com/ROCm/ROCR-Runtime"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
</table>
</div>
## ROCm known issues
ROCm known issues are tracked on [GitHub](https://github.com/ROCm/ROCm/labels/Verified%20Issue).
## ROCm upcoming changes
#### Changes
* Reduced `NCCL_TOPO_MAX_NODES` to limit stack usage and avoid stack overflow.
### rocBLAS
rocBLAS 4.1.2 for ROCm 6.1.2
#### Optimizations
* Tuned BBS TN and TT operations on the CDNA3 architecture.
#### Fixes
* Fixed an issue related to obtaining solutions for BF16 TT operations.
### rocDecode
rocDecode 0.6.0 for ROCm 6.1.2
#### Additions
* Added support for FFmpeg v5.x.
#### Optimizations
* Updated error checking in the `rocDecode-setup.py` script.
#### Changes
* Updated core dependencies.
* Updated to support the use of public LibVA headers.
#### Fixes
* Fixed some package dependencies.
## Upcoming changes
* A future release will enable the use of the HIPCC compiled binaries `hipcc.bin` and `hipconfig.bin` by default. No action is needed by users; you may continue calling the high-level Perl scripts `hipcc` and `hipconfig`, which will invoke `hipcc.bin` and `hipconfig.bin`. To revert to the previous behavior and invoke `hipcc.pl` and `hipconfig.pl`, set the `HIP_USE_PERL_SCRIPTS` environment variable to `1`.
* A subsequent release will remove the high-level HIPCC Perl scripts `hipcc` and `hipconfig`. That release will also remove the `HIP_USE_PERL_SCRIPTS` environment variable and rename `hipcc.bin` and `hipconfig.bin` to `hipcc` and `hipconfig`, respectively. No action is needed by users. To revert to the previous behavior, invoke `hipcc.pl` and `hipconfig.pl` explicitly.

View File

@@ -77,7 +77,8 @@ Obtain the value of `gpu-arch` by running the following command:
[//]: # (dated link below, needs updating)
See the complete list of [compiler command-line references](https://github.com/ROCm/llvm-project/blob/amd-staging/openmp/docs/CommandLineArgumentReference.rst).
See the complete list of compiler command-line references
[here](https://github.com/ROCm/llvm-project/blob/amd-stg-open/clang/docs/CommandGuide/clang.rst).
### Using `rocprof` with OpenMP

View File

@@ -17,7 +17,7 @@ following section.
## ROCm component licenses
ROCm is released by Advanced Micro Devices, Inc. (AMD) and is licensed per component separately.
ROCm is released by Advanced Micro Devices, Inc. and is licensed per component separately.
The following table is a list of ROCm components with links to their respective license
terms. These components may include third party components subject to
additional licenses. Please review individual repositories for more information.
@@ -25,71 +25,66 @@ additional licenses. Please review individual repositories for more information.
<!-- spellcheck-disable -->
| Component | License |
|:---------------------|:-------------------------|
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
| [AMDMIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
| [AMD Common Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/develop/LICENCE) |
| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
| [hipamd](https://github.com/ROCm/clr/tree/develop/hipamd) | [MIT](https://github.com/ROCm/clr/blob/develop/hipamd/LICENSE.txt) |
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/develop/opencl) | [MIT](https://github.com/ROCm/clr/blob/develop/opencl/LICENSE.txt) |
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
| [clang-ocl](https://github.com/ROCm/clang-ocl/) | [MIT](https://github.com/ROCm/clang-ocl/blob/master/LICENSE) |
| [HIPCC](https://github.com/ROCm/HIPCC/blob/develop/LICENSE.txt) | [MIT](https://github.com/ROCm/HIPCC/blob/develop/LICENSE.txt) |
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
| [MIOpenGEMM](https://github.com/ROCm/MIOpenGEMM/) | [MIT](https://github.com/ROCm/MIOpenGEMM/blob/master/LICENSE.txt) |
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/master/LICENSE.txt) |
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/master/LICENSE.txt) |
| [RCP](https://github.com/GPUOpen-Tools/radeon_compute_profiler/) | [MIT](https://github.com/GPUOpen-Tools/radeon_compute_profiler/blob/master/LICENSE) |
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/master/LICENSE.txt) |
| [ROCR Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
| [ROCclr](https://github.com/ROCm/ROCclr/) | [MIT](https://github.com/ROCm/ROCclr/blob/develop/LICENSE.txt) |
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-master/LICENSE.txt) |
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
| [ROCm-CompilerSupport](https://github.com/ROCm/ROCm-CompilerSupport/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCm-CompilerSupport/blob/amd-stg-open/LICENSE.txt) |
| [ROCm-Device-Libs](https://github.com/ROCm/ROCm-Device-Libs/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCm-Device-Libs/blob/amd-stg-open/LICENSE.TXT) |
| [ROCm-OpenCL-Runtime/api/opencl/khronos/icd](https://github.com/KhronosGroup/OpenCL-ICD-Loader/) | [Apache 2.0](https://github.com/KhronosGroup/OpenCL-ICD-Loader/blob/main/LICENSE) |
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/ROCm-OpenCL-Runtime/) | [MIT](https://github.com/ROCm/ROCm-OpenCL-Runtime/blob/develop/LICENSE.txt) |
| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
| [atmi](https://github.com/ROCm/atmi/) | [MIT](https://github.com/ROCm/atmi/blob/master/LICENSE.txt) |
| [clang-ocl](https://github.com/ROCm/clang-ocl/) | [MIT](https://github.com/ROCm/clang-ocl/blob/master/LICENSE) |
| [flang](https://github.com/ROCm/flang/) | [Apache 2.0](https://github.com/ROCm/flang/blob/master/LICENSE.txt) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/master/LICENSE.txt) |
| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
| [hipFORT](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
| [hipamd](https://github.com/ROCm/hipamd/) | [MIT](https://github.com/ROCm/hipamd/blob/develop/LICENSE.txt) |
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/master/LICENSE) |
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/main/LICENSE.TXT) |
| [rccl](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
| [rdc](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/master/LICENSE) |
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/develop/LICENSE) |
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/develop/License.txt) |
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/develop/LICENSE) |
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |
| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
| hsa-amd-aqlprofile | [AMD Software EULA](https://www.amd.com/en/legal/eula/amd-software-eula.html) |
| [rocm-cmake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
| [rocm_bandwidth_test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
| [rocm_smi_lib](https://github.com/ROCm/rocm_smi_lib/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_smi_lib/blob/master/License.txt) |
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/master/License.txt) |
| [rocprofiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
| [rocr_debug_agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/master/LICENSE.txt) |
| [roctracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
| rocm-llvm-alt | [AMD Proprietary License](https://www.amd.com/en/support/amd-software-eula) |
Open sourced ROCm components are released via public GitHub
repositories, packages on [https://repo.radeon.com](https://repo.radeon.com) and other distribution channels.
Proprietary products are only available on [https://repo.radeon.com](https://repo.radeon.com). Currently, only
one component of ROCm, `rocm-llvm-alt` is governed by a proprietary license.
repositories, packages on https://repo.radeon.com and other distribution channels.
Proprietary products are only available on https://repo.radeon.com. Currently, only
one component of ROCm, rocm-llvm-alt is governed by a proprietary license.
Proprietary components are organized in a proprietary subdirectory in the package
repositories to distinguish from open sourced packages.
@@ -97,7 +92,7 @@ repositories to distinguish from open sourced packages.
The following additional terms and conditions apply to your use of ROCm technical documentation.
```
©2023 - 2024 Advanced Micro Devices, Inc. All rights reserved.
©2023 Advanced Micro Devices, Inc. All rights reserved.
The information presented in this document is for informational purposes only
and may contain technical inaccuracies, omissions, and typographical errors. The
@@ -130,8 +125,8 @@ companies.
:::{attention}
AQL Profiler and AOCC CPU optimization are both provided in binary form, each
subject to the license agreement enclosed in the directory for the binary available
in `/opt/rocm/share/doc/hsa-amd-aqlprofile/EULA`. By using, installing,
subject to the license agreement enclosed in the directory for the binary and is
available here: `/opt/rocm/share/doc/rocm-llvm-alt/EULA`. By using, installing,
copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
the terms and conditions of this license agreement. If you do not agree to the
terms of this agreement, do not install, copy or use the AQL Profiler and/or the
@@ -139,8 +134,9 @@ AOCC CPU Optimizations.
:::
For the rest of the ROCm packages, you can find the licensing information at the
following location: `/opt/rocm/share/doc/<component-name>/` or in the locations
specified in the preceding table.
following location: `/opt/rocm/share/doc/<component-name>/`
For example, you can fetch the licensing information of the `amd_comgr`
component (Code Object Manager) from the `/opt/rocm/share/doc/amd_comgr/LICENSE.txt` file.
For example, you can fetch the licensing information of the `_amd_comgr_`
component (Code Object Manager) from the `amd_comgr` folder. A file named
`LICENSE.txt` contains the license details at:
`/opt/rocm-5.4.3/share/doc/amd_comgr/LICENSE.txt`

View File

@@ -8,121 +8,119 @@ Compatibility matrix
Use this matrix to view the ROCm compatibility across successive major and minor releases.
.. container:: format-big-table
.. csv-table::
:header: "ROCm Version", "6.1.5", "6.1.2", "6.0.0"
:header: "ROCm Version", "6.1.0", "6.0.0"
:stub-columns: 1
:doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`,"Ubuntu 22.04.5 [#Ubuntu220405]_, 22.04.4, 22.04.3","Ubuntu 22.04.5 [#Ubuntu220405]_, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 9.4 [#red-hat94]_, 9.3, 9.2","RHEL 9.4 [#red-hat94]_, 9.3, 9.2","RHEL 9.3, 9.2"
,"RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,"SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,,CentOS 7.9,CentOS 7.9
,Oracle Linux 8.9 [#oracle89]_,Oracle Linux 8.9 [#oracle89]_,
,.. _architecture-support-compatibility-matrix:,,
:doc:`GFX Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
,CDNA2,CDNA2,CDNA2
,CDNA,CDNA,CDNA
,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix:,,
:doc:`GFX Card <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100
,gfx1030,gfx1030,gfx1030
,gfx942 [#mi300_612]_,gfx942 [#mi300_612]_,gfx942 [#mi300_600]_
,gfx90a,gfx90a,gfx90a
,gfx908,gfx908,gfx908
,,,
ECOSYSTEM SUPPORT,.. _framework-support-compatibility-matrix:,,
:doc:`PyTorch <rocm-install-on-linux:install/3rd-party/pytorch-install>`,"2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <rocm-install-on-linux:install/3rd-party/tensorflow-install>`,"2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <rocm-install-on-linux:install/3rd-party/jax-install>`,0.4.26,0.4.26,0.4.26
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.17.3,1.14.1
,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.14.1,>=1.14.1,>=1.14.1
,,,
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
Thrust,2.1.0,2.1.0,2.0.1
CUB,2.1.0,2.1.0,2.0.1
,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.9.0,2.9.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.1.0,3.1.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,2.5.0,2.5.0,2.5.0
:doc:`rocDecode <rocdecode:index>`,0.6.0,0.6.0,N/A
:doc:`RPP <rpp:index>`,1.5.0,1.5.0,1.4.0
,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
:doc:`RCCL <rccl:index>`,2.18.6,2.18.6,2.18.3
,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.1.0,2.1.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,0.7.0,0.7.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.14,1.0.14,1.0.13
:doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,2.10.16,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,2.1.1,2.1.1,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,3.0.1,3.0.1,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.0,0.2.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,3.1.1,3.1.1,3.0.3
:doc:`rocBLAS <rocblas:index>`,4.1.2,4.1.2,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.27,1.0.27,1.0.23
:doc:`rocRAND <rocrand:index>`,3.0.1,3.0.1,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.25.0,3.25.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,3.1.2,3.1.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,1.4.0,1.4.0,1.3.0
`Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.40.0,4.39.0
,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
:doc:`hipCUB <hipcub:index>`,3.1.0,3.1.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,1.2.0,1.2.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,3.1.0,3.1.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.1,3.0.0
,,,
SUPPORT LIBS,,,
`hipother <https://github.com/ROCm/hipother>`_,6.1.40093,6.1.40093,6.1.32830
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.12.0,0.12.0,0.11.0
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.1.5,6.1.2,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240125.5.08,20240125.5.08,20231016.2.245
,,,
TOOLS,.. _tools-support-compatibility-matrix:,,
:doc:`AMD SMI <amdsmi:index>`,24.5.1,24.5.1,23.4.2
:doc:`HIPIFY <hipify:index>`,17.0.0.24193,17.0.0.24193,17.0.0.23483
:doc:`ROCdbgapi <rocdbgapi:index>`,0.71.0,0.71.0,0.71.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60105,2.0.60102,2.0.60000
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.3.0,0.3.0,N/A
:doc:`ROCTracer <roctracer:index>`,4.1.60105,4.1.60102,4.1.60000
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,14.1.0,14.1.0,13.2.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.2.0,7.2.0,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.1.5,rocm-6.1.2,rocm-6.0.0
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3,2.0.3
,,,
COMPILERS,.. _compilers-support-compatibility-matrix:,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,0.5.0,0.5.0,0.5.0
:doc:`hipCC <hipcc:index>`,1.0.0,1.0.0,1.0.0
`Flang <https://github.com/ROCm/flang>`_,17.0.0.24193,17.0.0.24193,17.0.0.23483
:doc:`llvm-project <llvm-project:index>`,17.0.0.24193,17.0.0.24193,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,17.0.0.24193,17.0.0.24193,17.0.0.23483
,,,
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
:doc:`AMD CLR <hip:understand/amd_clr>`,6.1.40093,6.1.40093,6.1.32830
:doc:`HIP <hip:index>`,6.1.40093,6.1.40093,6.1.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
:doc:`ROCR-Runtime <rocr-runtime:index>`,1.13.0,1.13.0,1.12.0
:doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`, "Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 9.3, 9.2","RHEL 9.3, 9.2"
,"RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,"SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,CentOS 7.9,CentOS 7.9
,,
:doc:`GFX Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3
,CDNA2,CDNA2
,CDNA,CDNA
,RDNA3,RDNA3
,RDNA2,RDNA2
,,
:doc:`GFX Card <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100
,gfx1030,gfx1030
,gfx942 [#]_, gfx942 [#]_
,gfx90a,gfx90a
,gfx908,gfx908
,,
ECOSYSTEM SUPPORT:,,
:doc:`PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`,"2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`Tensorflow <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`,"2.15, 2.14, 2.13","2.14, 2.13, 2.12"
:doc:`JAX <rocm-install-on-linux:how-to/3rd-party/jax-install>`,0.4.26,0.4.26
`ONNX-RT <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.14.1
,,
3RD PARTY COMMUNICATION LIBS:,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.2.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.14.1,>=1.14.1
,,
3RD PARTY ALGORITHM LIBS:,,
Thrust,2.1.0,2.0.1
CUB,2.1.0,2.0.1
,,
ML & COMPUTER VISION LIBS:,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.9.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.1.0,3.0.0
:doc:`MIVisionX <mivisionx:doxygen/html/index>`,2.5.0,2.5.0
:doc:`rocDecode <rocdecode:index>`,0.5.0,N/A
:doc:`RPP <rpp:index>`,1.5.0,1.4.0
,,
COMMUNICATION:,,
:doc:`rccl <rccl:index>`,2.18.6,2.18.3
,,
MATH LIBS:,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.1.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,0.7.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.14,1.0.13
:doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,2.1.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,3.0.1,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,3.1.1,3.0.3
:doc:`rocBLAS <rocblas:index>`,4.1.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.27,1.0.23
:doc:`rocRAND <rocrand:index>`,3.0.1,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.25.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,3.1.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,1.4.0,1.3.0
`Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.39.0
,,
PRIMITIVES:,,
:doc:`hipCUB <hipcub:index>`,3.1.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,1.2.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,3.1.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.0
,,
SUPPORT LIBS:,,
`hipother <https://github.com/ROCm/hipother>`_,6.1.40091,6.0.32830
`rocm-cmake <https://github.com/ROCm/rocm-cmake>`_,0.12.0,0.11.0
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.1.0,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240125.3.30,20231016.2.245
,,
TOOLS:,,
:doc:`AMD SMI <amdsmi:index>`,24.4.1,23.4.2
:doc:`HIPIFY <hipify:index>`,17.0.0,17.0.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.71.0,0.71.0
`ROCdebug-Agent <https://github.com/ROCm/rocr_debug_agent>`_,2.0.3,2.0.3
:doc:`rocGDB <rocgdb:index>`,14.1.0,13.2.0
:doc:`rocProfiler <rocprofiler:profiler_home_page>`,2.0.60100,2.0.0
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.3.0,N/A
:doc:`rocTracer <roctracer:index>`,4.1.60100,4.1.0
`rocm_bandwidth_test <https://github.com/ROCm/rocm_bandwidth_test>`_,1.4.0,1.4.0
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0
`rocminfo <https://github.com/ROCm/rocminfo>`_,1.0.0,1.0.0
:doc:`ROCm SMI Lib <rocm_smi_lib:index>`,7.0.0,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.1.0,rocm-6.0.0
:doc:`TransferBench <transferbench:index>`,1.48,1.46
,,
COMPILERS:,,
`AOMP <https://github.com/ROCm/aomp>`_,17.60.0,17.60.0
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,0.5.0,0.5.0
`Flang <https://github.com/ROCm/flang>`_,17.0.0.24103,17.0.0.23483
`llvm-project <https://github.com/ROCm/llvm-project>`_,17.0.0.24103,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,17.0.0.24103,17.0.0.23483
,,
RUNTIMES:,,
:doc:`HIP <hip:index>`,6.1.40091,6.0.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0
`ROCR Runtime <https://github.com/ROCm/ROCR-Runtime>`_,1.13.0,1.12.0
.. rubric:: Footnotes
.. [#] **For ROCm 6.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.3 & 8.9 and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#] **For ROCm 6.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9 and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#Ubuntu220405] Preview support of Ubuntu 22.04.5 only.
.. [#red-hat94] **For ROCm 6.1** - RHEL 9.4 is supported only on AMD Instinct MI300A.
.. [#oracle89] **For ROCm 6.1.1** - Oracle Linux is supported only on AMD Instinct MI300X.
.. [#mi300_612] **For ROCm 6.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#mi300_600] **For ROCm 6.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9 and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.

View File

@@ -416,7 +416,7 @@ description, refer to the corresponding library data type support page.
- -/✅
- -/✅
*
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
- hipRAND (:doc:`details <hiprand:data-type-support>`)
- -/✅
- -/✅
- -/✅
@@ -428,7 +428,7 @@ description, refer to the corresponding library data type support page.
- ✅/✅
- ✅/✅
*
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
- hipCUB (:doc:`details <hipcub:data-type-support>`)
- ✅/✅
- ✅/✅
- ✅/✅
@@ -474,7 +474,7 @@ description, refer to the corresponding library data type support page.
- -/✅
- -/✅
*
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
- hipRAND (:doc:`details <hiprand:data-type-support>`)
- -/❌
- -/❌
- -/✅
@@ -492,7 +492,7 @@ description, refer to the corresponding library data type support page.
- ✅/✅
- ✅/✅
*
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
- hipCUB (:doc:`details <hipcub:data-type-support>`)
- ❌/❌
- ❌/❌
- ✅/✅

View File

@@ -33,8 +33,8 @@ Units (CU). The MI250 GCD has 104 active CUs. Each compute unit is further
subdivided into four SIMD units that process SIMD instructions of 16 data
elements per instruction (for the FP64 data type). This enables the CU to
process 64 work items (a so-called “wavefront”) at a peak clock frequency of 1.7
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 22.6
TFLOPS for vector instructions. This equates to 45.3 TFLOPS for vector instructions for both GCDs together. The MI250 compute units also provide specialized
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 45.3
TFLOPS for vector instructions. The MI250 compute units also provide specialized
execution units (also called matrix cores), which are geared toward executing
matrix operations like matrix-matrix multiplications. For FP64, the peak
performance of these units amounts to 90.5 TFLOPS.
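As a quick sanity check (this arithmetic is ours, not part of the original page), the per-GCD vector figure follows directly from the counts quoted above: 104 CUs, 4 SIMD units of 16 FP64 lanes each, 2 FLOPs per FMA, and a 1.7 GHz peak clock.

```math
104 \times 4 \times 16 \times 2\,\mathrm{FLOP} \times 1.7\,\mathrm{GHz} \approx 22.6\;\mathrm{TFLOPS\ per\ GCD},
\qquad 2 \times 22.6 \approx 45.3\;\mathrm{TFLOPS\ per\ MI250}.
```

Doubling for the two GCDs gives the 45.3 TFLOPS figure for the full MI250, and the matrix-core figure of 90.5 TFLOPS is four times the per-GCD vector rate.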

View File

@@ -10,7 +10,7 @@ GPU computational elements of the processor along with the lower levels of the c
The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.
```{figure} ../../data/shared/xcd-sys-arch.png
```{figure} ../../data/conceptual/gpu-arch/image007.png
---
name: mi300-xcd
align: center
@@ -103,7 +103,7 @@ MI300 series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, w
## Node-level architecture
```{figure} ../../data/shared/mi300-node-level-arch.png
```{figure} ../../data/conceptual/gpu-arch/image009.png
---
name: mi300-node

View File

@@ -51,7 +51,7 @@ In HIP, pinned memory allocations are coherent by default (`hipHostMallocDefault
There are additional pinned memory flags (e.g. `hipHostMallocMapped` and `hipHostMallocPortable`).
On MI200 these options do not impact performance.
<!-- TODO: link to programming_manual#memory-allocation-flags -->
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:how-to/programming_manual`.
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:user_guide/programming_manual`.
:::
Much like how a process can be locked to a CPU core by setting affinity, a pinned memory allocator does this with the memory storage system.
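As a small illustration of the allocation flags discussed in the note above, the sketch below pins a host buffer with `hipHostMalloc`. It is a minimal example of ours, not taken from the original page; the buffer size and the plain `hipMemcpy` are arbitrary choices, and error handling is reduced to a single check.

```cpp
// Minimal sketch: pinned (page-locked) host memory in HIP.
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    const size_t bytes = 1 << 20;   // 1 MiB, arbitrary example size
    void* host_ptr = nullptr;
    void* dev_ptr  = nullptr;

    // Coherent pinned allocation; hipHostMallocDefault is implied when no
    // flags are passed. hipHostMallocMapped additionally maps the buffer
    // into the device address space (no performance impact on MI200).
    if (hipHostMalloc(&host_ptr, bytes, hipHostMallocMapped) != hipSuccess) {
        fprintf(stderr, "hipHostMalloc failed\n");
        return 1;
    }
    hipMalloc(&dev_ptr, bytes);

    // Copies from pinned memory can be performed by DMA without an extra
    // staging copy through pageable host memory.
    hipMemcpy(dev_ptr, host_ptr, bytes, hipMemcpyHostToDevice);

    hipFree(dev_ptr);
    hipHostFree(host_ptr);
    return 0;
}
```

The flag choice here is only to show the call shape; as the note says, on MI200 the additional flags do not change performance.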

View File

@@ -0,0 +1,47 @@
.. meta::
   :description: Setting the number of CUs
   :keywords: AMD, ROCm, cu, number of cus
.. _env-variables-reference:
*************************************************************
Setting the number of CUs
*************************************************************
When using GPUs to accelerate compute workloads, it sometimes becomes necessary
to configure the hardware's usage of Compute Units (CU). This is a more advanced
option, so please read this page before experimentation.
The GPU driver provides two environment variables to set the number of CUs used. The
first one is ``HSA_CU_MASK`` and the second one is ``ROC_GLOBAL_CU_MASK``. The main
difference is that ``ROC_GLOBAL_CU_MASK`` sets the CU mask on queues created by the HIP
or the OpenCL runtimes, while ``HSA_CU_MASK`` sets the mask at a lower level of queue
creation in the driver; a mask set through ``HSA_CU_MASK`` also applies to queues being profiled.
The environment variables have the following syntax:
::
   ID       = [0-9][0-9]*                      ex. base 10 numbers
   ID_list  = (ID | ID-ID)[, (ID | ID-ID)]*    ex. 0,2-4,7
   GPU_list = ID_list                          ex. 0,2-4,7
   CU_list  = 0x[0-F]* | ID_list               ex. 0x337F OR 0,2-4,7
   CU_Set   = GPU_list : CU_list               ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F
   HSA_CU_MASK = CU_Set [; CU_Set]*            ex. 0,2-4,7:0-15,32-47; 3-9:0x337F
The GPU indices are taken after ``ROCR_VISIBLE_DEVICES`` reordering. For the GPUs listed,
the listed or masked CUs are enabled and the rest are disabled. GPUs that are not listed
are unaffected; all of their CUs remain enabled.
Parsing of the variable stops when a syntax error occurs; the erroneous set and any sets
that follow it are ignored. Repeated GPU or CU IDs are a syntax error, as is specifying
a mask with no usable CUs (a CU_list of 0x0). To exclude entire GPU devices, use
``ROCR_VISIBLE_DEVICES``.
These environment variables only affect ROCm software, not graphics applications.
It's important to know that not all CU configurations are valid on all devices. For
instance, on devices where two CUs can be combined into a WGP (for kernels running in
WGP mode), it is not valid to disable only a single CU in a WGP. `This paper
<https://www.cs.unc.edu/~otternes/papers/rtsj2022.pdf>`_ can provide more information
about what to expect when disabling CUs.

View File

@@ -13,9 +13,7 @@ This document provides documentation on using ROCm ASan.
For information about LLVM ASan, see the [LLVM documentation](https://clang.llvm.org/docs/AddressSanitizer.html).
:::{note}
The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
:::
**Note:** The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
## Compiling for ASan
@@ -36,13 +34,9 @@ Recommendations for doing this are:
Other architectures are allowed, but their device code will not be instrumented and a warning will be emitted.
:::{tip}
It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
:::
**Note:** It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
:::{note}
When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
:::
**Note:** When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
### About compilation time
@@ -98,23 +92,15 @@ If it does not appear, when executed the application will quickly output an ASan
There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASan runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. It must be noted that these options only affect the host ASan runtime. The device runtime only currently supports the default settings for the few relevant options.
There are three `ASAN_OPTIONS` flags of note.
There are two `ASAN_OPTIONS` flags of particular note.
* `halt_on_error=0/1 default 1`.
This tells the ASan runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option and if an error is detected within one of them, but halt_on_error is set to 0, more undefined behavior will occur.
* `detect_leaks=0/1 default 1`.
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSan). For heterogeneous applications, this default results in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSan suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
* `quarantine_size_mb=N default 256`
This option defines the number of megabytes (MB) `N` of memory that the ASan runtime will hold after it is `freed` to detect use-after-free situations. This memory is unavailable for other purposes. The default of 256 MB may be too small to detect some use-after-free situations, especially given that the large size of many GPU memory allocations may push `freed` allocations out of quarantine before the attempted use.
:::{note}
Setting the value of `quarantine_size_mb` larger may enable more problematic uses to be detected, but at the cost of reducing memory available for other purposes.
:::
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSan). Unfortunately, for heterogeneous applications, this default will result in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSan suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
## Runtime overhead
@@ -200,7 +186,7 @@ or
currently may include one or two surprising CPU side tracebacks mentioning "hostcall". This is due to how `malloc` and `free` are implemented for GPU code and these call stacks can be ignored.
## Running ASan with `rocgdb`
### Running with `rocgdb`
`rocgdb` can be used to further investigate ASan detected errors, with some preparation.
@@ -252,7 +238,7 @@ $ rocgdb <path to application>
(gdb) c
```
## Using ASan with a short HIP application
### Using ASan with a short HIP application
Consider the following simple and short demo of using the Address Sanitizer with a HIP application:
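The demo itself is elided from this diff view. As a rough stand-in (our sketch, not the original demo), a deliberately broken HIP program of the kind ASan catches might look like the following; it assumes the program is built with the ASan instrumentation options described earlier on this page:

```cpp
// Sketch only: the last thread writes one element past the end of a device
// allocation, which an ASan-instrumented build should report as a
// heap-buffer-overflow instead of silently corrupting memory.
#include <hip/hip_runtime.h>

__global__ void write_past_end(int* data) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    data[i + 1] = i;   // off-by-one for the highest thread index
}

int main() {
    const int n = 256;
    int* d = nullptr;
    hipMalloc(&d, n * sizeof(int));   // red-zoned when ASan is active
    write_past_end<<<1, n>>>(d);      // expect an ASan report here
    hipDeviceSynchronize();
    hipFree(d);
    return 0;
}
```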
@@ -416,7 +402,7 @@ Shadow byte legend (one shadow byte represents 8 application bytes):
==2817==ABORTING
```
## Known issues with using GPU sanitizer
### Known issues with using GPU sanitizer
* Red zones must have limited size. It is possible for an invalid access to completely miss a red zone and not be detected.
@@ -424,8 +410,4 @@ Shadow byte legend (one shadow byte represents 8 application bytes):
* Lack of detection on the GPU might also be due to the implementation not instrumenting accesses to all GPU specific address spaces. For example, in the current implementation accesses to "private" or "stack" variables on the GPU are not instrumented, and accesses to HIP shared variables (also known as "local data store" or "LDS") are also not instrumented.
* It can also be the case that a memory fault is reported for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
* There is currently a bug which can result in memory faults being reported when running instrumented device code which makes use of `malloc`, `free`, `new`, or `delete`.
* There is currently a bug which can result in undefined symbols being reported at compile time when instrumented device code makes use of `new` and `delete`.
* It can also be the case that a memory fault is hit for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.

View File

@@ -5,9 +5,25 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import shutil
import jinja2
import os
# Environment to process Jinja templates.
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
# Jinja templates to render out.
templates = []
# Render templates and output files without the last extension.
# For example: 'install.md.jinja' becomes 'install.md'.
for template in templates:
    rendered = jinja_env.get_template(template).render()
    with open(os.path.splitext(template)[0], 'w') as file:
        file.write(rendered)
shutil.copy2('../RELEASE.md','./about/release-notes.md')
# Keep capitalization due to similar linking on GitHub's markdown preview.
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
shutil.copy2('../CHANGELOG.md','./about/changelog.md')
latex_engine = "xelatex"
latex_elements = {
@@ -21,71 +37,58 @@ latex_elements = {
# configurations for PDF output by Read the Docs
project = "ROCm Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
version = "6.1.5"
release = "6.1.5"
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = "6.1.2"
release = "6.1.2"
setting_all_article_info = True
all_article_info_os = ["linux", "windows"]
all_article_info_author = ""
# pages with specific settings
article_pages = [
{"file": "about/release-notes", "os": ["linux"], "date": "2025-03-04"},
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/train-a-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/deploy-your-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/hugging-face-models", "os": ["linux"]},
{"file": "how-to/rocm-for-hpc/index", "os": ["linux"]},
{"file": "how-to/llm-fine-tuning-optimization/index", "os": ["linux"]},
{"file": "how-to/llm-fine-tuning-optimization/overview", "os": ["linux"]},
{
"file": "how-to/llm-fine-tuning-optimization/fine-tuning-and-inference",
"os": ["linux"],
"file":"about/release-notes",
"os":["linux", "windows"],
"date":"2024-06-04"
},
{
"file": "how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference",
"os": ["linux"],
"file":"about/changelog",
"os":["linux", "windows"],
"date":"2024-06-04"
},
{
"file": "how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/llm-inference-frameworks",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/model-acceleration-libraries",
"os": ["linux"],
},
{"file": "how-to/llm-fine-tuning-optimization/model-quantization", "os": ["linux"]},
{
"file": "how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/optimizing-triton-kernel",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/profiling-and-debugging",
"os": ["linux"],
},
{"file": "how-to/system-optimization/index", "os": ["linux"]},
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
{"file": "how-to/system-debugging", "os": ["linux"]},
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
{"file":"install/windows/install-quick", "os":["windows"]},
{"file":"install/linux/install-quick", "os":["linux"]},
{"file":"install/linux/install", "os":["linux"]},
{"file":"install/linux/install-options", "os":["linux"]},
{"file":"install/linux/prerequisites", "os":["linux"]},
{"file":"install/docker", "os":["linux"]},
{"file":"install/magma-install", "os":["linux"]},
{"file":"install/pytorch-install", "os":["linux"]},
{"file":"install/tensorflow-install", "os":["linux"]},
{"file":"install/windows/install", "os":["windows"]},
{"file":"install/windows/prerequisites", "os":["windows"]},
{"file":"install/windows/cli/index", "os":["windows"]},
{"file":"install/windows/gui/index", "os":["windows"]},
{"file":"about/compatibility/docker-image-support-matrix", "os":["linux"]},
{"file":"about/compatibility/user-kernel-space-compat-matrix", "os":["linux"]},
{"file":"reference/library-index", "os":["linux"]},
{"file":"how-to/deep-learning-rocm", "os":["linux"]},
{"file":"how-to/gpu-enabled-mpi", "os":["linux"]},
{"file":"how-to/system-debugging", "os":["linux"]},
{"file":"how-to/tuning-guides", "os":["linux", "windows"]},
{"file":"rocm-a-z", "os":["linux", "windows"]},
]
exclude_patterns = ['temp']
external_toc_path = "./sphinx/_toc.yml"
extensions = ["rocm_docs", "sphinx_reredirects"]
@@ -96,12 +99,14 @@ html_theme = "rocm_docs_theme"
html_theme_options = {"flavor": "rocm-docs-home"}
html_static_path = ["sphinx/static/css"]
html_css_files = ["rocm_custom.css", "rocm_rn.css"]
html_css_files = ["rocm_custom.css"]
html_title = "ROCm Documentation"
html_theme_options = {"link_main_doc": False}
html_theme_options = {
    "link_main_doc": False
}
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
numfig = False
redirects = {
    "reference/openmp/openmp": "../../about/compatibility/openmp.html"
}

View File

@@ -12,7 +12,8 @@ There are four standard ways to provide feedback on this repository.
All contributions to ROCm documentation should arrive via the
[GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)
targeting the develop branch of the repository.
targeting the develop branch of the repository. If you are unable to contribute
via the GitHub Flow, feel free to email us at [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
For more in-depth information on creating a pull request (PR), see
[Contributing](./contributing.md).
@@ -29,3 +30,7 @@ and follow along via public announcements.
Issues on existing or absent documentation can be filed in
[GitHub Issues](https://github.com/ROCm/ROCm/issues).
## Email
Send other feedback or questions to [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).

View File

(Image-only changes: the remaining entries in this diff are figure files under the documentation data directories. Most show identical sizes before and after, consistent with the figure path moves noted above; one new image of about 108 KiB is added and several older images are removed. The diff viewer displays only image dimensions and "Binary file not shown" for these files.)

View File

@@ -8,18 +8,48 @@ Installing deep learning frameworks for ROCm
ROCm provides a comprehensive ecosystem for deep learning development, including
:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
deep learning frameworks and libraries such as PyTorch, TensorFlow, and JAX. ROCm works closely with these
deep learning frameworks and libraries such as PyTorch, TensorFlow, JAX, and MAGMA. ROCm works closely with these
frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
The following guides cover installation processes for ROCm-aware deep learning frameworks.
* :doc:`PyTorch for ROCm <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
* :doc:`JAX for ROCm <rocm-install-on-linux:how-to/3rd-party/jax-install>`
.. grid::
.. grid-item::
:columns: 3
:doc:`PyTorch for ROCm <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
.. grid-item::
:columns: 3
:doc:`TensorFlow for ROCm <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
:doc:`JAX for ROCm <rocm-install-on-linux:how-to/3rd-party/jax-install>`
.. grid-item::
:columns: 3
:doc:`MAGMA for ROCm <rocm-install-on-linux:how-to/3rd-party/magma-install>`
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
The following chart steps through typical workflows for installing deep learning frameworks for ROCm.
.. image:: ../data/how-to/framework_install_2024_07_04.png
.. image:: ../data/how-to/framework_install_2024_05_23.png
:alt: Flowchart for installing ROCm-aware machine learning frameworks
:align: center
@@ -35,4 +65,4 @@ through the following guides.
* :doc:`rocm-for-ai/index`
* :doc:`llm-fine-tuning-optimization/index`
* :doc:`fine-tuning-llms/index`

View File

@@ -28,9 +28,18 @@ graphs, tensor parallel multi-GPU, GPTQ, AWQ, and token speculation.
Installing vLLM
---------------
1. To install vLLM, run the following commands.
.. code-block:: shell
# Install from source
git clone https://github.com/ROCm/vllm.git
cd vllm
PYTORCH_ROCM_ARCH=gfx942 python setup.py install #MI300 series
.. _fine-tuning-llms-vllm-rocm-docker-image:
1. Run the following commands to build a Docker image ``vllm-rocm``.
2. Run the following commands to build a Docker image ``vllm-rocm``.
.. code-block:: shell
@@ -43,7 +52,7 @@ Installing vLLM
.. tab-item:: vLLM on a single-accelerator system
:sync: single
2. To use vLLM as an API server to serve reference requests, first start a container using the :ref:`vllm-rocm
3. To use vLLM as an API server to serve reference requests, first start a container using the :ref:`vllm-rocm
Docker image <fine-tuning-llms-vllm-rocm-docker-image>`.
.. code-block:: shell
@@ -60,7 +69,7 @@ Installing vLLM
vllm-rocm \
bash
3. Inside the container, start the API server to run on a single accelerator on port 8000 using the following command.
4. Inside the container, start the API server to run on a single accelerator on port 8000 using the following command.
.. code-block:: shell
@@ -68,61 +77,10 @@ Installing vLLM
The following log message displayed in your command line indicates that the server is listening for requests.
.. image:: ../../data/how-to/llm-fine-tuning-optimization/vllm-single-gpu-log.png
.. image:: ../../data/how-to/fine-tuning-llms/vllm-single-gpu-log.png
:alt: vLLM API server log message
:align: center
4. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "What is AMD Instinct?", "max_tokens": 80, "temperature": 0.0 }'
You should receive a response like the following.
.. code-block:: text
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
.. tab-item:: vLLM on a multi-accelerator system
:sync: multi
2. To use vLLM as an API server to serve reference requests, first start a container using the :ref:`vllm-rocm
Docker image <fine-tuning-llms-vllm-rocm-docker-image>`.
.. code-block:: shell
docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
3. To run the API server on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` parameter. For example, to use two
GPUs, start the API server using the following command.
.. code-block:: shell
python -m vllm.entrypoints.api_server --model /app/model --dtype float16 -tp 2 --port 8000 &
4. To run multiple instances of the API server, specify different ports for each server, and use ``ROCR_VISIBLE_DEVICES`` to
isolate each instance to a different set of accelerators.
For example, to run two API servers, one on port 8000 using GPUs 0 and 1 and one on port 8001 using GPUs 2 and 3, use
a command like the following.
.. code-block:: shell
ROCR_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8000 &
ROCR_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8001 &
5. To test, send it a curl request containing a prompt.
.. code-block:: shell
@@ -134,8 +92,57 @@ Installing vLLM
.. code-block:: text
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
.. tab-item:: vLLM on a multi-accelerator system
:sync: multi
Refer to :ref:`mi300x-vllm-optimization` for performance optimization tips.
3. To use vLLM as an API server to serve reference requests, first start a container using the :ref:`vllm-rocm
Docker image <fine-tuning-llms-vllm-rocm-docker-image>`.
.. code-block:: shell
docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
4. To run the API server on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` parameter. For example, to use two
GPUs, start the API server using the following command.
.. code-block:: shell
python -m vllm.entrypoints.api_server --model /app/model --dtype float16 -tp 2 --port 8000 &
5. To run multiple instances of the API server, specify different ports for each server, and use ``ROCR_VISIBLE_DEVICES`` to
isolate each instance to a different set of accelerators.
For example, to run two API servers, one on port 8000 using GPUs 0 and 1 and one on port 8001 using GPUs 2 and 3, use
a command like the following.
.. code-block:: shell
ROCR_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8000 &
ROCR_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8001 &
6. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "What is AMD Instinct?", "max_tokens": 80, "temperature": 0.0 }'
You should receive a response like the following.
.. code-block:: text
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
.. _fine-tuning-llms-tgi:
@@ -156,29 +163,27 @@ speculation.
Install TGI
-----------
1. Launch the TGI Docker container in the host machine.
1. To install the TGI Docker image, run the following commands.
.. code-block:: shell
docker run --name tgi --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
--device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 256g
--net host -v $PWD:/data
--entrypoint "/bin/bash"
--env HUGGINGFACE_HUB_CACHE=/data
ghcr.io/huggingface/text-generation-inference:latest-rocm
# Install from Dockerfile
git clone https://github.com/huggingface/text-generation-inference.git -b mi300-compat
cd text-generation-inference
docker build . -f Dockerfile.rocm
.. tab-set::
.. tab-item:: TGI on a single-accelerator system
:sync: single
2. Inside the container, launch a model using TGI server on a single accelerator.
2. Launch a model using TGI server on a single accelerator.
.. code-block:: shell
export ROCM_USE_FLASH_ATTN_V2_TRITON=True
text-generation-launcher --model-id NousResearch/Meta-Llama-3-70B --dtype float16 --port 8000 &
3. To test, send it a curl request containing a prompt.
.. code-block:: shell
@@ -186,26 +191,26 @@ Install TGI
curl http://localhost:8000/generate_stream -X POST -d '{"inputs":"What is AMD Instinct?","parameters":{"max_new_tokens":20}}' -H 'Content-Type: application/json'
You should receive a response like the following.
.. code-block:: shell
data:{"index":20,"token":{"id":304,"text":" in","logprob":-1.2822266,"special":false},"generated_text":" AMD Instinct is a new family of data center GPUs designed to accelerate the most demanding workloads in","details":null}
.. tab-item:: TGI on a multi-accelerator system
2. Inside the container, launch a model using TGI server on multiple accelerators (4 in this case).
2. Launch a model using TGI server on multiple accelerators (4 in this case).
.. code-block:: shell
export ROCM_USE_FLASH_ATTN_V2_TRITON=True
text-generation-launcher --model-id NousResearch/Meta-Llama-3-8B --dtype float16 --port 8000 --num-shard 4 &
3. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate_stream -X POST -d '{"inputs":"What is AMD Instinct?","parameters":{"max_new_tokens":20}}' -H 'Content-Type: application/json'
You should receive a response like the following.
.. code-block:: shell

View File

@@ -8,8 +8,6 @@ Model acceleration libraries
This section discusses model acceleration techniques and libraries to improve memory efficiency and performance.
.. _acceleration-flash-attention:
Flash Attention 2
=================
@@ -20,7 +18,7 @@ Attention (GQA), and Multi-Query Attention (MQA). This reduction in memory movem
time-to-first-token (TTFT) latency for large batch sizes and long prompt sequences, thereby enhancing overall
performance.
.. image:: ../../data/how-to/llm-fine-tuning-optimization/attention-module.png
.. image:: ../../data/how-to/fine-tuning-llms/attention-module.png
:alt: Attention module of a large language module utilizing tiling
:align: center
@@ -245,7 +243,7 @@ page describes the options.
Validator,ROCBLAS_VERSION,4.1.0-cefa4a9b-dirty
GemmTunableOp_float_TN,tn_200_100_20,Gemm_Rocblas_32323,0.00669595
.. image:: ../../data/how-to/llm-fine-tuning-optimization/tunableop.png
.. image:: ../../data/how-to/fine-tuning-llms/tunableop.png
:alt: GEMM and TunableOp
:align: center

View File

@@ -154,12 +154,11 @@ kernels by configuring the ``exllama_config`` parameter as the following.
.. code-block:: python
from transformers import AutoModelForCausalLM, GPTQConfig
#pretrained_model_dir = "meta-llama/Llama-2-7b"
base_model_name = "NousResearch/Llama-2-7b-hf"
gptq_config = GPTQConfig(bits=4, dataset="c4", exllama_config={"version":2})
pretrained_model_dir = "meta-llama/Llama-2-7b"
gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
quantized_model = AutoModelForCausalLM.from_pretrained(
base_model_name,
device_map="auto",
base_model_name,
device_map="auto",
quantization_config=gptq_config)
bitsandbytes

View File

@@ -0,0 +1,388 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, Triton, kernel, performance, optimization
*************************
Optimizing Triton kernels
*************************
This section introduces the general steps for `Triton <https://openai.com/index/triton/>`_ kernel optimization. Broadly,
Triton kernel optimization is similar to HIP and CUDA kernel optimization.
.. _fine-tuning-llms-triton-memory-access-efficiency:
Memory access efficiency
========================
The accelerator or GPU contains global memory, local data share (LDS), and registers. Global memory has high access
latency, but is large. LDS access has much lower latency, but is smaller. Register access is the fastest yet smallest
among the three.
Therefore, data in global memory should be loaded and stored as few times as possible. If different threads in a workgroup
need to access the same data, that data should first be transferred from global memory to LDS and then accessed by the
threads in the workgroup.
.. _fine-tuning-llms-triton-hardware-resource-utilization:
Hardware resource utilization
=============================
Each accelerator or GPU has multiple compute units (CUs) that perform computation in parallel. So, how many CUs can a
compute kernel distribute its work across? For the :doc:`AMD MI300X accelerator <../../reference/gpu-arch-specs>`, the
grid should have at least 1024 thread blocks or workgroups.
.. figure:: ../../data/how-to/fine-tuning-llms/compute-unit.png
Schematic representation of a CU in the CDNA2 or CDNA3 architecture.
To increase hardware utilization and maximize parallelism, it is necessary to design algorithms that can exploit more
parallelism. One approach to achieving this is by using larger split-K techniques for General Matrix Multiply (GEMM)
operations, which can further distribute the computation across more CUs, thereby enhancing performance.
.. tip::
You can query hardware resources with the command ``rocminfo`` (in the ``/opt/rocm/bin`` directory). For instance,
query the number of CUs, number of SIMD, and wavefront size using the following commands.
.. code-block:: shell
rocminfo | grep "Compute Unit"
rocminfo | grep "SIMD"
rocminfo | grep "Wavefront Size"
On an MI300X device, there are 304 CUs, 4 SIMD per CU, and the wavefront size (warp size) is 64. See :doc:`Hardware
specifications <../../reference/gpu-arch-specs>` for a full list of AMD accelerators and GPUs.
.. _fine-tuning-llms-triton-ir-analysis:
IR analysis
===========
In Triton, there are several layouts including *blocked*, *shared*, *sliced*, and *MFMA*.
From the Triton GPU IR (intermediate representation), you can determine in which memory each computation is
performed. The following is a snippet of IR from the Flash Attention decode ``int4`` key-value program. It
de-quantizes the ``int4`` key-value data from the ``int4`` data type to ``fp16``.
.. code-block::
%190 = tt.load %189 {cache = 1 : i32, evict = 1 : i32, isVolatile =
false} : tensor<1x64xi32, #blocked6> loc(#loc159)
%266 = arith.andi %190, %cst_28 : tensor<1x64xi32, #blocked6>
loc(#loc250)
%267 = arith.trunci %266 : tensor<1x64xi32, #blocked6> to
tensor<1x64xi16, #blocked6> loc(#loc251)
%268 = tt.bitcast %267 : tensor<1x64xi16, #blocked6> -> tensor<1x64xf16,
#blocked6> loc(#loc252)
%269 = triton_gpu.convert_layout %268 : (tensor<1x64xf16, #blocked6>) ->
tensor<1x64xf16, #shared1> loc(#loc252)
%270 = tt.trans %269 : (tensor<1x64xf16, #shared1>) -> tensor<64x1xf16,
#shared2> loc(#loc194)
%276 = triton_gpu.convert_layout %270 : (tensor<64x1xf16, #shared2>) ->
tensor<64x1xf16, #blocked5> loc(#loc254)
%293 = arith.mulf %276, %cst_30 : tensor<64x1xf16, #blocked5>
loc(#loc254)
%295 = arith.mulf %292, %294 : tensor<64x32xf16, #blocked5> loc(#loc264)
%297 = arith.addf %295, %296 : tensor<64x32xf16, #blocked5> loc(#loc255)
%298 = triton_gpu.convert_layout %297 : (tensor<64x32xf16, #blocked5>)
-> tensor<64x32xf16, #shared1> loc(#loc255)
%299 = tt.trans %298 : (tensor<64x32xf16, #shared1>) ->
tensor<32x64xf16, #shared2> loc(#loc196)
%300 = triton_gpu.convert_layout %299 : (tensor<32x64xf16, #shared2>) ->
tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth
= 4}>> loc(#loc197)
From the IR, you can see that ``i32`` data is loaded from global memory to registers. After a few element-wise operations
in registers, it is stored in shared memory (LDS) for the transpose operation, which requires data movement across
different threads. Once the transpose is done, the data is loaded from LDS back into registers, and after a few more
element-wise operations, it is stored in LDS again. The last step loads it from LDS to registers and converts it to the
dot-operand layout. In short, this kernel uses LDS twice: once for the transpose, and once to convert the blocked layout
to a dot-operand layout.
Assembly analysis
=================
In the ISA, ensure ``global_load_dwordx4`` is used, especially when the
load happens in a loop.
In most cases, LDS loads and stores should use ``_b128`` as well to
minimize the number of LDS access instructions. Note that upstream (or the backend) might not have ``_b128`` LDS reads and
writes, in which case it uses ``_b64``. In most cases, whether you use the ROCm fork or upstream Triton,
LDS accesses should have a ``_b64`` vector width.
The AMD ISA has the ``s_waitcnt`` instruction to synchronize the dependency
of memory access and computations. The ``s_waitcnt`` instruction can
have two signals, typically in the context of Triton:
* ``lgkmcnt(n)``: `lgkm` stands for LDS, GDS, Constant, and Message.
In this context, it is most often related to LDS access. The number ``n`` is how many such accesses can still be
outstanding before execution continues. For example, ``0`` means all ``lgkm`` accesses must finish before continuing, and
``1`` means only one ``lgkm`` access can still be running asynchronously before proceeding.
* ``vmcnt(n)``: `vm` means vector memory.
This applies when vector memory is accessed, for example, when a global load moves data from global memory to vector memory.
Again, the number ``n`` is how many such accesses can still be outstanding before execution continues.
Generally recommended guidelines are as follows.
* Vectorize memory access as much as possible.
* Ensure synchronization is done efficiently.
* Overlap instructions to hide latency; this requires thoughtful
analysis of the algorithm.
* If you find inefficiencies, trace them back to the LLVM IR, TTGIR,
and even TTIR to see where the problem comes from. If the problem is introduced
during compiler optimization, activate the MLIR dump and check which
optimization pass caused it.
.. _fine-tuning-llms-triton-kernel-occupancy:
Kernel occupancy
================
1. Get the VGPR count: search for ``.vgpr_count`` in the ISA (call this value ``N``).
2. Get the allocated LDS for the kernel by following these steps (call the result ``L``).
a. ``export MLIR_ENABLE_DUMP=1``
b. ``rm -rf ~/.triton/cache``
c. ``python kernel.py | grep "triton_gpu.shared = " | tail -n 1``
d. You should see something like ``triton_gpu.shared = 65536``, indicating 65536 bytes of LDS are allocated for the
kernel.
3. Get the number of waves per workgroup by following these steps (call the result ``nW``).
a. ``export MLIR_ENABLE_DUMP=1``
b. ``rm -rf ~/.triton/cache``
c. ``python kernel.py | grep "triton_gpu.num-warps " | tail -n 1``
d. You should see something like ``"triton_gpu.num-warps" = 8``, indicating 8 waves per workgroup.
4. Compute the occupancy limited by VGPR usage based on ``N`` according to the following table. Call the resulting waves
per EU ``occ_vgpr``.
.. _fine-tuning-llms-occupancy-vgpr-table:
.. figure:: ../../data/how-to/fine-tuning-llms/occupancy-vgpr.png
:alt: Occupancy related to VGPR usage in an Instinct MI300X accelerator.
:align: center
5. Compute the occupancy limited by LDS based on ``L``: ``occ_lds = floor(65536 / L)``.
6. Then the occupancy is ``occ = min(floor(occ_vgpr * 4 / nW), occ_lds) * nW / 4``.
a. ``occ_vgpr * 4`` gives the total number of waves on all 4 execution units (SIMDs)
per CU.
b. ``floor(occ_vgpr * 4 / nW)`` gives the occupancy in workgroups per CU
regarding VGPR usage.
c. The true ``occ`` is the minimum of the two.
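The following Python sketch applies these steps numerically. It assumes MI300X-like limits taken from this section (512 VGPRs per EU, VGPRs allocated in units of 16, and 65536 bytes of LDS), and the helper name ``estimate_occupancy`` is hypothetical; treat it as an illustration of the formula above, not a replacement for the VGPR table.

.. code-block:: python

   import math

   def estimate_occupancy(n_vgpr, lds_bytes, n_waves_per_wg):
       """Estimate waves per EU from VGPR count, LDS usage, and waves per workgroup."""
       # Assumption: VGPRs are allocated in units of 16, with 512 VGPRs available per EU (SIMD).
       vgpr_alloc = math.ceil(n_vgpr / 16) * 16
       occ_vgpr = 512 // vgpr_alloc          # waves per EU limited by VGPR usage
       occ_lds = 65536 // lds_bytes          # workgroups per CU limited by LDS usage
       # 4 EUs (SIMDs) per CU; combine both limits at workgroup granularity.
       occ_wg = min((occ_vgpr * 4) // n_waves_per_wg, occ_lds)
       return occ_wg * n_waves_per_wg / 4    # waves per EU

   # Example: 170 VGPRs round up to 176, so with 64 KiB of LDS and 8 waves per
   # workgroup the estimate is 2 waves per EU.
   print(estimate_occupancy(n_vgpr=170, lds_bytes=65536, n_waves_per_wg=8))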
.. _fine-tuning-llms-triton-kernel-configs-env-vars:
Auto-tunable kernel configurations and environment variables
============================================================
This section relates to the amount of :ref:`memory access <fine-tuning-llms-triton-memory-access-efficiency>` and
computation assigned to each CU. It is related to the usage of LDS, registers and the scheduling of different tasks on
a CU.
The following is a list of kernel arguments used for tuning.
``num_stages=n``
Adjusts the number of pipeline stages for different types of kernels. On AMD accelerators, set ``num_stages``
according to the following rules:
* For kernels with a single GEMM, set to ``0``.
* For kernels with two GEMMs fused (Flash Attention, or any other kernel
that fuses 2 GEMMs), set to ``1``.
* For kernels that fuse a single GEMM with another non-GEMM operator
(for example ReLU activation), set to ``0``.
* For kernels that have no GEMMs, set to ``1``.
``waves_per_eu=n``
Helps to manage Vector General Purpose Registers (VGPR) usage to achieve desired occupancy levels. This argument
hints to the compiler to reduce VGPR to achieve ``n`` occupancy. See
:ref:`Kernel occupancy <fine-tuning-llms-triton-kernel-occupancy>` for more information about how to compute
occupancy.
This argument is useful if:
* The occupancy of the kernel is limited by VGPR usage.
* The current VGPR usage is only a few above a boundary in
:ref:`Occupancy related to VGPR usage in an Instinct MI300X accelerator <fine-tuning-llms-occupancy-vgpr-table>`.
For example, according to the table, 512 VGPRs are available per Execution Unit (EU), and VGPRs are allocated in
units of 16. If the current VGPR usage is 170, the actual allocation is 176 VGPRs, so the
occupancy is only 2 waves per EU since :math:`176 \times 3 > 512`. So, if you set
``waves_per_eu`` to 3, the LLVM backend tries to bring VGPR usage down so
that 3 waves might fit per EU.
``BLOCK_M``, ``BLOCK_N``, ``BLOCK_K``
Tile sizes to be tuned to balance the memory-to-computation ratio. You want tile sizes large enough to
maximize the efficiency of memory-to-computation ratio, but small enough to parallelize the greatest number of
workgroups at the grid level.
``matrix_instr_nonkdim``
Experimental feature for Flash Attention-like kernels that determines the size of the Matrix Fused Multiply-Add
(MFMA) instruction used.
- ``Matrix_instr_nonkdim = 16``: ``mfma_16x16`` is used.
- ``Matrix_instr_nonkdim = 32``: ``mfma_32x32`` is used.
For GEMM kernels on an AMD MI300X accelerator, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
tile/GEMM sizes.
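To show how these arguments typically fit together, here is a minimal, hypothetical ``@triton.autotune`` configuration for a single-GEMM kernel. It assumes a ROCm-enabled Triton build that accepts ``waves_per_eu`` and ``matrix_instr_nonkdim`` as configuration keyword arguments, and the block sizes are placeholder values to tune rather than recommendations.

.. code-block:: python

   import triton
   import triton.language as tl

   # Hypothetical autotune space for a single-GEMM kernel (placeholder values).
   # num_stages=0 follows the single-GEMM rule above; waves_per_eu and
   # matrix_instr_nonkdim are assumed to be consumed by the AMD backend.
   @triton.autotune(
       configs=[
           triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64,
                          "waves_per_eu": 2, "matrix_instr_nonkdim": 16},
                         num_warps=8, num_stages=0),
           triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64,
                          "waves_per_eu": 3, "matrix_instr_nonkdim": 16},
                         num_warps=4, num_stages=0),
       ],
       key=["M", "N", "K"],  # re-tune when the problem shape changes
   )
   @triton.jit
   def gemm_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                   BLOCK_K: tl.constexpr):
       pass  # kernel body omitted for brevity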
The following is an environment variable used for tuning.
``OPTIMIZE_EPILOGUE``
Setting this variable to ``1`` can improve performance by removing the ``convert_layout`` operation in the epilogue.
It should be turned on (set to ``1``) in most cases. Setting ``OPTIMIZE_EPILOGUE=1`` stores the MFMA instruction
results in the MFMA layout directly; this comes at the cost of reduced global store efficiency, but the impact on
kernel execution time is usually minimal.
By default (``0``), the results of MFMA instruction are converted to blocked layout, which leads to ``global_store``
with maximum vector length, that is ``global_store_dwordx4``.
This is done implicitly with LDS as the intermediate buffer to achieve
data exchange between threads. Padding is used in LDS to avoid bank
conflicts. This usually leads to extra LDS usage, which might reduce
occupancy.
.. note::
This variable is not turned on by default because it only
works with ``tt.store`` but not ``tt.atomic_add``, which is used in split-k and
stream-k GEMM kernels. In the future, it might be enabled with
``tt.atomic_add`` and turned on by default.
See :ref:`IR analysis <fine-tuning-llms-triton-ir-analysis>`.
TorchInductor with Triton tuning knobs
===========================================
The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (``conv``) operations in PyTorch
using ``inductor``, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better
performance.
Learn more about TorchInductor environment variables and usage in
`PyTorch documentation <https://pytorch.org/docs/2.3/torch.compiler_inductor_profiling.html>`_.
To enable ``gemm``/``conv`` lowering to Triton, you must use ``inductor``'s ``max_autotune`` mode. This benchmarks a
static list of Triton configurations (``conv`` configurations for max auto-tune plus ``matmul`` configurations for max
auto-tune) and uses the fastest for each shape. Note that Triton is not used if regular :doc:`MIOpen <miopen:index>`
or :doc:`rocBLAS <rocblas:index>` is faster for a specific operation.
* Set ``torch._inductor.config.max_autotune = True`` or ``TORCHINDUCTOR_MAX_AUTOTUNE=1``.
* Or, for more fine-grained control:
``torch._inductor.config.max_autotune.pointwise = True``
To enable tuning for ``pointwise``/``reduction`` ops.
``torch._inductor.config.max_autotune_gemm = True``
To enable tuning or lowering of ``mm``/``conv``\s.
``torch._inductor.max_autotune_gemm_backends/TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS``
To select the candidate backends for ``mm`` auto-tuning. Defaults to
``TRITON,ATEN,NV``. This also includes the ``CUTLASS`` tuning option. Limiting this to
``TRITON`` might improve performance by enabling more fused ``mm`` kernels
instead of going to rocBLAS.
* For ``mm`` tuning, tuning ``coordinate_descent`` might improve performance.
``torch._inductor.config.coordinate_descent_tuning = True`` or ``TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1``
* Inference can see large improvements on AMD GPUs by utilizing
``torch._inductor.config.freezing=True`` or the ``TORCHINDUCTOR_FREEZING=1`` variable, which
inlines weights as constants and enables constant-folding optimizations.
* Enabling ``inductor``'s ``cpp_wrapper`` might reduce overhead. This generates
C++ code that launches Triton binaries directly with
``hipModuleLaunchKernel`` and relies on hipification.
* For NHWC convolution workloads,
``torch._inductor.config.layout_optimization=True`` or ``TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1``
can help by enforcing the ``channels_last`` format throughout the graph, avoiding
any additional transposes added by ``inductor``. Note that
``PYTORCH_MIOPEN_SUGGEST_NHWC=1`` is recommended if using this.
* To extract the generated Triton kernels, set ``TORCH_COMPILE_DEBUG=1``. This creates a
``torch_compile_debug/`` directory at the current path, and the ``output_code.py`` file inside
contains the code strings for the Triton kernels that were generated. Manual work is
then required to strip out a kernel and to compile and launch it via Triton directly.
* For advanced ``matmul`` or ``conv`` configuration tuning, the ``inductor-gemm-tuner`` can
help. It implements the Triton ``conv``/``mm`` implementations used upstream
and allows you to specify the inputs and the configuration tuning search space; new
tunings that are found can be added to the auto-tune list. A short sketch of these ``inductor`` settings follows.
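The following minimal sketch shows how a few of these knobs might be set in a script before compiling a model. The model, shapes, and chosen flags are placeholders; verify the exact ``torch._inductor.config`` attribute names against your PyTorch version.

.. code-block:: python

   import torch
   import torch._inductor.config as inductor_config

   # Enable Triton GEMM/conv lowering and related tuning knobs
   # (equivalent environment variables are listed above).
   inductor_config.max_autotune = True                # TORCHINDUCTOR_MAX_AUTOTUNE=1
   inductor_config.coordinate_descent_tuning = True   # TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
   inductor_config.freezing = True                    # inference only; inlines weights as constants

   model = torch.nn.Sequential(
       torch.nn.Linear(4096, 4096),
       torch.nn.ReLU(),
       torch.nn.Linear(4096, 4096),
   ).half().cuda().eval()

   compiled = torch.compile(model, mode="max-autotune")

   x = torch.randn(64, 4096, dtype=torch.half, device="cuda")
   with torch.no_grad():
       y = compiled(x)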
Other guidelines
================
* HIP provides a performance-critical environment variable, ``export HIP_FORCE_DEV_KERNARG=1``,
that places HIP kernel arguments directly in
device memory to reduce the latency of accessing kernel arguments. It
can save 2 to 3 μs for some kernels. Setting this variable for the Flash Attention
decode path, which contains ``splitK`` and reduce kernels, lowers the total time
by around 6 μs in the benchmark test.
* Set the clock to deterministic mode. Use the command ``rocm-smi --setperfdeterminism 1900`` to set the maximum clock speed to
1900 MHz instead of the default 2100 MHz. Setting a lower cap reduces the chance of the clock speed dropping due to high
chip temperature. You can restore this setting to its default value with ``rocm-smi -r``.
* Set Non-Uniform Memory Access (NUMA) auto-balancing. Run the command ``cat /proc/sys/kernel/numa_balancing`` to check the
current setting. An output of ``0`` indicates the desired setting is already in place. If the output is ``1``, run the command
``sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'`` to set it.
For these settings, the ``env_check.sh`` script automates setting, resetting, and checking such
environment configurations. Find the script at `<https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh>`__.
.. _fine-tuning-llms-triton-tunableop:
TunableOp
---------
`TunableOp <https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md>`_
is a feature used to define and optimize kernels that can have tunable parameters. This is useful in
optimizing the performance of custom kernels by exploring different parameter configurations to find the most efficient
setup. See more about PyTorch TunableOp :ref:`Model acceleration libraries <fine-tuning-llms-pytorch-tunableop>`.
You can easily manipulate the behavior of TunableOp through environment variables, though you could use the C++ interface
``at::cuda::tunable::getTuningContext()``. A Python interface to the ``TuningContext`` does not yet exist.
The default value is ``0``, which means only one iteration is attempted. Remember: there's an overhead to tuning. To try
and minimize the overhead, only a limited number of iterations of a given operation are attempted. If you set this to
``10``, each solution for a given operation can run as many iterations as possible within 10 ms. There is a hard-coded
upper limit of 100 iterations attempted per solution. This is a tuning parameter; if you want the tunings to be chosen
based on an average over multiple iterations, increase the allowed tuning duration.
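As an illustration, the following sketch enables TunableOp through environment variables before importing ``torch`` and runs a GEMM so that it gets tuned and recorded. The variable names follow the TunableOp README linked above, but verify them, and the result file name, against your PyTorch build.

.. code-block:: python

   import os

   # Configure TunableOp before importing torch (assumed variable names; see the
   # TunableOp README for the authoritative list).
   os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"                       # turn the feature on
   os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"                        # search for new tunings
   os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "tunableop_results.csv"  # where results are stored

   import torch

   a = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
   b = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")

   # The first call triggers tuning for this GEMM shape; subsequent calls reuse
   # the recorded best solution from the results file.
   c = a @ b
   torch.cuda.synchronize()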

View File

@@ -6,7 +6,7 @@
# Optimizing with Composable Kernel
The AMD ROCm Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
The AMD ROCm&trade; Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
This article gives a high-level overview of CK General Matrix Multiplication (GEMM) kernel based on the design example of `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X accelerators using CK.
@@ -32,7 +32,7 @@ The template parameters of the instance are grouped into four parameter types:
================
### Figure 2
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-template_parameters.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-template_parameters.jpg
The template parameters of the selected GEMM kernel are classified into four groups. These template parameter groups should be defined properly before running the instance.
```
@@ -126,7 +126,7 @@ The row and column, and stride information of input matrices are also passed to
================
### Figure 3
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-kernel_launch.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-kernel_launch.jpg
Templated kernel launching consists of kernel instantiation, making arguments by passing in actual application parameters, creating an invoker, and running the instance through the invoker.
```
@@ -155,7 +155,7 @@ The first operation in the process is to perform the multiplication of input mat
================
### Figure 4
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-operation_flow.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-operation_flow.jpg
Operation flow.
```
@@ -171,7 +171,7 @@ Here, we use [DeviceBatchedGemmMultiD_Xdl](https://github.com/ROCm/composable_ke
================
### Figure 5
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-root_instance.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-root_instance.jpg
Use the DeviceBatchedGemmMultiD_Xdl instance as a root.
```
@@ -421,7 +421,7 @@ Run `python setup.py install` to build and install the extension. It should look
================
### Figure 6
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-compilation.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-compilation.jpg
Compilation and installation of the INT8 kernels.
```
@@ -433,7 +433,7 @@ The implementation architecture of running SmoothQuant models on MI300X GPUs is
================
### Figure 7
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-inference_flow.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-inference_flow.jpg
The implementation architecture of running SmoothQuant models on AMD MI300X accelerators.
```
@@ -459,7 +459,7 @@ Figure 8 shows the performance comparisons between the original FP16 and the Smo
================
### Figure 8
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-comparisons.jpg
```{figure} ../../data/how-to/fine-tuning-llms/ck-comparisons.jpg
Performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator.
```

View File

@@ -41,7 +41,7 @@ The weight update is as follows: :math:`W_{updated} = W + ΔW`.
If the weight matrix :math:`W` contains 7B parameters, then the weight update matrix :math:`ΔW` should also
contain 7B parameters. Therefore, the :math:`ΔW` calculation is computationally and memory intensive.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/weight-update.png
.. figure:: ../../data/how-to/fine-tuning-llms/weight-update.png
:alt: Weight update diagram
(a) Weight update in regular fine-tuning. (b) Weight update in LoRA where the product of matrix A (:math:`M\times K`)

View File

@@ -0,0 +1,217 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, profiling, debugging, performance, Triton
***********************
Profiling and debugging
***********************
This section discusses profiling and debugging tools and some of their common usage patterns with ROCm applications.
PyTorch Profiler
================
`PyTorch Profiler <https://pytorch.org/docs/stable/profiler.html>`_ can be invoked inside Python scripts, letting you
collect CPU and GPU performance metrics while the script is running. See the `PyTorch Profiler tutorial
<https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html>`_ for more information.
You can then visualize and view these metrics using an open-source profile visualization tool like
`Perfetto UI <https://ui.perfetto.dev>`_.
#. Use the following snippet to invoke PyTorch Profiler in your code.
.. code-block:: python
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
model = models.resnet18().cuda()
inputs = torch.randn(2000, 3, 224, 224).cuda()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
with record_function("model_inference"):
model(inputs)
prof.export_chrome_trace("resnet18_profile.json")
#. Profile results in ``resnet18_profile.json`` can be viewed by the Perfetto visualization tool. Go to
`<https://ui.perfetto.dev>`__ and import the file. In your Perfetto visualization, you'll see that the upper section
shows transactions denoting the CPU activities that launch GPU kernels while the lower section shows the actual GPU
activities where it processes the ``resnet18`` inferences layer by layer.
.. figure:: ../../data/how-to/fine-tuning-llms/perfetto-trace.svg
Perfetto trace visualization example.
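In addition to exporting a Chrome trace, you can print a summary table directly from the ``prof`` object created in the snippet above; this is standard PyTorch Profiler usage and a quick way to spot the most expensive kernels before opening a full trace.

.. code-block:: python

   # Print the top operators by GPU time from the profiler run above.
   print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))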
ROCm profiling tools
====================
Heterogeneous systems, where programs run on both CPUs and GPUs, introduce additional complexities. Understanding the
critical path and kernel execution is all the more important, so performance tuning is a necessary component in the
benchmarking process.
With AMD's profiling tools, developers are able to gain important insight into how efficiently their application is
using hardware resources and effectively diagnose potential bottlenecks contributing to poor performance. Developers
working with AMD Instinct accelerators have multiple tools depending on their specific profiling needs; these are:
* :ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>`
* :ref:`Omniperf <fine-tuning-llms-profiling-omniperf>`
* :ref:`Omnitrace <fine-tuning-llms-profiling-omnitrace>`
.. _fine-tuning-llms-profiling-rocprof:
ROCProfiler
-----------
:doc:`ROCProfiler <rocprofiler:index>` is primarily a low-level API for accessing and extracting GPU hardware performance
metrics, commonly called *performance counters*. These counters quantify the performance of the underlying architecture
showcasing which pieces of the computational pipeline and memory hierarchy are being utilized.
Your ROCm installation contains a script or executable command called ``rocprof`` which provides the ability to list all
available hardware counters for your specific accelerator or GPU, and run applications while collecting counters during
their execution.
This ``rocprof`` utility also depends on the :doc:`ROCTracer and ROC-TX libraries <roctracer:index>`, giving it the
ability to collect timeline traces of the accelerator software stack as well as user-annotated code regions.
.. note::
``rocprof`` is a CLI-only utility, so its input and output take the form of ``.txt`` and CSV files. These
formats provide a raw view of the data and put the onus on the user to parse and analyze it. Therefore, ``rocprof``
gives the user full access to and control of raw performance profiling data, but requires extra effort to analyze the
collected data.
.. _fine-tuning-llms-profiling-omniperf:
Omniperf
--------
`Omniperf <https://rocm.github.io/omniperf>`_ is a system performance profiler for high-performance computing (HPC) and
machine learning (ML) workloads using Instinct accelerators. Under the hood, Omniperf uses
:ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>` to collect hardware performance counters. The Omniperf tool performs
system profiling based on all approved hardware counters for Instinct
accelerator architectures. It provides high level performance analysis features including System Speed-of-Light, IP
block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more.
Omniperf takes the guesswork out of profiling by removing the need to provide text input files with lists of counters
to collect, or to analyze raw CSV output files, as is the case with ROCProfiler. Instead, Omniperf automates the collection
of all available hardware counters in one command and provides a graphical interface to help users understand and
analyze bottlenecks and stressors for their computational workloads on AMD Instinct accelerators.
.. note::
Omniperf collects hardware counters in multiple passes, and will therefore re-run the application during each pass
to collect different sets of metrics.
.. figure:: ../../data/how-to/fine-tuning-llms/omniperf-analysis.png
Omniperf memory chart analysis panel.
In brief, Omniperf provides details about hardware activity for a particular GPU kernel. It also supports both
a web-based GUI or command-line analyzer, depending on your preference.
.. _fine-tuning-llms-profiling-omnitrace:
Omnitrace
---------
`Omnitrace <https://rocm.github.io/omnitrace>`_ is a comprehensive profiling and tracing tool for parallel applications,
including HPC and ML packages, written in C, C++, Fortran, HIP, OpenCL, and Python which execute on the CPU or CPU and
GPU. It is capable of gathering the performance information of functions through any combination of binary
instrumentation, call-stack sampling, user-defined regions, and Python interpreter hooks.
Omnitrace supports interactive visualization of comprehensive traces in the web browser in addition to high-level
summary profiles with ``mean/min/max/stddev`` statistics. Beyond runtime
information, Omnitrace supports the collection of system-level metrics such as CPU frequency, GPU temperature, and GPU
utilization. Process and thread level metrics such as memory usage, page faults, context switches, and numerous other
hardware counters are also included.
.. tip::
When analyzing the performance of an application, it is best not to assume you know where the performance
bottlenecks are and why they are happening. Omnitrace is the ideal tool for characterizing where optimization would
have the greatest impact on the end-to-end execution of the application and to discover what else is happening on the
system during a performance bottleneck.
.. figure:: ../../data/how-to/fine-tuning-llms/omnitrace-timeline.png
Omnitrace timeline trace example.
For detailed usage and examples of these tools, refer to the
`Introduction to profiling tools for AMD hardware <https://rocm.blogs.amd.com/software-tools-optimization/profilers/README.html>`_
developer blog.
Debugging with ROCm Debug Agent
===============================
ROCm Debug Agent (:doc:`ROCdebug-agent <rocr_debug_agent:index>`) is a library that can be loaded by the ROCm platform
runtime (:doc:`ROCr <rocr-runtime:index>`) to provide the following functionalities for all AMD accelerators and GPUs
supported by the ROCm Debugger API (:doc:`ROCdbgapi <rocdbgapi:index>`).
* Print the state of all AMD accelerator or GPU wavefronts that caused a queue error; for example, causing a memory
violation, executing an ``s_trap2``, or executing an illegal instruction.
* Print the state of all AMD accelerator or GPU wavefronts by sending a ``SIGQUIT`` signal to the process in question;
for example, by pressing ``Ctrl + \`` while the process is executing.
Debugging memory access faults
------------------------------
Identifying a faulting kernel is often enough to triage a memory access fault. To that end, the
`ROCm Debug Agent <https://github.com/ROCm/rocr_debug_agent/>`_ can trap a memory access fault and provide a dump of all
active wavefronts that caused the error as well as the name of the kernel. The
`AMD ROCm Debug Agent Library README <https://github.com/ROCm/rocr_debug_agent/blob/master/README.md>`_ provides full
instructions, but in brief:
* Compiling with ``-ggdb -O0`` is recommended but not required.
* ``HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2 HSA_ENABLE_DEBUG=1 ./my_program``
When the debug agent traps the fault, it will produce an extremely
verbose output of all wavefront registers and memory content.
Importantly, it also prints something like:
.. code-block:: shell
Disassembly for function vector_add_assert_trap(int*, int*, int*):
code object:
file:////rocm-debug-agent/build/test/rocm-debug-agent-test#offset=14309&size=31336
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
The kernel name and the code object file should be listed. In the
example above, the kernel name is ``vector_add_assert_trap``, but this might
also look like:
.. code-block:: shell
Disassembly for function memory:///path/to/codeobject#offset=1234&size=567:
In this case, it is an in-memory kernel that was generated at runtime.
Using the following environment variable, the debug agent will save all code objects to the current directory (use
``--save-code-objects=[DIR]`` to place them in another location). The code objects will be renamed from the URI format
with special characters replaced by ``_``.
.. code-block:: shell
ROCM_DEBUG_AGENT_OPTIONS="--all --save-code-objects"
Use the ``llvm-objdump`` command to disassemble the indicated in-memory
code object that has now been saved to disk. The name of the kernel is
often found inside the disassembled code object.
.. code-block:: shell
llvm-objdump --disassemble-all path/to/code-object.co
Consider turning off memory caching strategies both within the ROCm
stack and PyTorch where possible. This will give the debug agent the
best chance at finding the memory fault where it originates. Otherwise,
it could be masked by writing past the end of a cached block within a
larger allocation.
.. code-block:: shell
PYTORCH_NO_HIP_MEMORY_CACHING=1
HSA_DISABLE_FRAGMENT_ALLOCATOR=1

Some files were not shown because too many files have changed in this diff.