Compare commits


3 Commits

Author SHA1 Message Date
Wang, Yanyao
6583770582 update commits 2025-03-12 13:13:43 -07:00
Wang, Yanyao
7491347f46 Update the manifest file for ROCm6.1.5 2025-03-12 10:56:41 -07:00
Sam Wu
a1518ffa94 Merge develop into roc-6.1.x (#3440)
* Bump rocm-docs-core from 1.4.1 to 1.5.0 in /docs/sphinx (#3396)

Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.4.1 to 1.5.0.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.4.1...v1.5.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* Bump certifi from 2024.2.2 to 2024.7.4 in /docs/sphinx (#3399)

Bumps [certifi](https://github.com/certifi/python-certifi) from 2024.2.2 to 2024.7.4.
- [Commits](https://github.com/certifi/python-certifi/compare/2024.02.02...2024.07.04)

---
updated-dependencies:
- dependency-name: certifi
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* External CI: build hipBLASLt external dependencies (#3405)

* External CI: Increase composable_kernel pipeline time limit (#3407)

* [Changelog/release notes] Fix and add custom templates for autotag script (#3408)

* Update custom templates

* Add custom templates

* Fix custom template for hipfort

* Fix custom template for hipify

* Fix custom template for rvs

* External CI: Change composable_kernel pipeline to build for specific GPUs with tests and examples (#3412)

* increase task time limit

* test building CK for multiple architectures

* Update composable_kernel.yml

* Update composable_kernel.yml

* gfx90a build

* gfx941;gfx1100;gfx1030 build

* hipTensor gfx941 build

* hipTensor gfx941 build

* reduce CK timeout to 100 minutes

* change all gfx90a targets to gfx942

* Bump sphinx-reredirects from 0.1.4 to 0.1.5 in /docs/sphinx (#3419)

Bumps [sphinx-reredirects](https://github.com/documatt/sphinx-reredirects) from 0.1.4 to 0.1.5.
- [Commits](https://github.com/documatt/sphinx-reredirects/compare/v0.1.4...v0.1.5)

---
updated-dependencies:
- dependency-name: sphinx-reredirects
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* Removed TransferBench from the tools list (#3421)

* update AI framework image (#3406)

* update AI framework image

* remove old image

* Update system optimization guides headings (#3422)

* update headings to system optimization

* update index

* conv tuning-guides.md to rst

* shorten system optimization landing page

* update conf.py

update toc order

add space

* Update docs/how-to/tuning-guides.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* update keywords

* update intro

---------

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* External CI: move hipBLASLt build directory to ephemeral storage (#3433)

* build hipblaslt in /mnt instead

* rm checkoutref

* remove debug step

* Update using-gpu-sanitizer.md with new known issues (#3423)

* External CI: move hipBLASLt to new large disk pool

* Remove unused custom template for ck (#3438)

* External CI: ROCm nightly builds (#3435)

* ROCm nightly builds

* remove branch trigger, enable develop

* Remove unused configurations in conf.py (#3444)

* External CI: Switch all pipeline GPU_TARGETS to gfx942 (#3443)

* Switch all pipeline gpu targets to gfx942

* Change more pipelines target to gfx942

* set variables for manual testing

* Switch all pipeline gpu targets to gfx942

* Change more pipelines target to gfx942

* set variables for manual testing

* add test pipeline id

* revert test changes

* correct gpu target name

* remove unused flags; change hipSPARSELt target to be gfx942

* Add MI300X tuning guides (#3448)

* Add MI300X tuning guides

Add mi300x doc (pandoc conversion)

fix headings

add metadata

move images to shared/

move images to shared/

convert tuning-guides.md to rst using pandoc

add mi300x to tuning-guides.rst landing page

update h1s, toc, and landing page

fix spelling

fix fmt

format code blocks

add tensilelite imgs

fix formatting

fix formatting some more

fix formatting

more formatting

spelling

remove --enforce-eager note

satisfy spellcheck linter

more spelling

add fixes from hongxia

fix env var in D5

add fixes to PyTorch inductor section

fix

fix

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update docs/how-to/tuning-guides/mi300x.rst

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update 'torch_compile_debug' suggestion based on Hongxia's feedback

fix PyTorch inductor env vars

minor formatting fixes

Apply suggestions from code review

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

Update vllm path

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

disable numfig in Sphinx configuration

fix formatting and capitalization

add words to wordlist

update index

update wordlist

update optimizing-triton-kernel

convert cards to table

fix link in index.md

add @lpaoletti's feedback

Add system tuning guide

add images

add system section

add os settings and sys management

remove pcie=noats recommendation

reorg

add blurb to developer section

impr formatting

remove windows os from tuning guides pages in conf.py

add suggestions from review

fix typo and link

remove os windows from relevant pages in conf

mi300x

add suggestions from review

fix toc

fix index links

reorg

update vLLM vars

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

update vLLM vars

Co-authored-by: Hongxia Yang
<62075498+hongxiayang@users.noreply.github.com>

reorganize

add warnings

add text to system tuning

add filler text on index pages

reorg tuning pages

fix links

fix vars

* rm old pages

fix toc

* add suggestions from review

small change

add more suggestions

rewrite intro

* add 'workload tuning philosophy'

* refactor

* fix broken links

* black format conf.py

* simplify cmd and update doc structure

* add higher-level heading for consistency (mi300x.rst)

* add fixes from review

fix url

add fixes

fix formatting

fix fmt

fix hipBLASLt section

change words

fix tensilelite section

fix

fix

fix fmt

* style guide

* fix some formatting

* satisfy spellcheck linter

* update wordlist

* fix bad conflict resolution

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: danielsu-amd <danielsu@amd.com>
Co-authored-by: alexxu-amd <159800977+alexxu-amd@users.noreply.github.com>
Co-authored-by: spolifroni-amd <Sandra.Polifroni@amd.com>
Co-authored-by: randyh62 <42045079+randyh62@users.noreply.github.com>
Co-authored-by: Peter Park <peter.park@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
Co-authored-by: b-sumner <brian.sumner@amd.com>
2024-07-22 15:39:48 -06:00
82 changed files with 3145 additions and 810 deletions

View File

@@ -87,7 +87,7 @@ jobs:
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DGPU_TARGETS=gfx942
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
-DMIGRAPHX_USE_COMPOSABLEKERNEL=OFF

View File

@@ -40,7 +40,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
pool: ${{ variables.LARGE_DISK_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -25,7 +25,7 @@ parameters:
jobs:
- job: composable_kernel
timeoutInMinutes: 210
timeoutInMinutes: 100
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -59,6 +59,6 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DINSTANCES_ONLY=ON
-DGPU_TARGETS=gfx942
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -77,7 +77,6 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DHIP_PLATFORM=amd
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF

View File

@@ -8,12 +8,13 @@ parameters:
- name: aptPackages
type: object
default:
- ninja-build
- python3-venv
- libmsgpack-dev
- gfortran
- git
- python3-pip
- libdrm-dev
- libmsgpack-dev
- ninja-build
- python3-pip
- python3-venv
- name: pipModules
type: object
default:
@@ -21,15 +22,16 @@ parameters:
- name: rocmDependencies
type: object
default:
- llvm-project
- ROCR-Runtime
- clr
- hipBLAS
- llvm-project
- rocminfo
- rocprofiler-register
- hipBLAS
- ROCR-Runtime
jobs:
- job: hipBLASLt
timeoutInMinutes: 100
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -58,7 +60,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# CI case: download latest default branch build
# CI case: download latest default branch build
- ${{ if eq(parameters.checkoutRef, '') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
@@ -72,17 +74,42 @@ jobs:
dependencySource: tag-builds
- script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
displayName: ROCm symbolic link
# Build and install gtest, lapack, hipBLAS-common
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
- script: mkdir $(Pipeline.Workspace)/deps
# hipBLASLt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
- script: cmake $(Pipeline.Workspace)/s/deps
displayName: Configure hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: make
displayName: Build hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: sudo make install
displayName: Install hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
# Set link to redirect llvm folder
- task: Bash@3
displayName: Symlink to rocm/lib/llvm
inputs:
targetType: inline
script: ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- script: sudo chmod 777 /mnt
displayName: 'Set permissions for /mnt'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeBuildDir: /mnt/build
cmakeSourceDir: $(Pipeline.Workspace)/s
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx90a
-DAMDGPU_TARGETS=gfx942
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LIBRARY_FORMAT=msgpack
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -57,6 +57,6 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TEST=ON
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -66,7 +66,7 @@ jobs:
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-DUSE_HIP_CLANG=ON
-DHIP_COMPILER=clang
-DBUILD_CLIENTS_TESTS=ON

View File

@@ -61,6 +61,6 @@ jobs:
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -74,7 +74,6 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DBUILD_CLIENTS_TESTS=ON
-DUSE_CUDA=OFF
-GNinja

View File

@@ -75,7 +75,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=all
-DAMDGPU_TARGETS=gfx942
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_CODE_OBJECT_VERSION=default

View File

@@ -58,6 +58,6 @@ jobs:
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DHIPTENSOR_BUILD_TESTS=ON
-DAMDGPU_TARGETS=gfx90a
-DAMDGPU_TARGETS=gfx942
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -72,6 +72,6 @@ jobs:
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TESTS=ON
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -67,7 +67,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF

View File

@@ -108,7 +108,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LOGIC=asm_full
-DTensile_SEPARATE_ARCHITECTURES=ON

View File

@@ -64,7 +64,7 @@ jobs:
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-DUSE_HIP_CLANG=ON
-DHIP_COMPILER=clang
-DBUILD_CLIENTS_TESTS=ON

View File

@@ -59,7 +59,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_BENCHMARK=ON
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-DBUILD_TEST=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -59,6 +59,6 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TEST=ON
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -82,7 +82,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF

View File

@@ -68,7 +68,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-DBUILD_CLIENTS_SAMPLES=OFF
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF

View File

@@ -60,7 +60,7 @@ jobs:
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-DBUILD_TEST=ON
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -65,7 +65,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
-DROCWMMA_BUILD_TESTS=ON
-DROCWMMA_BUILD_SAMPLES=OFF
-DGPU_TARGETS=gfx1100
-DAMDGPU_TARGETS=gfx942
-GNinja
# gfx1030 not supported in documentation
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -66,6 +66,6 @@ jobs:
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_ROOT=$(Agent.BuildDirectory)/rocm
-DCMAKE_HIP_ARCHITECTURES=gfx1030;gfx1100
-DCMAKE_HIP_ARCHITECTURES=gfx942
-DCMAKE_EXE_LINKER_FLAGS=-fgpu-rdc
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -105,5 +105,5 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DENABLE_LDCONFIG=OFF
-DUSE_PROF_API=1
-DGPU_TARGETS=gfx1030;gfx1100
-DGPU_TARGETS=gfx942
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -65,6 +65,6 @@ jobs:
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DGPU_TARGETS=gfx1030;gfx1100
-DGPU_TARGETS=gfx942
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -60,6 +60,6 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DHALF_INCLUDE_DIRS=$(Agent.BuildDirectory)/rocm/include
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=gfx1030;gfx1100
-DAMDGPU_TARGETS=gfx942
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

View File

@@ -0,0 +1,115 @@
parameters:
# currently excludes clr and rocm-examples
- name: rocmDependencies
type: object
default:
- AMDMIGraphX
- amdsmi
- aomp-extras
- aomp
- composable_kernel
- half
- HIP
- hipBLAS
- hipBLASLt
- hipCUB
- hipFFT
- hipfort
- HIPIFY
- hipRAND
- hipSOLVER
- hipSPARSE
- hipSPARSELt
- hipTensor
- llvm-project
- MIOpen
- MIVisionX
- rccl
- rdc
- rocAL
- rocALUTION
- rocBLAS
- ROCdbgapi
- rocDecode
- rocFFT
- ROCgdb
- rocm-cmake
- rocm-core
- rocminfo
- rocMLIR
- ROCmValidationSuite
- rocm_bandwidth_test
- rocm_smi_lib
- rocPRIM
- rocprofiler-register
- rocprofiler
- ROCR-Runtime
- rocRAND
- rocr_debug_agent
- rocSOLVER
- rocSPARSE
- ROCT-Thunk-Interface
- rocThrust
- roctracer
- rocWMMA
- rpp
trigger: none
pr: none
schedules:
- cron: '30 7 * * *'
displayName: Nightly build
branches:
include:
- develop
always: true
jobs:
- job: rocm_nightly
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
- task: DeleteFiles@1
displayName: 'Cleanup checkout space'
inputs:
SourceFolder: '$(Agent.BuildDirectory)/s'
Contents: '**/*'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
SourceFolder: '$(Build.ArtifactStagingDirectory)'
Contents: '/**/*'
RemoveDotFiles: true
- script: sudo chmod 777 /mnt
displayName: 'Set permissions for /mnt'
- script: df -h
displayName: System disk space before ROCm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: staging
extractToMnt: true
skipLibraryLinking: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh /mnt/rocm
displayName: Uncompressed ROCm size
- task: ArchiveFiles@2
displayName: Compress rocm-nightly
inputs:
rootFolderOrFile: /mnt/rocm
includeRootFolder: false
archiveType: tar
tarCompression: gz
archiveFile: $(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildNumber)_ubuntu2204.tar.gz
- script: du -sh $(Build.ArtifactStagingDirectory)
displayName: Compressed ROCm size
- task: PublishPipelineArtifact@1
displayName: 'Public ROCm Nightly Artifact'
retryCountOnTaskFailure: 3
inputs:
targetPath: '$(Build.ArtifactStagingDirectory)'

View File

@@ -11,6 +11,9 @@ parameters:
- name: cmakeBuildDir
type: string
default: 'build'
- name: cmakeSourceDir
type: string
default: '..'
- name: cmakeTarget
type: string
default: 'install'
@@ -35,9 +38,11 @@ steps:
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
${{ if eq(parameters.customInstallPath, true) }}:
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ..
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
${{ else }}:
cmakeArgs: ${{ parameters.extraBuildFlags }} ..
- script: df -h
displayName: Disk space before build
# equivalent to running make $cmakeTargetDir from $cmakeBuildDir
# i.e., cd $cmakeBuildDir; make $cmakeTargetDir
- task: CMake@1
@@ -46,6 +51,8 @@ steps:
workingDirectory: ${{ parameters.cmakeBuildDir }}
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} ${{ parameters.multithreadFlag }}'
retryCountOnTaskFailure: 10
- script: df -h
displayName: Disk space after build
# equivalent to running make $cmakeTarget from $cmakeBuildDir
# e.g., make install
- ${{ if eq(parameters.installEnabled, true) }}:

View File

@@ -21,6 +21,8 @@ variables:
value: rocm-ci_ultra_build_pool
- name: ON_PREM_BUILD_POOL
value: rocm-ci_build_pool
- name: LARGE_DISK_BUILD_POOL
value: rocm-ci_larger_base_disk_pool
- name: LATEST_RELEASE_TAG
value: rocm-6.1.0
- name: DOCKER_IMAGE_NAME

View File

@@ -2,6 +2,7 @@ AAC
ABI
ACE
ACEs
ACS
AccVGPR
AccVGPRs
ALU
@@ -12,6 +13,7 @@ AMDMIGraphX
AMI
AOCC
AOMP
APBDIS
APIC
APIs
APU
@@ -24,11 +26,13 @@ ATI
AddressSanitizer
AlexNet
Arb
BARs
BLAS
BMC
BitCode
Blit
Bluefield
Bootloader
CCD
CDNA
CIFAR
@@ -43,6 +47,7 @@ CPF
CPP
CPU
CPUs
Cron
CSC
CSE
CSV
@@ -62,7 +67,10 @@ CommonMark
Concretized
Conda
ConnectX
DDR
DF
DGEMM
DIMM
DKMS
DL
DMA
@@ -91,7 +99,9 @@ FFmpeg
FHS
FMA
FP
FX
Filesystem
FindDb
Flang
Fortran
Fuyu
@@ -124,6 +134,7 @@ GitHub
Gitpod
HBM
HCA
HGX
HIPCC
HIPExtension
HIPIFY
@@ -133,12 +144,14 @@ HPE
HPL
HSA
HWE
HWS
Haswell
Higgs
Hyperparameters
ICV
IDE
IDEs
IFWI
IMDb
IOMMU
IOP
@@ -148,6 +161,7 @@ IRQ
ISA
ISV
ISVs
ITL
ImageNet
InfiniBand
Inlines
@@ -159,6 +173,7 @@ JSON
Jupyter
KFD
KiB
KV
KVM
Keras
Khronos
@@ -193,6 +208,7 @@ MVFFR
Makefile
Makefiles
Matplotlib
Megatrends
Megatron
Mellanox
Mellanox's
@@ -208,6 +224,7 @@ NIC
NICs
NLI
NLP
NPKit
NPS
NSP
NUMA
@@ -240,16 +257,19 @@ OpenMP
OpenMPI
OpenSSL
OpenVX
PCC
PCI
PCIe
PEFT
PIL
PILImage
POR
PRNG
PRs
PaLM
Pageable
PeerDirect
PerfDb
Perfetto
PipelineParallel
PnP
@@ -288,6 +308,7 @@ SBIOS
SCA
SDK
SDMA
SDPA
SDRAM
SENDMSG
SGPR
@@ -309,10 +330,12 @@ SRAMECC
SVD
SWE
SerDes
ShareGPT
Shlens
Skylake
Softmax
Spack
SplitK
Supermicro
Szegedy
TCA
@@ -323,8 +346,12 @@ TCP
TCR
TF
TFLOPS
TP
TPU
TPUs
TSME
Tagram
TensileLite
TensorBoard
TensorFlow
TensorParallel
@@ -345,6 +372,7 @@ USM
UTCL
UTIL
Uncached
Unittests
Unhandled
VALU
VBIOS
@@ -433,6 +461,7 @@ cuLIB
cuRAND
cuSOLVER
cuSPARSE
cTDP
dataset
datasets
dataspace
@@ -467,6 +496,7 @@ executables
ffmpeg
filesystem
fortran
fp
galb
gcc
gdb
@@ -480,6 +510,7 @@ gzip
heterogenous
hipBLAS
hipBLASLt
hipBLASLt's
hipCUB
hipFFT
hipLIB
@@ -496,6 +527,7 @@ hipfort
hipify
hipsolver
hipsparse
hotspotting
hpc
hpp
hsa
@@ -504,6 +536,7 @@ hyperparameter
ib_core
inband
incrementing
inductor
inferencing
inflight
init
@@ -561,6 +594,8 @@ prebuilt
precompiled
prefetch
prefetchable
prefill
prefills
preprocess
preprocessed
preprocessing
@@ -685,6 +720,7 @@ writebacks
wrreq
wzo
xargs
xGMI
xz
yaml
ysvmadyb

View File

@@ -1,13 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-6.1.2"
<default revision="refs/tags/rocm-6.1.5"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCm-->
<project name="ROCK-Kernel-Driver" />
<project name="ROCR-Runtime" />
<project name="ROCK-Kernel-Driver" revision="80920ea8b3c3b9c0b65a464a7320953eb42aea39" />
<project name="ROCR-Runtime" revision="397fa1dd40228311fc15824ba15f8bfaac537fe9" />
<project name="ROCT-Thunk-Interface" />
<project name="amdsmi" />
<project name="clang-ocl" />
@@ -22,7 +22,7 @@
<!--HIP Projects-->
<project name="HIP" />
<project name="hip-tests" />
<project name="HIP-Examples" />
<project name="HIP-Examples" revision="41b0cff8077a25390c2bbda827eb9f6f37ec1ef3" />
<project name="HIPIFY" />
<project name="clr" />
<project name="hipother" />

View File

@@ -416,7 +416,7 @@ description, refer to the corresponding library data type support page.
- -/✅
- -/✅
*
- hipRAND (:doc:`details <hiprand:data-type-support>`)
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
- -/✅
- -/✅
- -/✅
@@ -428,7 +428,7 @@ description, refer to the corresponding library data type support page.
- ✅/✅
- ✅/✅
*
- hipCUB (:doc:`details <hipcub:data-type-support>`)
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
- ✅/✅
- ✅/✅
- ✅/✅
@@ -474,7 +474,7 @@ description, refer to the corresponding library data type support page.
- -/✅
- -/✅
*
- hipRAND (:doc:`details <hiprand:data-type-support>`)
- hipRAND (:doc:`details <hiprand:api-reference/data-type-support>`)
- -/❌
- -/❌
- -/✅
@@ -492,7 +492,7 @@ description, refer to the corresponding library data type support page.
- ✅/✅
- ✅/✅
*
- hipCUB (:doc:`details <hipcub:data-type-support>`)
- hipCUB (:doc:`details <hipcub:api-reference/data-type-support>`)
- ❌/❌
- ❌/❌
- ✅/✅

View File

@@ -10,7 +10,7 @@ GPU computational elements of the processor along with the lower levels of the c
The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.
```{figure} ../../data/conceptual/gpu-arch/image007.png
```{figure} ../../data/shared/xcd-sys-arch.png
---
name: mi300-xcd
align: center
@@ -103,7 +103,7 @@ MI300 series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, w
## Node-level architecture
```{figure} ../../data/conceptual/gpu-arch/image009.png
```{figure} ../../data/shared/mi300-node-level-arch.png
---
name: mi300-node

View File

@@ -51,7 +51,7 @@ In HIP, pinned memory allocations are coherent by default (`hipHostMallocDefault
There are additional pinned memory flags (e.g. `hipHostMallocMapped` and `hipHostMallocPortable`).
On MI200 these options do not impact performance.
<!-- TODO: link to programming_manual#memory-allocation-flags -->
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:user_guide/programming_manual`.
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:how-to/programming_manual`.
:::
Much like how a process can be locked to a CPU core by setting affinity, a pinned memory allocator does this with the memory storage system.

View File

@@ -424,4 +424,8 @@ Shadow byte legend (one shadow byte represents 8 application bytes):
* Lack of detection on the GPU might also be due to the implementation not instrumenting accesses to all GPU specific address spaces. For example, in the current implementation accesses to "private" or "stack" variables on the GPU are not instrumented, and accesses to HIP shared variables (also known as "local data store" or "LDS") are also not instrumented.
* It can also be the case that a memory fault is hit for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
* It can also be the case that a memory fault is reported for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
* There is currently a bug which can result in memory faults being reported when running instrumented device code which makes use of `malloc`, `free`, `new`, or `delete`.
* There is currently a bug which can result in undefined symbols being reported at compile time when instrumented device code makes use of `new` and `delete`.

View File

@@ -5,25 +5,10 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import shutil
import jinja2
import os
# Environment to process Jinja templates.
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
# Jinja templates to render out.
templates = []
# Render templates and output files without the last extension.
# For example: 'install.md.jinja' becomes 'install.md'.
for template in templates:
    rendered = jinja_env.get_template(template).render()
    with open(os.path.splitext(template)[0], 'w') as file:
        file.write(rendered)
shutil.copy2('../RELEASE.md','./about/release-notes.md')
# Keep capitalization due to similar linking on GitHub's markdown preview.
shutil.copy2('../CHANGELOG.md','./about/changelog.md')
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
shutil.copy2("../CHANGELOG.md", "./about/changelog.md")
latex_engine = "xelatex"
latex_elements = {
@@ -46,25 +31,62 @@ all_article_info_author = ""
# pages with specific settings
article_pages = [
{"file": "about/release-notes", "os": ["linux", "windows"], "date": "2024-06-04"},
{"file": "about/changelog", "os": ["linux", "windows"], "date": "2024-06-04"},
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/train-a-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/deploy-your-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/hugging-face-models", "os": ["linux"]},
{"file": "how-to/rocm-for-hpc/index", "os": ["linux"]},
{"file": "how-to/llm-fine-tuning-optimization/index", "os": ["linux"]},
{"file": "how-to/llm-fine-tuning-optimization/overview", "os": ["linux"]},
{
"file":"about/release-notes",
"os":["linux", "windows"],
"date":"2024-06-04"
"file": "how-to/llm-fine-tuning-optimization/fine-tuning-and-inference",
"os": ["linux"],
},
{
"file":"about/changelog",
"os":["linux", "windows"],
"date":"2024-06-04"
"file": "how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference",
"os": ["linux"],
},
{"file":"how-to/deep-learning-rocm", "os":["linux"]},
{"file":"how-to/gpu-enabled-mpi", "os":["linux"]},
{"file":"how-to/system-debugging", "os":["linux"]},
{"file":"how-to/tuning-guides", "os":["linux", "windows"]},
{
"file": "how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/llm-inference-frameworks",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/model-acceleration-libraries",
"os": ["linux"],
},
{"file": "how-to/llm-fine-tuning-optimization/model-quantization", "os": ["linux"]},
{
"file": "how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/optimizing-triton-kernel",
"os": ["linux"],
},
{
"file": "how-to/llm-fine-tuning-optimization/profiling-and-debugging",
"os": ["linux"],
},
{"file": "how-to/system-optimization/index", "os": ["linux"]},
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
{"file": "how-to/system-debugging", "os": ["linux"]},
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
]
exclude_patterns = ['temp']
external_toc_path = "./sphinx/_toc.yml"
extensions = ["rocm_docs", "sphinx_reredirects"]
@@ -79,10 +101,8 @@ html_css_files = ["rocm_custom.css"]
html_title = "ROCm Documentation"
html_theme_options = {
"link_main_doc": False
}
html_theme_options = {"link_main_doc": False}
redirects = {
"reference/openmp/openmp": "../../about/compatibility/openmp.html"
}
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
numfig = False

Binary image files changed (binary diffs not shown): several existing figures were moved or updated, and multiple new tuning-guide images were added.

View File

@@ -19,7 +19,7 @@ The following guides cover installation processes for ROCm-aware deep learning f
The following chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
.. image:: ../data/how-to/framework_install_2024_05_23.png
.. image:: ../data/how-to/framework_install_2024_07_04.png
:alt: Flowchart for installing ROCm-aware machine learning frameworks
:align: center

View File

@@ -135,6 +135,8 @@ Installing vLLM
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
Refer to :ref:`mi300x-vllm-optimization` for performance optimization tips.
.. _fine-tuning-llms-tgi:
Hugging Face TGI

View File

@@ -8,6 +8,8 @@ Model acceleration libraries
This section discusses model acceleration techniques and libraries to improve memory efficiency and performance.
.. _acceleration-flash-attention:
Flash Attention 2
=================

View File

@@ -161,6 +161,7 @@ kernels by configuring the ``exllama_config`` parameter as the following.
base_model_name,
device_map="auto",
quantization_config=gptq_config)
bitsandbytes
============

View File

@@ -6,378 +6,24 @@
Optimizing Triton kernels
*************************
This section introduces the general steps for `Triton <https://openai.com/index/triton/>`_ kernel optimization. Broadly,
Triton kernel optimization is similar to HIP and CUDA kernel optimization.
This section introduces the general steps for
`Triton <https://openai.com/index/triton/>`_ kernel optimization. Broadly,
Triton kernel optimization is similar to :doc:`HIP <hip:how-to/performance_guidelines>`
and CUDA kernel optimization.
.. _fine-tuning-llms-triton-memory-access-efficiency:
Refer to the
:ref:`Triton kernel performance optimization <mi300x-triton-kernel-performance-optimization>`
section of the :doc:`/how-to/tuning-guides/mi300x/workload` guide
for detailed information.
Memory access efficiency
========================
Triton kernel performance optimization includes the following topics.
The accelerator or GPU contains global memory, local data share (LDS), and registers. Global memory has high access
latency, but is large. LDS access has much lower latency, but is smaller. Register access is the fastest yet smallest
among the three.
* :ref:`mi300x-autotunable-kernel-config`
So, the data in global memory should be loaded and stored as few times as possible. If different threads in a block
need to access the same data, these data should be first transferred from global memory to LDS, then accessed by
different threads in a workgroup.
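
As an illustration of this load-once pattern, the following is a minimal, hypothetical Triton kernel (not taken from
the original document) in which each program instance reads a contiguous, masked block from global memory exactly once
and writes its result back exactly once:

.. code-block:: python

   import torch
   import triton
   import triton.language as tl

   @triton.jit
   def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
       pid = tl.program_id(axis=0)
       # Contiguous offsets give coalesced global-memory access.
       offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
       mask = offsets < n_elements
       # Each element is loaded from global memory exactly once...
       x = tl.load(x_ptr + offsets, mask=mask)
       y = tl.load(y_ptr + offsets, mask=mask)
       # ...and stored exactly once.
       tl.store(out_ptr + offsets, x + y, mask=mask)

   x = torch.randn(4096, device="cuda")
   y = torch.randn(4096, device="cuda")
   out = torch.empty_like(x)
   grid = (triton.cdiv(x.numel(), 1024),)
   add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=1024)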
* :ref:`mi300x-mlir-analysis`
.. _fine-tuning-llms-triton-hardware-resource-utilization:
* :ref:`mi300x-assembly-analysis`
Hardware resource utilization
=============================
* :ref:`mi300x-torchinductor-tuning`
Each accelerator or GPU has multiple Compute Units (CUs), which perform computation in parallel. So, how many CUs
can a compute kernel allocate its task to? For the :doc:`AMD MI300X accelerator <../../reference/gpu-arch-specs>`, the
grid should have at least 1024 thread blocks or workgroups.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/compute-unit.png
Schematic representation of a CU in the CDNA2 or CDNA3 architecture.
To increase hardware utilization and maximize parallelism, it is necessary to design algorithms that can exploit more
parallelism. One approach to achieving this is by using larger split-K techniques for General Matrix Multiply (GEMM)
operations, which can further distribute the computation across more CUs, thereby enhancing performance.
.. tip::
You can query hardware resources with the command ``rocminfo`` (in the ``/opt/rocm/bin`` directory). For instance,
query the number of CUs, number of SIMD, and wavefront size using the following commands.
.. code-block:: shell
rocminfo | grep "Compute Unit"
rocminfo | grep "SIMD"
rocminfo | grep "Wavefront Size"
On an MI300X device, there are 304 CUs, 4 SIMD per CU, and the wavefront size (warp size) is 64. See :doc:`Hardware
specifications <../../reference/gpu-arch-specs>` for a full list of AMD accelerators and GPUs.
.. _fine-tuning-llms-triton-ir-analysis:
IR analysis
===========
In Triton, there are several layouts including *blocked*, *shared*, *sliced*, and *MFMA*.
From the Triton GPU IR (intermediate representation), you can determine in which memory each computation is
performed. The following is a snippet of IR from the Flash Attention decode ``int4`` key-value program. It
de-quantizes the ``int4`` key-value data to ``fp16``.
.. code-block::
%190 = tt.load %189 {cache = 1 : i32, evict = 1 : i32, isVolatile =
false} : tensor<1x64xi32, #blocked6> loc(#loc159)
%266 = arith.andi %190, %cst_28 : tensor<1x64xi32, #blocked6>
loc(#loc250)
%267 = arith.trunci %266 : tensor<1x64xi32, #blocked6> to
tensor<1x64xi16, #blocked6> loc(#loc251)
%268 = tt.bitcast %267 : tensor<1x64xi16, #blocked6> -> tensor<1x64xf16,
#blocked6> loc(#loc252)
%269 = triton_gpu.convert_layout %268 : (tensor<1x64xf16, #blocked6>) ->
tensor<1x64xf16, #shared1> loc(#loc252)
%270 = tt.trans %269 : (tensor<1x64xf16, #shared1>) -> tensor<64x1xf16,
#shared2> loc(#loc194)
%276 = triton_gpu.convert_layout %270 : (tensor<64x1xf16, #shared2>) ->
tensor<64x1xf16, #blocked5> loc(#loc254)
%293 = arith.mulf %276, %cst_30 : tensor<64x1xf16, #blocked5>
loc(#loc254)
%295 = arith.mulf %292, %294 : tensor<64x32xf16, #blocked5> loc(#loc264)
%297 = arith.addf %295, %296 : tensor<64x32xf16, #blocked5> loc(#loc255)
%298 = triton_gpu.convert_layout %297 : (tensor<64x32xf16, #blocked5>)
-> tensor<64x32xf16, #shared1> loc(#loc255)
%299 = tt.trans %298 : (tensor<64x32xf16, #shared1>) ->
tensor<32x64xf16, #shared2> loc(#loc196)
%300 = triton_gpu.convert_layout %299 : (tensor<32x64xf16, #shared2>) ->
tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth
= 4}>> loc(#loc197)
From the IR, you can see that ``i32`` data is loaded from global memory to registers. After a few element-wise
operations in registers, the data is stored in shared memory for the transpose operation, which needs data movement
across different threads. With the transpose done, the data is loaded from LDS to registers again, and after a few
more element-wise operations, it is stored in LDS again. The last step loads from LDS to registers and converts to the
dot-operand layout. In short, this IR uses the LDS twice: once for the transpose, and once to convert the blocked
layout to a dot-operand layout.
Assembly analysis
=================
In the ISA, ensure ``global_load_dwordx4`` is used, especially when the
load happens in a loop.
In most cases, the LDS load and store should use ``_b128`` as well to
minimize the number of LDS access instructions. Note that the upstream backend might not have ``_b128`` LDS
read/write, in which case it uses ``_b64``. In most cases, whether you use the fork or upstream,
the LDS access should have ``_b64`` vector width.
The AMD ISA has the ``s_waitcnt`` instruction to synchronize the dependency
of memory access and computations. The ``s_waitcnt`` instruction can
have two signals, typically in the context of Triton:
* ``lgkmcnt(n)``: `lgkm` stands for LDS, GDS, Constant and Message.
  In this context, it is most often related to LDS access. The number ``n`` is the number of such accesses that may
  still be outstanding when execution continues. For example, ``0`` means all ``lgkm`` accesses must finish before
  continuing, and ``1`` means one ``lgkm`` access may still be running asynchronously when proceeding.
* ``vmcnt(n)``: `vm` means vector memory.
  This applies when vector memory is accessed, for example, when a global load moves data from global memory to vector
  memory. Again, ``n`` is the number of accesses that may still be outstanding when execution continues.
Generally recommended guidelines are as follows.
* Vectorize memory access as much as possible.
* Ensure synchronization is done efficiently.
* Overlap instructions to hide latency; this requires thoughtful
  analysis of the algorithms.
* If you find inefficiencies, you can trace them back to LLVM IR, TTGIR,
  and even TTIR to see where the problem comes from. If you find the problem
  during compiler optimization, activate the MLIR dump and check which
  optimization pass caused it.
.. _fine-tuning-llms-triton-kernel-occupancy:
Kernel occupancy
================
1. Get the VGPR count: search for ``.vgpr_count`` in the ISA (call the value ``N``).
2. Get the allocated LDS using the following steps (call the value ``L`` for the kernel).

   a. ``export MLIR_ENABLE_DUMP=1``
   b. ``rm -rf ~/.triton/cache``
   c. ``python kernel.py | grep "triton_gpu.shared = " | tail -n 1``
   d. You should see something like ``triton_gpu.shared = 65536``, indicating 65536 bytes of LDS are allocated for the
      kernel.

3. Get the number of waves per workgroup using the following steps (call the value ``nW``).

   a. ``export MLIR_ENABLE_DUMP=1``
   b. ``rm -rf ~/.triton/cache``
   c. ``python kernel.py | grep "triton_gpu.num-warps" | tail -n 1``
   d. You should see something like ``"triton_gpu.num-warps" = 8``, indicating 8 waves per workgroup.

4. Compute the occupancy limited by VGPR based on ``N`` according to the following table. Call the resulting waves per
   EU ``occ_vgpr``.
.. _fine-tuning-llms-occupancy-vgpr-table:
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/occupancy-vgpr.png
:alt: Occupancy related to VGPR usage in an Instinct MI300X accelerator.
:align: center
5. Compute occupancy limited by LDS based on L by: ``occ_lds = floor(65536 / L)``.
6. Then the occupancy is ``occ = min(floor(occ_vgpr * 4 / nW), occ_lds) * nW / 4``.

   a. ``occ_vgpr * 4`` gives the total number of waves on all 4 execution units (SIMDs)
      per CU.
   b. ``floor(occ_vgpr * 4 / nW)`` gives the occupancy of workgroups per CU
      regarding VGPR usage.
   c. The true ``occ`` is the minimum of the two; a sketch of this arithmetic follows.
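
A minimal sketch, in Python, of the occupancy arithmetic above, assuming the MI300X values used in this section
(65536 bytes of LDS per CU and 4 SIMDs per CU); the helper function name is hypothetical:

.. code-block:: python

   def kernel_occupancy(occ_vgpr: int, lds_bytes: int, n_w: int) -> float:
       """Combine the VGPR-limited and LDS-limited occupancies (steps 4-6)."""
       occ_lds = 65536 // lds_bytes                   # step 5: occ_lds = floor(65536 / L)
       wg_per_cu = min(occ_vgpr * 4 // n_w, occ_lds)  # workgroups per CU
       return wg_per_cu * n_w / 4                     # step 6: waves per EU

   # Example: occ_vgpr = 2 (from the VGPR table), L = 32768 bytes, nW = 8 waves.
   print(kernel_occupancy(occ_vgpr=2, lds_bytes=32768, n_w=8))  # -> 2.0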
.. _fine-tuning-llms-triton-kernel-configs-env-vars:
Auto-tunable kernel configurations and environment variables
============================================================
This section relates to the amount of :ref:`memory access <fine-tuning-llms-triton-memory-access-efficiency>` and
computation assigned to each CU. It is related to the usage of LDS, registers and the scheduling of different tasks on
a CU.
The following is a list of kernel arguments used for tuning.
``num_stages=n``
Adjusts the number of pipeline stages for different types of kernels. On AMD accelerators, set ``num_stages``
according to the following rules:
* For kernels with a single GEMM, set to ``0``.
* For kernels with two GEMMs fused (Flash Attention, or any other kernel
that fuses 2 GEMMs), set to ``1``.
* For kernels that fuse a single GEMM with another non-GEMM operator
(for example ReLU activation), set to ``0``.
* For kernels that have no GEMMs, set to ``1``.
``waves_per_eu=n``
Helps to manage Vector General Purpose Registers (VGPR) usage to achieve desired occupancy levels. This argument
hints to the compiler to reduce VGPR to achieve ``n`` occupancy. See
:ref:`Kernel occupancy <fine-tuning-llms-triton-kernel-occupancy>` for more information about how to compute
occupancy.
This argument is useful if:
* The occupancy of the kernel is limited by VGPR usage.
* The current VGPR usage is only a few above a boundary in
:ref:`Occupancy related to VGPR usage in an Instinct MI300X accelerator <fine-tuning-llms-occupancy-vgpr-table>`.
For example, according to the table, the available VGPR is 512 per Execution Unit (EU), and VGPR is allocated in
units of 16. If the current VGPR usage is 170, the actual requested VGPR will be 176, so the
occupancy is only 2 waves per EU since :math:`176 \times 3 > 512`. So, if you set
``waves_per_eu`` to 3, the LLVM backend tries to bring VGPR usage down so
that it might fit 3 waves per EU.
``BLOCK_M``, ``BLOCK_N``, ``BLOCK_K``
Tile sizes to be tuned to balance the memory-to-computation ratio. You want tile sizes large enough to
maximize the efficiency of memory-to-computation ratio, but small enough to parallelize the greatest number of
workgroups at the grid level.
``matrix_instr_nonkdim``
Experimental feature for Flash Attention-like kernels that determines the size of the Matrix Fused Multiply-Add
(MFMA) instruction used.
- ``matrix_instr_nonkdim = 16``: ``mfma_16x16`` is used.
- ``matrix_instr_nonkdim = 32``: ``mfma_32x32`` is used.
For GEMM kernels on an AMD MI300X accelerator, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
tile/GEMM sizes.
The following is an environment variable used for tuning.
``OPTIMIZE_EPILOGUE``
Setting this variable to ``1`` can improve performance by removing the ``convert_layout`` operation in the epilogue.
It should be turned on (set to ``1``) in most cases. Setting ``OPTIMIZE_EPILOGUE=1`` stores the MFMA instruction
results in the MFMA layout directly; this comes at the cost of reduced global store efficiency, but the impact on
kernel execution time is usually minimal.
By default (``0``), the results of MFMA instruction are converted to blocked layout, which leads to ``global_store``
with maximum vector length, that is ``global_store_dwordx4``.
This is done implicitly with LDS as the intermediate buffer to achieve
data exchange between threads. Padding is used in LDS to avoid bank
conflicts. This usually leads to extra LDS usage, which might reduce
occupancy.
.. note::
This variable is not turned on by default because it only
works with ``tt.store`` but not ``tt.atomic_add``, which is used in split-k and
stream-k GEMM kernels. In the future, it might be enabled with
``tt.atomic_add`` and turned on by default.
See :ref:`IR analysis <fine-tuning-llms-triton-ir-analysis>`.
TorchInductor with Triton tuning knobs
===========================================
The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (``conv``) operations in PyTorch
using ``inductor``, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better
performance.
Learn more about TorchInductor environment variables and usage in
`PyTorch documentation <https://pytorch.org/docs/2.3/torch.compiler_inductor_profiling.html>`_.
Enabling ``gemm``/``conv`` lowering to Triton requires use of ``inductor``'s ``max_autotune`` mode. This benchmarks a
static list of Triton configurations (``conv`` configurations for max auto-tune + ``matmul`` configurations for max
auto-tune) and uses the fastest for each shape. Note that Triton is not used if regular :doc:`MIOpen <miopen:index>`
or :doc:`rocBLAS <rocblas:index>` is faster for a specific operation.
* Set ``torch._inductor.config.max_autotune = True`` or ``TORCHINDUCTOR_MAX_AUTOTUNE=1`` (see the sketch after this list).
* Or, for more fine-grained control:
``torch._inductor.config.max_autotune_pointwise = True``
To enable tuning for ``pointwise``/``reduction`` ops.
``torch._inductor.config.max_autotune_gemm = True``
To enable tuning or lowering of ``mm``/``conv`` ops.
``torch._inductor.config.max_autotune_gemm_backends`` / ``TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS``
To select the candidate backends for ``mm`` auto-tuning. Defaults to
``TRITON,ATEN,NV``. This also includes the ``CUTLASS`` tuning option. Limiting this to
``TRITON`` might improve performance by enabling more fused ``mm`` kernels
instead of going to rocBLAS.
* For ``mm`` tuning, tuning ``coordinate_descent`` might improve performance.
``torch._inductor.config.coordinate_descent_tuning = True`` or ``TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1``
* Inference can see large improvements on AMD GPUs by utilizing
``torch._inductor.config.freezing=True`` or the ``TORCHINDUCTOR_FREEZING=1`` variable, which
in-lines weights as constants and enables constant folding optimizations.
* Enabling ``inductor``'s ``cpp_wrapper`` might reduce launch overhead. This generates
C++ code which launches Triton binaries directly with
``hipModuleLaunchKernel`` and relies on `hipification`.
* For NHWC convolution workloads,
``torch._inductor.config.layout_optimization=True`` or ``TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1``
can help by enforcing the channels-last format throughout the graph, avoiding
any additional transposes added by ``inductor``. Note that setting
``PYTORCH_MIOPEN_SUGGEST_NHWC=1`` is recommended if using this.
* To extract the Triton kernels, setting ``TORCH_COMPILE_DEBUG=1`` creates a
``torch_compile_debug/`` directory at the current path; its ``output_code.py``
contains the code strings for the Triton kernels that were defined. Manual work is
then required to strip out a kernel and set up its
compilation and launch via Triton.
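
A minimal sketch (not from the original document) combining the knobs named above with ``torch.compile``; the model
and shapes are placeholders:

.. code-block:: python

   import torch
   import torch._inductor.config as inductor_config

   # Knobs named in this section; the TORCHINDUCTOR_* environment variables
   # can be set instead of assigning these attributes in code.
   inductor_config.max_autotune = True                # benchmark Triton gemm/conv configs
   inductor_config.coordinate_descent_tuning = True   # extra mm tuning
   inductor_config.freezing = True                    # inference: in-line weights as constants

   model = torch.nn.Linear(1024, 1024).cuda().eval()
   compiled = torch.compile(model)

   with torch.no_grad():  # freezing applies to inference graphs
       out = compiled(torch.randn(64, 1024, device="cuda"))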
Other guidelines
================
* For performance-critical kernels, HIP provides an environment variable, ``export HIP_FORCE_DEV_KERNARG=1``,
that can place HIP kernel arguments directly in
device memory to reduce the latency of accessing kernel arguments. It
can save 2 to 3 μs for some kernels. Setting this variable for the FA
decode workload containing ``splitK`` and reduce kernels can cut the total time
by around 6 μs in the benchmark test.
* Set the clock to deterministic. Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed to
1900 MHz instead of the default 2100 MHz. This can reduce the chance of a clock speed decrease due to high chip
temperature by setting a lower cap. You can restore this setting to its default value with ``rocm-smi -r``.
* Set Non-Uniform Memory Access (NUMA) auto-balance. Run the command ``cat /proc/sys/kernel/numa_balancing`` to check the
current setting. An output of ``0`` indicates auto-balancing is disabled. If the output is ``1``, run the command
``sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'`` to set this.
For these settings, the ``env_check.sh`` script automates setting, resetting, and checking such
environment settings. Find the script at `<https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh>`__.
.. _fine-tuning-llms-triton-tunableop:
TunableOp
---------
`TunableOp <https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md>`_
is a feature used to define and optimize kernels that can have tunable parameters. This is useful in
optimizing the performance of custom kernels by exploring different parameter configurations to find the most efficient
setup. See more about PyTorch TunableOp in :ref:`Model acceleration libraries <fine-tuning-llms-pytorch-tunableop>`.
You can easily manipulate the behavior of TunableOp through environment variables, though you could also use the C++ interface
``at::cuda::tunable::getTuningContext()``. A Python interface to the ``TuningContext`` does not yet exist.
The default value is ``0``, which means only 1 iteration is attempted. Remember: there's an overhead to tuning. To try
and minimize the overhead, only a limited number of iterations of a given operation are attempted. If you set this to
``10``, each solution for a given operation can run as many iterations as possible within 10 ms. There is a hard-coded
upper limit of 100 iterations attempted per solution. This is a tuning parameter; if you want the tunings to be chosen
based on an average over multiple iterations, increase the allowed tuning duration.
* :ref:`mi300x-compute-kernel-occ`
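
The following is a minimal sketch of the environment-variable workflow described above; the ``PYTORCH_TUNABLEOP_*``
names are taken from the linked TunableOp README and should be verified against your PyTorch version:

.. code-block:: python

   import os

   # Enable TunableOp before importing torch (names from the TunableOp README).
   os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"                  # turn the feature on
   os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"                   # tune unseen operations
   os.environ["PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS"] = "10"  # per-solution budget

   import torch

   a = torch.randn(1024, 1024, device="cuda")
   b = torch.randn(1024, 1024, device="cuda")
   c = a @ b  # the first GEMM call triggers tuning; the result is cached for reuse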

View File

@@ -6,7 +6,7 @@
# Optimizing with Composable Kernel
The AMD ROCm&trade; Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
The AMD ROCm Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
This article gives a high-level overview of CK General Matrix Multiplication (GEMM) kernel based on the design example of `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X accelerators using CK.

View File

@@ -6,212 +6,24 @@
Profiling and debugging
***********************
This section discusses profiling and debugging tools and some of their common usage patterns with ROCm applications.
This section provides an index for further documentation on profiling and
debugging tools and their common usage patterns.
PyTorch Profiler
================
See :ref:`AMD Instinct MI300X™ workload optimization <mi300x-profiling-start>`
for a conceptual summary of the workload profiling workflow for ROCm applications
on AMD hardware -- including fine-tuning LLMs.
`PyTorch Profiler <https://pytorch.org/docs/stable/profiler.html>`_ can be invoked inside Python scripts, letting you
collect CPU and GPU performance metrics while the script is running. See the `PyTorch Profiler tutorial
<https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html>`_ for more information.
There, you'll find information on higher-level and kernel-level profiling tools
as well as other profiling and debugging suggestions.
You can then visualize and view these metrics using an open-source profile visualization tool like
`Perfetto UI <https://ui.perfetto.dev>`_.
* :ref:`PyTorch Profiler <mi300x-pytorch-profiler>`
#. Use the following snippet to invoke PyTorch Profiler in your code.
* :ref:`ROCm profiling tools <mi300x-profiling-tools>`
.. code-block:: python
* :ref:`ROCProfiler <mi300x-rocprof>`
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
model = models.resnet18().cuda()
inputs = torch.randn(2000, 3, 224, 224).cuda()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
with record_function("model_inference"):
model(inputs)
prof.export_chrome_trace("resnet18_profile.json")
* :ref:`Omniperf <mi300x-omniperf>`
#. Profile results in ``resnet18_profile.json`` can be viewed with the Perfetto visualization tool. Go to
`<https://ui.perfetto.dev>`__ and import the file. In your Perfetto visualization, you'll see that the upper section
shows transactions denoting the CPU activities that launch GPU kernels, while the lower section shows the actual GPU
activities processing the ``resnet18`` inference layer by layer.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/perfetto-trace.svg
Perfetto trace visualization example.
ROCm profiling tools
====================
Heterogeneous systems, where programs run on both CPUs and GPUs, introduce additional complexities. Understanding the
critical path and kernel execution is all the more important, so performance tuning is a necessary component in the
benchmarking process.
With AMD's profiling tools, developers are able to gain important insight into how efficiently their application is
using hardware resources and effectively diagnose potential bottlenecks contributing to poor performance. Developers
working with AMD Instinct accelerators have multiple tools depending on their specific profiling needs; these are:
* :ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>`
* :ref:`Omniperf <fine-tuning-llms-profiling-omniperf>`
* :ref:`Omnitrace <fine-tuning-llms-profiling-omnitrace>`
.. _fine-tuning-llms-profiling-rocprof:
ROCProfiler
-----------
:doc:`ROCProfiler <rocprofiler:index>` is primarily a low-level API for accessing and extracting GPU hardware performance
metrics, commonly called *performance counters*. These counters quantify the performance of the underlying architecture,
showcasing which pieces of the computational pipeline and memory hierarchy are being utilized.
Your ROCm installation contains a script or executable command called ``rocprof``, which can list all
available hardware counters for your specific accelerator or GPU, and run applications while collecting counters during
their execution.
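For instance, a typical invocation might look like the following sketch, where the application name is a placeholder:

.. code-block:: shell

   # List the basic hardware counters available on this GPU.
   rocprof --list-basic

   # Trace HIP API calls and kernels, with aggregate kernel statistics.
   rocprof --hip-trace --stats ./my_application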
This ``rocprof`` utility also depends on the :doc:`ROCTracer and ROC-TX libraries <roctracer:index>`, giving it the
ability to collect timeline traces of the accelerator software stack as well as user-annotated code regions.
.. note::
``rocprof`` is a CLI-only utility, so input and output take the format of ``.txt`` and CSV files. These
formats provide a raw view of the data and put the onus on the user to parse and analyze it. Therefore, ``rocprof``
gives the user full access to and control of raw performance profiling data, but requires extra effort to analyze the
collected data.
.. _fine-tuning-llms-profiling-omniperf:
Omniperf
--------
`Omniperf <https://rocm.github.io/omniperf>`_ is a system performance profiler for high-performance computing (HPC) and
machine learning (ML) workloads using Instinct accelerators. Under the hood, Omniperf uses
:ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>` to collect hardware performance counters. The Omniperf tool performs
system profiling based on all approved hardware counters for Instinct
accelerator architectures. It provides high-level performance analysis features including System Speed-of-Light, IP
block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more.
Omniperf takes the guesswork out of profiling by removing the need to provide text input files with lists of counters
to collect and to analyze raw CSV output files, as is the case with ROCProfiler. Instead, Omniperf automates the collection
of all available hardware counters in one command and provides a graphical interface to help users understand and
analyze bottlenecks and stressors for their computational workloads on AMD Instinct accelerators.
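As a sketch of this workflow, where the workload name and application are placeholders:

.. code-block:: shell

   # Profile an application; counters are collected over multiple passes.
   omniperf profile -n my_workload -- ./my_application

   # Analyze the collected results from the command line
   # (the output directory layout may differ by Omniperf version).
   omniperf analyze -p workloads/my_workload/MI300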
.. note::
Omniperf collects hardware counters in multiple passes, and will therefore re-run the application during each pass
to collect different sets of metrics.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/omniperf-analysis.png
Omniperf memory chart analysis panel.
In brief, Omniperf provides details about hardware activity for a particular GPU kernel. It also supports both
a web-based GUI and a command-line analyzer, depending on your preference.
.. _fine-tuning-llms-profiling-omnitrace:
Omnitrace
---------
`Omnitrace <https://rocm.github.io/omnitrace>`_ is a comprehensive profiling and tracing tool for parallel applications,
including HPC and ML packages, written in C, C++, Fortran, HIP, OpenCL, and Python which execute on the CPU or CPU and
GPU. It is capable of gathering the performance information of functions through any combination of binary
instrumentation, call-stack sampling, user-defined regions, and Python interpreter hooks.
Omnitrace supports interactive visualization of comprehensive traces in the web browser in addition to high-level
summary profiles with ``mean/min/max/stddev`` statistics. Beyond runtime
information, Omnitrace supports the collection of system-level metrics such as CPU frequency, GPU temperature, and GPU
utilization. Process and thread level metrics such as memory usage, page faults, context switches, and numerous other
hardware counters are also included.
.. tip::
When analyzing the performance of an application, it is best not to assume you know where the performance
bottlenecks are and why they are happening. Omnitrace is the ideal tool for characterizing where optimization would
have the greatest impact on the end-to-end execution of the application and to discover what else is happening on the
system during a performance bottleneck.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/omnitrace-timeline.png
Omnitrace timeline trace example.
For detailed usage and examples of these tools, refer to the
`Introduction to profiling tools for AMD hardware <https://rocm.blogs.amd.com/software-tools-optimization/profilers/README.html>`_
developer blog.
Debugging with ROCr Debug Agent
===============================
:doc:`ROCr Debug Agent <rocr_debug_agent:index>` is a library that can be loaded by the ROCm platform
runtime (:doc:`ROCr <rocr-runtime:index>`) to provide the following functionalities for all AMD accelerators and GPUs
supported by the ROCm Debugger API (:doc:`ROCdbgapi <rocdbgapi:index>`).
* Print the state of all AMD accelerator or GPU wavefronts that caused a queue error; for example, causing a memory
violation, executing an ``s_trap2``, or executing an illegal instruction.
* Print the state of all AMD accelerator or GPU wavefronts by sending a ``SIGQUIT`` signal to the process in question;
  for example, by pressing ``Ctrl + \`` while the process is executing (see the sketch after this list).
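A minimal sketch of triggering the wavefront dump from another terminal, where ``my_program`` is a placeholder:

.. code-block:: shell

   # Send SIGQUIT to the running process to dump wavefront state.
   kill -QUIT "$(pidof my_program)"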
Debugging memory access faults
------------------------------
Identifying a faulting kernel is often enough to triage a memory access fault. To that end, the
`ROCr Debug Agent <https://github.com/ROCm/rocr_debug_agent/>`_ can trap a memory access fault and provide a dump of all
active wavefronts that caused the error as well as the name of the kernel. The
`ROCr Debug Agent Library README <https://github.com/ROCm/rocr_debug_agent/blob/master/README.md>`_ provides full
instructions, but in brief:
* Compiling with ``-ggdb -O0`` is recommended but not required.
* Run the application with the debug agent library preloaded, as in the sketch below.
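For example, assuming ``my_program.cpp`` as a placeholder source file and a default ``/opt/rocm`` installation:

.. code-block:: shell

   # Build with debug info and no optimization so faults map cleanly to source.
   hipcc -ggdb -O0 my_program.cpp -o my_program

   # Load the debug agent into the HSA runtime and run the program.
   HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2 HSA_ENABLE_DEBUG=1 ./my_program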
When the debug agent traps the fault, it will produce an extremely
verbose output of all wavefront registers and memory content.
Importantly, it also prints something like:
.. code-block:: shell
Disassembly for function vector_add_assert_trap(int*, int*, int*):
code object:
file:////rocm-debug-agent/build/test/rocm-debug-agent-test#offset=14309&size=31336
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
The kernel name and the code object file should be listed. In the
example above, the kernel name is ``vector_add_assert_trap``, but this might
also look like:
.. code-block:: shell
Disassembly for function memory:///path/to/codeobject#offset=1234&size=567:
In this case, it is an in-memory kernel that was generated at runtime.
Using the following environment variable, the debug agent will save all code objects to the current directory (use
``--save-code-objects=[DIR]`` to place them in another location). The code objects will be renamed from the URI format
with special characters replaced by ``_``.
.. code-block:: shell
ROCM_DEBUG_AGENT_OPTIONS="--all --save-code-objects"
Use the ``llvm-objdump`` command to disassemble the indicated in-memory
code object that has now been saved to disk. The name of the kernel is
often found inside the disassembled code object.
.. code-block:: shell
llvm-objdump --disassemble-all path/to/code-object.co
Consider turning off memory caching strategies both within the ROCm
stack and PyTorch where possible. This will give the debug agent the
best chance at finding the memory fault where it originates. Otherwise,
it could be masked by writing past the end of a cached block within a
larger allocation.
.. code-block:: shell

   # Disable the PyTorch caching allocator for HIP memory.
   PYTORCH_NO_HIP_MEMORY_CACHING=1

   # Disable the ROCm runtime's fragment (sub-)allocator.
   HSA_DISABLE_FRAGMENT_ALLOCATOR=1
* :ref:`Omnitrace <mi300x-omnitrace>`
* :ref:`ROCr Debug Agent <mi300x-rocr-debug-agent>`

View File

@@ -5,7 +5,7 @@
ROCm">
</head>
# System debugging guide
# System debugging
## ROCm language and system-level debug, flags, and environment variables
@@ -65,4 +65,4 @@ Debug messages when developing/debugging base ROCm driver. You could enable the
## PCIe-debug
For information on how to debug and profile HIP applications, see {doc}`hip:how_to_guides/debugging`
For information on how to debug and profile HIP applications, see {doc}`hip:how-to/debugging`

View File

@@ -0,0 +1,109 @@
.. meta::
:description: AMD hardware optimization for specific workloads
:keywords: high-performance computing, HPC, Instinct accelerators, Radeon,
tuning, tuning guide, AMD, ROCm
*******************
System optimization
*******************
This guide outlines system setup and tuning suggestions for AMD hardware to
optimize performance for specific types of workloads or use-cases.
High-performance computing workloads
====================================
High-performance computing (HPC) workloads have unique requirements. The default
hardware and BIOS configurations for OEM platforms may not provide optimal
performance for HPC workloads. To enable optimal HPC settings on a per-platform
and per-workload level, this chapter describes:
* BIOS settings that can impact performance
* Hardware configuration best practices
* Supported versions of operating systems
* Workload-specific recommendations for optimal BIOS and operating system
settings
There is also a discussion on the AMD Instinct™ software development
environment, including information on how to install and run the DGEMM, STREAM,
HPCG, and HPL benchmarks. This guide provides a good starting point but is
not tested exhaustively across all compilers.
Knowledge prerequisites to better understand this document and to perform tuning
for HPC applications include:
* Experience in configuring servers
* Administrative access to the server's Management Interface (BMC)
* Administrative access to the operating system
* Familiarity with the OEM server's BMC (strongly recommended)
* Familiarity with the OS specific tools for configuration, monitoring, and
troubleshooting (strongly recommended)
This document provides guidance on tuning systems with various AMD Instinct
accelerators for HPC workloads. The following sections don't comprise an
all-inclusive guide, and some items referred to may have similar, but different,
names in various OEM systems (for example, OEM-specific BIOS settings). The
following sections also provide suggestions on items that should be the initial
focus of additional, application-specific tuning.
While this guide is a good starting point, developers are encouraged to perform
their own performance testing for additional tuning.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - System optimization guide
- Architecture reference
- White papers
* - :doc:`AMD Instinct MI300X <mi300x>`
- `AMD Instinct MI300 instruction set architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf>`_
- `CDNA 3 architecture <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf>`_
* - :doc:`AMD Instinct MI200 <mi200>`
- `AMD Instinct MI200 instruction set architecture <https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf>`_
- `CDNA 2 architecture <https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf>`_
* - :doc:`AMD Instinct MI100 <mi100>`
- `AMD Instinct MI100 instruction set architecture <https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf>`_
- `CDNA architecture <https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf>`_
Workstation workloads
=====================
Workstation workloads, much like those for HPC, have a unique set of
requirements: a blend of both graphics and compute, certification, stability,
and more.
The document covers specific software requirements and processes needed to use
these GPUs for Single Root I/O Virtualization (SR-IOV) and machine learning
tasks.
The main purpose of this document is to help users utilize the RDNA™ 2 GPUs to
their full potential.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - System optimization guide
- Architecture reference
- White papers
* - :doc:`AMD Radeon PRO W6000 and V620 <w6000-v620>`
- `AMD RDNA 2 instruction set architecture <https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf>`_
- `RDNA 2 architecture <https://www.amd.com/system/files/documents/rdna2-explained-radeon-pro-W6000.pdf>`_

View File

@@ -1,11 +1,11 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="MI100 high-performance computing and tuning guide">
<meta name="keywords" content="MI100, high-performance computing, HPC, tuning, BIOS
<meta name="keywords" content="MI100, high-performance computing, HPC, BIOS
settings, NBIO, AMD, ROCm">
</head>
# MI100 high-performance computing and tuning guide
# AMD Instinct MI100 system optimization
## System settings

View File

@@ -1,11 +1,11 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="MI200 high-performance computing and tuning guide">
<meta name="keywords" content="MI200, high-performance computing, HPC, tuning, BIOS
<meta name="keywords" content="MI200, high-performance computing, HPC, BIOS
settings, NBIO, AMD, ROCm">
</head>
# MI200 high-performance computing and tuning guide
# AMD Instinct MI200 system optimization
## System settings

View File

@@ -0,0 +1,804 @@
.. meta::
:description: AMD Instinct MI300X system settings
:keywords: AMD, Instinct, MI300X, HPC, tuning, BIOS settings, NBIO, ROCm,
environment variable, performance, accelerator, GPU, EPYC, GRUB,
operating system
***************************************
AMD Instinct MI300X system optimization
***************************************
This document covers essential system settings and management practices required
to configure your system effectively. Ensuring that your system operates
correctly is the first step before delving into advanced performance tuning.
The main topics of discussion in this document are:
* :ref:`System settings <mi300x-system-settings>`
* :ref:`System BIOS settings <mi300x-bios-settings>`
* :ref:`GRUB settings <mi300x-grub-settings>`
* :ref:`Operating system settings <mi300x-os-settings>`
* :ref:`System management <mi300x-system-management>`
.. _mi300x-system-settings:
System settings
===============
This guide discusses system settings that are required to configure your system
for AMD Instinct™ MI300X accelerators. It is important to ensure a system is
functioning correctly before trying to improve its overall performance. In this
section, the settings discussed mostly ensure proper functionality of your
Instinct-based system. Some settings discussed are known to improve performance
for most applications running on an MI300X system. See
:doc:`/how-to/tuning-guides/mi300x/workload` for how to improve performance for
specific applications or workloads.
.. _mi300x-bios-settings:
System BIOS settings
--------------------
AMD EPYC 9004-based systems
^^^^^^^^^^^^^^^^^^^^^^^^^^^
For maximum MI300X GPU performance on systems with AMD EPYC™ 9004-series
processors and AMI System BIOS, the following configuration
of system BIOS settings has been validated. These settings must be used for the
qualification process and should be set as default values in the system BIOS.
Analogous settings for other non-AMI System BIOS providers could be set
similarly. For systems with Intel processors, some settings may not apply or be
available as listed in the following table.
Each row in the table details a setting, but its specific location within the
BIOS setup menus may differ, or the option may not be present.
.. list-table::
:header-rows: 1
* - BIOS setting location
- Parameter
- Value
- Comments
* - Advanced / PCI subsystem settings
- Above 4G decoding
- Enabled
- GPU large BAR support.
* - Advanced / PCI subsystem settings
- SR-IOV support
- Enabled
- Enable single root IO virtualization.
* - AMD CBS / CPU common options
- Global C-state control
- Auto
- Global C-states -- do not disable this menu item.
* - AMD CBS / CPU common options
- CCD/Core/Thread enablement
- Accept
- May be necessary to enable the SMT control menu.
* - AMD CBS / CPU common options / performance
- SMT control
- Disable
- Set to Auto if the primary application is not compute-bound.
* - AMD CBS / DF common options / memory addressing
- NUMA nodes per socket
- Auto
- Auto = NPS1. At this time, the other options for NUMA nodes per socket
should not be used.
* - AMD CBS / DF common options / memory addressing
- Memory interleaving
- Auto
- Depends on NUMA nodes (NPS) setting.
* - AMD CBS / DF common options / link
- 4-link xGMI max speed
- 32 Gbps
- Auto results in the speed being set to the lower of the max speed the
motherboard is designed to support and the max speed of the CPU in use.
* - AMD CBS / NBIO common options
- IOMMU
- Enabled
-
* - AMD CBS / NBIO common options
- PCIe ten bit tag support
- Auto
-
* - AMD CBS / NBIO common options / SMU common options
- Determinism control
- Manual
-
* - AMD CBS / NBIO common options / SMU common options
- Determinism slider
- Power
-
* - AMD CBS / NBIO common options / SMU common options
- cTDP control
- Manual
- Set cTDP to the maximum supported by the installed CPU.
* - AMD CBS / NBIO common options / SMU common options
- cTDP
- 400
- Value in watts.
* - AMD CBS / NBIO common options / SMU common options
- Package power limit control
- Manual
- Set package power limit to the maximum supported by the installed CPU.
* - AMD CBS / NBIO common options / SMU common options
- Package power limit
- 400
- Value in watts.
* - AMD CBS / NBIO common options / SMU common options
- xGMI link width control
- Manual
- Allows manually forcing the xGMI link width via the following two settings.
* - AMD CBS / NBIO common options / SMU common options
- xGMI force width control
- Force
-
* - AMD CBS / NBIO common options / SMU common options
- xGMI force link width
- 2
- * 0: Force xGMI link width to x2
* 1: Force xGMI link width to x8
* 2: Force xGMI link width to x16
* - AMD CBS / NBIO common options / SMU common options
- xGMI max speed
- Auto
- Auto results in the speed being set to the lower of the max speed the
motherboard is designed to support and the max speed of the CPU in use.
* - AMD CBS / NBIO common options / SMU common options
- APBDIS
- 1
- Disable DF (data fabric) P-states
* - AMD CBS / NBIO common options / SMU common options
- DF C-states
- Auto
-
* - AMD CBS / NBIO common options / SMU common options
- Fixed SOC P-state
- P0
-
* - AMD CBS / security
- TSME
- Disabled
- Memory encryption
.. _mi300x-grub-settings:
GRUB settings
-------------
In any modern Linux distribution, the ``/etc/default/grub`` file is used to
configure GRUB. In this file, the string assigned to ``GRUB_CMDLINE_LINUX``
holds the command-line parameters that Linux uses during boot.
Appending strings via Linux command line
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
It is recommended to append the following strings to ``GRUB_CMDLINE_LINUX``.
``pci=realloc=off``
With this setting, Linux can unambiguously detect all GPUs of an
MI300X-based system because it disables the automatic reallocation
of PCI resources. It's used when Single Root I/O Virtualization (SR-IOV) Base
Address Registers (BARs) have not been allocated by the BIOS. This can help
avoid potential issues with certain hardware configurations.
``iommu=pt``
The ``iommu=pt`` setting enables IOMMU pass-through mode. When in pass-through
mode, the adapter does not need to use DMA translation to the memory, which can
improve performance.
IOMMU is a system-specific I/O mapping mechanism that can be used for DMA mapping
and isolation. This can be beneficial for virtualization and device assignment
to virtual machines. It is recommended to enable IOMMU support.
For a system that has AMD host CPUs, add this to ``GRUB_CMDLINE_LINUX``:
.. code-block:: text
amd_iommu=on iommu=pt
Otherwise, if the system has Intel host CPUs, add this instead to
``GRUB_CMDLINE_LINUX``:
.. code-block:: text
intel_iommu=on iommu=pt
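Putting these together, a hypothetical ``/etc/default/grub`` entry for a system
with AMD host CPUs might look like the following; preserve any parameters that
are already present:

.. code-block:: shell

   GRUB_CMDLINE_LINUX="pci=realloc=off amd_iommu=on iommu=pt"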
Update GRUB
^^^^^^^^^^^
Update GRUB to use the modified configuration:
.. code-block:: shell
sudo grub2-mkconfig -o /boot/grub2/grub.cfg
On some Debian systems, the ``grub2-mkconfig`` command may not be available. Instead,
check for the presence of ``grub-mkconfig``. Additionally, verify that you have the
correct version by using the following command:
.. code-block:: shell
grub-mkconfig --version
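On such systems, the equivalent update step is typically one of the following;
the output path may vary by distribution:

.. code-block:: shell

   # Debian/Ubuntu-style GRUB update.
   sudo grub-mkconfig -o /boot/grub/grub.cfg

   # Or, where available, the convenience wrapper.
   sudo update-grub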
.. _mi300x-os-settings:
Operating system settings
-------------------------
CPU core states (C-states)
^^^^^^^^^^^^^^^^^^^^^^^^^^
There are several core states (C-states) that an AMD EPYC CPU can idle within:
* **C0**: active. This is the active state while running an application.
* **C1**: idle. This state consumes less power compared to C0, but can quickly
return to the active state (C0) with minimal latency.
* **C2**: idle and power-gated. This is a deeper sleep state and will have greater
latency when moving back to the active (C0) state as compared to when the CPU
is coming out of C1.
Disabling C2 is important for running with a high-performance, low-latency
network. To disable the C2 state, install the ``cpupower`` tool using your Linux
distribution's package manager. ``cpupower`` is not a base package in most Linux
distributions. The specific package to be installed varies per Linux
distribution.
.. tab-set::
.. tab-item:: Ubuntu
:sync: ubuntu
.. code-block:: shell
sudo apt install linux-tools-common
.. tab-item:: RHEL
:sync: rhel
.. code-block:: shell
sudo yum install cpupowerutils
.. tab-item:: SLES
:sync: sles
.. code-block:: shell
sudo zypper install cpupower
Now, to disable power-gating (the C2 state) on all cores, run the following
command:
.. code-block:: shell
cpupower idle-set -d 2
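You can then verify the change with ``cpupower idle-info``, which reports the
available idle states; the C2 entry should now be flagged as disabled:

.. code-block:: shell

   cpupower idle-info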
``/proc`` and ``/sys`` file system settings
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. _mi300x-disable-numa:
Disable NUMA auto-balancing
'''''''''''''''''''''''''''
The NUMA balancing feature allows the OS to scan memory and attempt to migrate
data to a DIMM that is logically closer to the cores accessing it. This causes an
overhead because the OS is second-guessing your NUMA allocations, but it may be
useful if NUMA locality of accesses is very poor. Applications can therefore, in
general, benefit from disabling NUMA balancing; however, there are workloads where
doing so is detrimental to performance. Test this setting
by toggling the ``numa_balancing`` value and running the application: compare
the performance of one run with this set to ``0`` and another run with it set to
``1``.
Run the command ``cat /proc/sys/kernel/numa_balancing`` to check the current
NUMA (Non-Uniform Memory Access) balancing setting. An output of ``0`` indicates this
setting is disabled. If there is no output or the output is ``1``, run the command
``sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'`` to disable it.
For these settings, the ``env_check.sh`` script automates setting, resetting,
and checking these environment settings. Find the script at
`<https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh>`__.
Run the script as follows to set or reset the settings:
``./env_check.sh [set/reset/check]``
.. tip::
Use ``./env_check.sh -h`` for help info.
Automate disabling NUMA auto-balance using Cron
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The :ref:`mi300x-disable-numa` section describes the command to disable NUMA
auto-balance. To automate the command with Cron, follow these steps.
#. Edit the ``crontab`` configuration file for the root user:
.. code-block:: shell
sudo crontab -e
#. Add the following Cron entry to run the script at a specific interval:
.. code-block:: shell
@reboot sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
#. Save the file and exit the text editor.
#. Optionally, restart the system to apply changes by issuing ``sudo reboot``.
#. Verify your new configuration.
.. code-block::
cat /proc/sys/kernel/numa_balancing
The ``/proc/sys/kernel/numa_balancing`` file controls NUMA balancing in the
Linux kernel. If the value in this file is set to ``0``, the NUMA balancing
is disabled. If the value is set to ``1``, NUMA balancing is enabled.
.. note::
Disabling NUMA balancing should be done cautiously and for
specific reasons, such as performance optimization or addressing
particular issues. Always test the impact of disabling NUMA balancing in
a controlled environment before applying changes to a production system.
.. _mi300x-env-vars:
Environment variables
^^^^^^^^^^^^^^^^^^^^^
HIP provides the environment variable ``HIP_FORCE_DEV_KERNARG``, which, when set to
``1``, can place the arguments of HIP kernels directly in device memory to reduce the
latency of accessing those kernel arguments. It can improve performance by 2 to
3 µs for some kernels.
It is recommended to set the following environment variable:
.. code-block:: shell
export HIP_FORCE_DEV_KERNARG=1
.. note::
This is the default option as of ROCm 6.2.
IOMMU configuration -- systems with 256 CPU threads
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For systems that have 256 logical CPU cores or more, setting the input-output
memory management unit (IOMMU) configuration to ``disabled`` can limit the
number of available logical cores to 255. The reason is that the Linux kernel
disables X2APIC in this case and falls back to Advanced Programmable Interrupt
Controller (APIC), which can only enumerate a maximum of 255 (logical) cores.
If SMT is enabled by setting ``CCD/Core/Thread Enablement > SMT Control`` to
``enable``, you can apply the following steps to the system to enable all
(logical) cores of the system:
#. In the server BIOS, set IOMMU to ``Enabled``.
#. When configuring the GRUB boot loader, add the following arguments for the Linux kernel: ``amd_iommu=on iommu=pt``.
#. Update GRUB.
#. Reboot the system.
#. Verify IOMMU passthrough mode by inspecting the kernel log via ``dmesg``:
.. code-block::
dmesg | grep iommu
.. code-block:: shell
[...]
[ 0.000000] Kernel command line: [...] amd_iommu=on iommu=pt
[...]
Once the system is properly configured, ROCm software can be
:doc:`installed <rocm-install-on-linux:index>`.
.. _mi300x-system-management:
System management
=================
To optimize system performance, it's essential to first understand the existing
system configuration parameters and settings. ROCm offers several CLI tools that
can provide system-level information, offering valuable insights for
optimizing user applications.
For a complete guide on how to install, manage, or uninstall ROCm on Linux, refer to
:doc:`rocm-install-on-linux:tutorial/quick-start`. For verifying that the
installation was successful, refer to the
:doc:`rocm-install-on-linux:how-to/native-install/post-install`.
Should verification fail, consult :doc:`/how-to/system-debugging`.
Hardware verification with ROCm
-------------------------------
The ROCm platform provides tools to query the system structure. These include
:ref:`ROCm SMI <mi300x-rocm-smi>` and :ref:`ROCm Bandwidth Test <mi300x-bandwidth-test>`.
.. _mi300x-rocm-smi:
ROCm SMI
^^^^^^^^
To query your GPU hardware, use the ``rocm-smi`` command. ROCm SMI lists
GPUs available to your system -- with their device ID and their respective
firmware (or VBIOS) versions.
The following screenshot shows a system where all eight MI300X GPUs are recognized
by ROCm. Application performance could otherwise be suboptimal if, for example,
only five of the eight GPUs were recognized.
.. image:: ../../data/how-to/tuning-guides/rocm-smi-showhw.png
:align: center
:alt: ``rocm-smi --showhw`` output
To see the system structure, the localization of the GPUs in the system, and the
fabric connections between the system components, use the command
``rocm-smi --showtopo``.
.. image:: ../../data/how-to/tuning-guides/rocm-smi-showtopo.png
:align: center
:alt: ``rocm-smi --showtopo`` output
The first block of the output shows the distance between the GPUs, similar to
what the ``numactl`` command outputs for the NUMA domains of a system. The
weight is a qualitative measure for the “distance” data must travel to reach one
GPU from another. While the values do not carry a special or "physical"
meaning, the higher the value, the more hops are needed to reach the destination
from the source GPU. This information has performance implications for
GPU-based applications that move data among GPUs. You can favor GPUs with a
minimum distance between them to make the application more performant.
The second block has a matrix named *Hops between two GPUs*, where:
* ``1`` means the two GPUs are directly connected with xGMI,
* ``2`` means both GPUs are linked to the same CPU socket and GPU communications
will go through the CPU, and
* ``3`` means both GPUs are linked to different CPU sockets, so communications will
go through both CPU sockets. In the example shown, this number is ``1`` for all GPU
pairs, since they are all directly connected to each other through Infinity Fabric links.
The third block outputs the link types between the GPUs. This can either be
``XGMI`` for AMD Infinity Fabric links or ``PCIE`` for PCIe Gen5 links.
The fourth block reveals the localization of a GPU with respect to the NUMA
organization of the shared memory of the AMD EPYC processors.
To query the compute capabilities of the GPU devices, use the ``rocminfo`` command. It
lists specific details about the GPU devices, including but not limited to the
number of compute units, width of the SIMD pipelines, memory information, and
instruction set architecture (ISA). The following is the truncated output of the
command:
.. image:: ../../data/how-to/tuning-guides/rocminfo.png
:align: center
:alt: rocminfo.txt example
For a complete list of architecture names (such as CDNA3) and LLVM target names
(such as gfx942 for MI300X), refer to the
:doc:`Supported GPUs section of the System requirements for Linux page <rocm-install-on-linux:reference/system-requirements>`.
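For example, to quickly extract the LLVM target name of the installed GPUs:

.. code-block:: shell

   # Print the GPU agent names; an MI300X reports gfx942.
   rocminfo | grep gfx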
Deterministic clock
'''''''''''''''''''
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock
speed up to 1900 MHz instead of the default 2100 MHz. This can reduce
the chance of a PCC event lowering the attainable GPU clocks. This
setting will not be required for new IFWI releases with the production
PRC feature. Restore this setting to its default value with the
``rocm-smi -r`` command.
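In shell form:

.. code-block:: shell

   # Cap the maximum GPU clock at 1900 MHz to reduce PCC-driven clock drops.
   rocm-smi --setperfdeterminism 1900

   # Restore the default clock behavior.
   rocm-smi -r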
.. _mi300x-bandwidth-test:
ROCm Bandwidth Test
^^^^^^^^^^^^^^^^^^^
The section Hardware verification with ROCm showed how the command
``rocm-smi --showtopo`` can be used to view the system structure and how the
GPUs are connected. For more details on the link bandwidth,
``rocm-bandwidth-test`` can run benchmarks to show the effective link bandwidth
between the components of the system.
You can install ROCm Bandwidth Test, which can test inter-device bandwidth,
using the following package manager commands:
.. tab-set::
.. tab-item:: Ubuntu
:sync: ubuntu
.. code-block:: shell
sudo apt install rocm-bandwidth-test
.. tab-item:: RHEL
:sync: rhel
.. code-block:: shell
sudo yum install rocm-bandwidth-test
.. tab-item:: SLES
:sync: sles
.. code-block:: shell
sudo zypper install rocm-bandwidth-test
Alternatively, you can download the source code from
`<https://github.com/ROCm/rocm_bandwidth_test>`__ and build from source.
The output will list the available compute devices (CPUs and GPUs), including
their device ID and PCIe ID. The following screenshot is an example of the
beginning part of the output of running ``rocm-bandwidth-test``. It shows the
devices present in the system.
.. image:: ../../data/how-to/tuning-guides/rocm-bandwidth-test.png
:align: center
:alt: rocm-bandwidth-test sample output
The output will also show a matrix that contains a ``1`` if a device can
communicate with another device (CPU and GPU) of the system, and it will show the
NUMA distance -- similar to ``rocm-smi``.
Inter-device distance:
.. figure:: ../../data/how-to/tuning-guides/rbt-inter-device-access.png
:align: center
:alt: rocm-bandwidth-test inter-device distance
Inter-device distance
Inter-device NUMA distance:
.. figure:: ../../data/how-to/tuning-guides/rbt-inter-device-numa-distance.png
:align: center
:alt: rocm-bandwidth-test inter-device NUMA distance
Inter-device NUMA distance
The output also contains the measured bandwidth for unidirectional and
bidirectional transfers between the devices (CPU and GPU):
Unidirectional bandwidth:
.. figure:: ../../data/how-to/tuning-guides/rbt-unidirectional-bandwidth.png
:align: center
:alt: rocm-bandwidth-test unidirectional bandwidth
Unidirectional bandwidth
Bidirectional bandwidth:
.. figure:: ../../data/how-to/tuning-guides/rbt-bidirectional-bandwidth.png
:align: center
:alt: rocm-bandwidth-test bidirectional bandwidth
Bidirectional bandwidth
Acronyms
========
AMI
American Megatrends International
APBDIS
Algorithmic Performance Boost Disable
ATS
Address Translation Services
BAR
Base Address Register
BIOS
Basic Input/Output System
CBS
Common BIOS Settings
CLI
Command Line Interface
CPU
Central Processing Unit
cTDP
Configurable Thermal Design Power
DDR5
Double Data Rate 5 DRAM
DF
Data Fabric
DIMM
Dual In-line Memory Module
DMA
Direct Memory Access
DPM
Dynamic Power Management
GPU
Graphics Processing Unit
GRUB
Grand Unified Bootloader
HPC
High Performance Computing
IOMMU
Input-Output Memory Management Unit
ISA
Instruction Set Architecture
LCLK
Link Clock Frequency
NBIO
North Bridge Input/Output
NUMA
Non-Uniform Memory Access
PCC
Power Consumption Control
PCI
Peripheral Component Interconnect
PCIe
PCI Express
POR
Power-On Reset
SIMD
Single Instruction, Multiple Data
SMT
Simultaneous Multi-threading
SMI
System Management Interface
SOC
System On Chip
SR-IOV
Single Root I/O Virtualization
TP
Tensor Parallelism
TSME
Transparent Secure Memory Encryption
X2APIC
Extended Advanced Programmable Interrupt Controller
xGMI
Inter-chip Global Memory Interconnect

View File

@@ -1,11 +1,11 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="RDNA2 workstation tuning guide">
<meta name="keywords" content="RDNA2, workstation tuning, BIOS settings, installation, AMD,
<meta name="keywords" content="RDNA2, workstation, BIOS settings, installation, AMD,
ROCm">
</head>
# RDNA2 workstation tuning guide
# AMD RDNA2 system optimization
## System settings

View File

@@ -1,108 +0,0 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="AMD hardware optimization for specific workloads">
<meta name="keywords" content="high-performance computing, HPC, Instinct accelerators,
Radeon, tuning, tuning guide, AMD, ROCm">
</head>
# System optimization
This guide outlines system setup and tuning suggestions for AMD hardware to optimize performance for specific types of
workloads or use-cases.
## High-performance computing
High-performance computing (HPC) workloads have unique requirements. The default
hardware and BIOS configurations for OEM platforms may not provide optimal
performance for HPC workloads. To enable optimal HPC settings on a per-platform
and per-workload level, this guide calls out:
* BIOS settings that can impact performance
* Hardware configuration best practices
* Supported versions of operating systems
* Workload-specific recommendations for optimal BIOS and operating system
settings
There is also a discussion on the AMD Instinct™ software development
environment, including information on how to install and run the DGEMM, STREAM,
HPCG, and HPL benchmarks. This guidance provides a good starting point but is
not exhaustively tested across all compilers.
Prerequisites to understanding this document and to performing tuning of HPC
applications include:
* Experience in configuring servers
* Administrative access to the server's Management Interface (BMC)
* Administrative access to the operating system
* Familiarity with the OEM server's BMC (strongly recommended)
* Familiarity with the OS specific tools for configuration, monitoring, and
troubleshooting (strongly recommended)
This document provides guidance on tuning systems with various AMD Instinct™
accelerators for HPC workloads. This document is not an all-inclusive guide, and
some items referred to may have similar, but different, names in various OEM
systems (for example, OEM-specific BIOS settings). This document also provides
suggestions on items that should be the initial focus of additional,
application-specific tuning.
This document is based on the AMD EPYC™ 7003-series processor family (former
codename "Milan").
While this guide is a good starting point, developers are encouraged to perform
their own performance testing for additional tuning.
:::::{grid} 1 1 2 2
:gutter: 1
:::{grid-item-card}
**[AMD Instinct™ MI200](./tuning-guides/mi200)**
This chapter goes through how to configure your AMD Instinct™ MI200 accelerated
compute nodes to get the best performance out of them.
* [Instruction Set Architecture (ISA)](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
* [White paper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf)
:::
:::{grid-item-card}
**[AMD Instinct™ MI100](./tuning-guides/mi100)**
This chapter briefly reviews hardware aspects of the AMD Instinct™ MI100
accelerators and the CDNA™ 1 architecture that is the foundation of these GPUs.
* [ISA](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
* [White paper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf)
:::
:::::
## Workstation
Workstation workloads, much like high-performance computing, have a unique set of
requirements, a blend of both graphics and compute, certification, stability and
the list continues.
The document covers specific software requirements and processes needed to use
these GPUs for Single Root I/O Virtualization (SR-IOV) and machine learning
(ML).
The main purpose of this document is to help users utilize the RDNA 2 GPUs to
their full potential.
:::::{grid} 1 1 2 2
:gutter: 1
:::{grid-item-card}
**[AMD Radeon™ PRO W6000 and V620](./tuning-guides/w6000-v620)**
This chapter describes the AMD GPUs with RDNA™ 2 architecture, namely AMD Radeon
PRO W6800 and AMD Radeon PRO V620
* [AMD RDNA2 ISA](https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf)
* [White paper](https://www.amd.com/system/files/documents/rdna2-explained-radeon-pro-W6000.pdf)
:::
:::::

View File

@@ -0,0 +1,13 @@
************************
AMD MI300X tuning guides
************************
The tuning guides in this section provide a comprehensive summary of the
necessary steps to properly configure your system for AMD Instinct™ MI300X
accelerators. They include detailed instructions on system settings and
application tuning suggestions to help you fully leverage the capabilities of
these accelerators, thereby achieving optimal performance.
* :doc:`/how-to/tuning-guides/mi300x/system`
* :doc:`/how-to/tuning-guides/mi300x/workload`

View File

@@ -0,0 +1,22 @@
***************************************
AMD Instinct MI300X system optimization
***************************************
The :doc:`/how-to/system-optimization/mi300x` guide discusses system settings that are
required to configure your system for AMD Instinct™ MI300X accelerators.
Some settings discussed are known to improve performance for most applications
running on an MI300X system.
Topics discussed therein include:
* :ref:`System BIOS settings <mi300x-bios-settings>`
* :ref:`GRUB settings <mi300x-grub-settings>`
* :ref:`Operating system settings <mi300x-os-settings>`
* :ref:`System management <mi300x-system-management>`
For a look into improving performance for specific applications or workloads,
see :doc:`/how-to/tuning-guides/mi300x/workload`.

File diff suppressed because it is too large

View File

@@ -75,7 +75,7 @@ Our documentation is organized into the following categories:
* [HIP runtime](#hip-runtime)
* [Tools](./reference/rocm-tools.md)
* [Development](#development-tools)
* [Performance analysis](#performance-analysis)
* [Performance analysis](#performance-tools)
* [System](#system-tools)
* [Hardware specifications](./reference/gpu-arch-specs.rst)
:::
@@ -89,17 +89,21 @@ Our documentation is organized into the following categories:
* [Using ROCm for AI](./how-to/rocm-for-ai/index.rst)
* [Using ROCm for HPC](./how-to/rocm-for-hpc/index.rst)
* [Fine-tuning LLMs and inference optimization](./how-to/llm-fine-tuning-optimization/index.rst)
* [System tuning for various architectures](./how-to/tuning-guides.md)
* [MI100](./how-to/tuning-guides/mi100.md)
* [MI200](./how-to/tuning-guides/mi200.md)
* [RDNA2](./how-to/tuning-guides/w6000-v620.md)
* [System optimization](./how-to/system-optimization/index.rst)
* [AMD Instinct MI300X](./how-to/system-optimization/mi300x.rst)
* [AMD Instinct MI200](./how-to/system-optimization/mi200.md)
* [AMD Instinct MI100](./how-to/system-optimization/mi100.md)
* [AMD RDNA2](./how-to/system-optimization/w6000-v620.md)
* [AMD Instinct MI300X tuning guides](./how-to/tuning-guides/mi300x/index.rst)
* [System tuning](./how-to/tuning-guides/mi300x/system.rst)
* [Workload tuning](./how-to/tuning-guides/mi300x/workload.rst)
* [System debugging](./how-to/system-debugging.md)
* [GPU-enabled MPI](./how-to/gpu-enabled-mpi.rst)
* [Using compiler features](./conceptual/compiler-topics.md)
* [Using AddressSanitizer](./conceptual/using-gpu-sanitizer.md)
* [Compiler disambiguation](./conceptual/compiler-disambiguation.md)
* [OpenMP support in ROCm](./about/compatibility/openmp.md)
* [Setting the number of CUs](./how-to/setting-cus)
* [System level debugging](./how-to/system-debugging.md)
* [GitHub examples](https://github.com/amd/rocm-examples)
:::

View File

@@ -53,7 +53,6 @@
* {doc}`ROCm Data Center Tool <rdc:index>`
* {doc}`ROCm SMI <rocm_smi_lib:index>`
* {doc}`ROCm Validation Suite <rocmvalidationsuite:index>`
* {doc}`TransferBench <transferbench:index>`
:::
::::

View File

@@ -81,16 +81,27 @@ subtrees:
- file: how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.rst
title: Optimizing Triton kernels
- file: how-to/llm-fine-tuning-optimization/profiling-and-debugging.rst
- file: how-to/tuning-guides.md
- file: how-to/system-optimization/index.rst
title: System optimization
subtrees:
- entries:
- file: how-to/tuning-guides/mi100.md
title: MI100
- file: how-to/tuning-guides/mi200.md
title: MI200
- file: how-to/tuning-guides/w6000-v620.md
title: RDNA2
- file: how-to/system-optimization/mi300x.rst
title: AMD Instinct MI300X
- file: how-to/system-optimization/mi200.md
title: AMD Instinct MI200
- file: how-to/system-optimization/mi100.md
title: AMD Instinct MI100
- file: how-to/system-optimization/w6000-v620.md
title: AMD RDNA 2
- file: how-to/tuning-guides/mi300x/index.rst
title: AMD MI300X tuning guides
subtrees:
- entries:
- file: how-to/tuning-guides/mi300x/system.rst
title: System tuning
- file: how-to/tuning-guides/mi300x/workload.rst
title: Workload tuning
- file: how-to/system-debugging.md
- file: how-to/gpu-enabled-mpi.rst
title: Using MPI
- file: conceptual/compiler-topics.md
@@ -105,8 +116,6 @@ subtrees:
title: OpenMP support
- file: how-to/setting-cus
title: Setting the number of CUs
- file: how-to/system-debugging.md
title: Debugging
- url: https://github.com/amd/rocm-examples
title: GitHub examples

View File

@@ -1,2 +1,2 @@
rocm-docs-core==1.4.1
rocm-docs-core==1.5.0
sphinx-reredirects

View File

@@ -16,7 +16,7 @@ beautifulsoup4==4.12.3
# via pydata-sphinx-theme
breathe==4.35.0
# via rocm-docs-core
certifi==2024.6.2
certifi==2024.7.4
# via requests
cffi==1.16.0
# via
@@ -92,7 +92,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core==1.4.1
rocm-docs-core==1.5.0
# via -r requirements.in
smmap==5.0.1
# via gitdb
@@ -122,7 +122,7 @@ sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-notfound-page==1.0.2
# via rocm-docs-core
sphinx-reredirects==0.1.4
sphinx-reredirects==0.1.5
# via -r requirements.in
sphinxcontrib-applehelp==1.0.8
# via sphinx

View File

@@ -111,7 +111,6 @@ Tools
":doc:`ROCm SMI <rocm_smi_lib:index>`", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"
":doc:`ROCm Validation Suite <rocmvalidationsuite:index>`", "Detects and troubleshoots common problems affecting AMD GPUs running in a high-performance computing environment"
":doc:`ROCr Debug Agent <rocr_debug_agent:index>`", "Prints the state of all AMD GPU wavefronts that caused a queue error by sending a SIGQUIT signal to the process while the program is running"
":doc:`TransferBench <transferbench:index>`", "Utility to benchmark simultaneous transfers between user-specified devices (CPUs/GPUs)"
Compilers
-----------------------------------------------

View File

@@ -1,2 +1,2 @@
from .defaults import TEMPLATES, PROCESSORS
from .custom_templates import hipfort, mivisionx, rpp
from .custom_templates import hipfort, hipify, mivisionx, rpp, rvs

View File

@@ -5,10 +5,10 @@ from util.defaults import TEMPLATES, PROCESSORS
TEMPLATES['hipfort'] = (
(
r"## hipfort (?P<lib_version>\d+\.\d+(?:\.\d+))?"
r"(?P<for_rocm> for ROCm )?"
r"(?P<rocm_version>(?(for_rocm)\d+\.\d+(?:\.\d+)?|.*))?"
r"( \(Unreleased\))?"
r"## hipfort"
r"(?: (?P<lib_version>\d+\.\d+(?:\.\d+))?)?"
r"(?: for ROCm (?P<rocm_version>\d+\.\d+(?:\.\d+)?))?"
r"(?: \(Unreleased\))?"
r"\n"
r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
)
@@ -21,19 +21,20 @@ def hipfort_processor(data: ReleaseLib, template: str, _, __) -> bool:
changelog = changelog.decoded_content.decode()
pattern = re.compile(template)
match = pattern.search(changelog)
lib_version = match["lib_version"]
lib_version = match["rocm_version"]
data.message = (
f"hipfort for ROCm"
f" {data.full_version}"
)
data.lib_version = lib_version
data.notes = f"""{match["body"]}"""
data.lib_version = lib_version
change_pattern = re.compile(
r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
re.RegexFlag.MULTILINE
)
for match in change_pattern.finditer(data.notes):
data.data.changes[match["type"]] = match["change"]

View File

@@ -3,31 +3,32 @@ import re
from util.release_data import ReleaseLib
from util.defaults import TEMPLATES, PROCESSORS
TEMPLATES['composable_kernel'] = (
TEMPLATES['HIPIFY'] = (
(
r"## (\(Unreleased\))? CK (?P<lib_version>\d+\.\d+(?:\.\d+))?"
r"(?P<for_rocm> for ROCm )?"
r"(?P<rocm_version>(?(for_rocm)\d+\.\d+(?:\.\d+)?|.*))?"
r"## HIPIFY"
r"(?: (?P<lib_version>\d+\.\d+(?:\.\d+))?)?"
r"(?: for ROCm (?P<rocm_version>\d+\.\d+(?:\.\d+)?))?"
r"(?: ?\(Unreleased\))?"
r"\n"
r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
)
)
def composable_kernel_processor(data: ReleaseLib, template: str, _, __) -> bool:
def hipify_processor(data: ReleaseLib, template: str, _, __) -> bool:
"""Processor for releases."""
changelog = data.repo.get_contents("CHANGELOG.md", data.commit)
changelog = changelog.decoded_content.decode()
pattern = re.compile(template)
match = pattern.search(changelog)
lib_version = match["lib_version"]
lib_version = match["rocm_version"]
data.message = (
f"composable_kernel for ROCm"
f"HIPIFY for ROCm"
f" {data.full_version}"
)
data.lib_version = lib_version
data.notes = f"""{match["body"]}"""
data.lib_version = lib_version
change_pattern = re.compile(
r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
@@ -38,4 +39,4 @@ def composable_kernel_processor(data: ReleaseLib, template: str, _, __) -> bool:
return True
PROCESSORS['composable_kernel'] = composable_kernel_processor
PROCESSORS['HIPIFY'] = hipify_processor

View File

@@ -0,0 +1,42 @@
import re
from util.release_data import ReleaseLib
from util.defaults import TEMPLATES, PROCESSORS
TEMPLATES['ROCmValidationSuite'] = (
(
r"## RVS"
r"(?: (?P<lib_version>\d+\.\d+(?:\.\d+))?)?"
r"(?: for ROCm (?P<rocm_version>\d+\.\d+(?:\.\d+)?))?"
r"(?: ?\(Unreleased\))?"
r"\n"
r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
)
)
def rvs_processor(data: ReleaseLib, template: str, _, __) -> bool:
"""Processor for releases."""
changelog = data.repo.get_contents("CHANGELOG.md", data.commit)
changelog = changelog.decoded_content.decode()
pattern = re.compile(template)
match = pattern.search(changelog)
lib_version = match["rocm_version"]
data.message = (
f"RVS for ROCm"
f" {data.full_version}"
)
data.lib_version = lib_version
data.notes = f"""{match["body"]}"""
change_pattern = re.compile(
r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
re.RegexFlag.MULTILINE
)
for match in change_pattern.finditer(data.notes):
data.data.changes[match["type"]] = match["change"]
return True
PROCESSORS['ROCmValidationSuite'] = rvs_processor