Compare commits

...

105 Commits

Author SHA1 Message Date
Sam Wu
3aef8bbb81 Update documentation requirements 2024-09-16 10:13:11 -08:00
Jeffrey Novotny
5ce87d0634 Merge pull request #3568 from amd-jnovotny/peak-tflops-typo-docs610
Fix typo for TFLOPs metric in MI250 architecture page: cherry pick to docs/6.1.0
2024-08-12 13:17:44 -04:00
Jeffrey Novotny
f7c8a447c1 Fix typo for TFLOPs metric in MI250 architecture page 2024-08-12 10:24:24 -04:00
Jeffrey Novotny
267f38160a Merge pull request #3532 from amd-jnovotny/update-llama-link-610
Fix link to meta-llama finetuning recipes
2024-08-07 12:42:50 -04:00
Jeffrey Novotny
bb8944cfa3 Fix link to meta-llama finetuning recipes 2024-08-06 15:56:06 -04:00
spolifroni-amd
74b54bacc4 Cherry pick email reference removal into 6.1.0 (#3489)
* removed all references to the feedback email

* making linter happy
2024-08-02 12:00:47 -06:00
randyh62
598cdc28d0 Update using-gpu-sanitizer.md with new known issues (#3423) (#3441)
Co-authored-by: b-sumner <brian.sumner@amd.com>
2024-07-25 16:03:08 -07:00
Peter Park
ee4a99bd6b Merge pull request #3426 from spolifroni-amd/sp-cherry-pick-610
Cherry picking changes into 6.1.0
2024-07-16 16:53:38 -04:00
Peter Park
834335d5e2 Update system optimization guides headings (#3422)
* update headings to system optimization

* update index

* conv tuning-guides.md to rst

* shorten system optimization landing page

* update conf.py

update toc order

add space

* Update docs/how-to/tuning-guides.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* update keywords

* update intro

---------

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

fix toc
2024-07-16 16:51:17 -04:00
spolifroni-amd
defa60bd94 Removed TransferBench from the tools list (#3421) 2024-07-16 16:27:59 -04:00
Peter Park
d6f54d61de Merge pull request #3393 from peterjunpark/docs/6.1.0
docs/6.1.0: update CLR docs reference (#3389)
2024-07-04 09:12:46 -04:00
randyh62
5769593274 update CLR docs reference (#3389)
* update CLR docs reference

* Apply suggestions from code review

Co-authored-by: Peter Park <peter.park@amd.com>

---------

Co-authored-by: amitkumar-amd <Amit.Kumar6@amd.com>
Co-authored-by: Peter Park <peter.park@amd.com>
2024-07-03 17:01:19 -04:00
Peter Park
8bd7c2ca19 Merge pull request #3376 from peterjunpark/docs/6.1.0
docs/6.1.0: remove card text styling
2024-07-01 22:03:17 -04:00
Peter Jun Park
daa0c6a564 remove card text styling 2024-06-28 18:57:25 -04:00
Peter Park
8ec5540244 Merge pull request #3368 from peterjunpark/docs/6.1.0
docs/6.1.0: Add fixes to vLLM install and triton kernel optimization (#3366)
2024-06-27 11:45:53 -07:00
Peter Park
ee322fd1ed Add fixes to vLLM install and triton kernel optimization (#3366)
* Add fixes to vLLM install and triton kernel optimization

* Update TGI how-to

remove extra step in TGI
2024-06-27 14:30:26 -04:00
randyh62
9d4e97bb9b added ROCm Core and AMD SMI (#3348) (#3351)
* added ROCm Core and AMD SMI

* fix URLs
2024-06-21 17:11:55 -07:00
randyh62
e75abaea1c license information updated (#3339) (#3342)
* license information updated

* Young's comments

* Sam's comment
2024-06-21 09:44:48 -07:00
Peter Park
9967600a6f Merge pull request #3334 from peterjunpark/docs/6.1.0
docs/6.1.0: Add RHEL 9.4 to compat matrix (#3332)
2024-06-19 12:40:02 -07:00
Peter Park
a3422d1f74 Add RHEL 9.4 to compat matrix (#3332)
* Add RHEL 9.4 to compat matrix

* add rhel 9.4 footnote in compat matrix

remove oracle
2024-06-19 15:37:21 -04:00
randyh62
a685a9bddb remove nvcc (#3313) (#3322)
* remove nvcc

* Update CHANGELOG to match 6.0.0 template

---------

Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com>
2024-06-18 17:26:41 -07:00
Peter Park
943f8d48ec Merge pull request #3308 from peterjunpark/docs/6.1.0
docs/6.1.0: Cherry-pick "What is ROCm?" clean-up (#3303) (#3282)
2024-06-17 10:35:00 -07:00
Peter Park
124f881738 Remove aomp from What is ROCm? page (#3282) 2024-06-17 11:48:34 -04:00
Peter Park
62edff6397 Update link to ROCr Debug Agent to docs portal (#3303)
* Fix link to debug agent in what-is-rocm

* ROCm --> ROCR

add index

* ROCR --> ROCr

* Change ROCm Debug Agent to ROCr Debug Agent in docs
2024-06-17 11:47:31 -04:00
Peter Park
ce029f6cbc Fix first link in compatibility matrix table (#3239) (#3298)
* Fix first link in compatibility matrix table

* Revert "Fix first link in compatibility matrix table"

This reverts commit 069c5c116a.

* Remove sticky header and unused css

* Remove container from hardware specs matrix

---------

Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com>
2024-06-13 16:45:19 -04:00
randyh62
f94b53d3cb update quarantine (#3284) (#3288)
resolve conflicts with head
2024-06-13 11:09:11 -07:00
Jeffrey Novotny
657d11aa65 Merge pull request #3295 from peterjunpark/docs/6.1.0
docs/6.1.0: Remove AOMP from compatibility matrix (#3289)
2024-06-13 11:23:46 -04:00
Jeffrey Novotny
63a3e025dc Remove AOMP from compatibility matrix (#3289) 2024-06-13 11:18:31 -04:00
Sam Wu
7a9ef994e3 Update documentation requirements 2024-06-06 16:58:57 -06:00
Peter Park
8d99176537 Merge pull request #3257 from peterjunpark/docs/6.1.0
docs/6.1.0: Update links in compat matrix and what-is-rocm (#3253)
2024-06-06 10:52:29 -07:00
Peter Park
c99eed1a49 Update links in compat matrix and what-is-rocm (#3253)
* Update links in compat matrix and what-is-rocm

* Tensorflow -> TensorFlow

* Remove extra lines

* Revert "Remove extra lines"

This reverts commit 607c4323ac.

ROCm Debug Agent
2024-06-06 13:29:40 -04:00
Peter Park
1e41770ff6 Merge pull request #3245 from peterjunpark/docs/6.1.0
docs/6.1.0: Rename fine-tuning and optimization guide directory and fix index.md …
2024-06-05 08:44:20 -07:00
Peter Park
518a2304e4 Rename fine-tuning and optimization guide directory and fix index.md (#3242)
* Mv fine-tuning and optimization files

* Reorder index.md

* Rename images directory

* Fix internal links
2024-06-05 11:13:27 -04:00
Peter Park
520ba37a9f Merge pull request #3227 from peterjunpark/docs/6.1.0
docs/6.1.0: Update the links for rocminfo and rocm-bandwidth-test (#3213)
2024-06-04 08:12:25 -07:00
amitkumar-amd
2f3cab7dcf Update the links for rocminfo and rocm-bandwidth-test (#3213)
* Update the links for rocminfo and rocm-bandwidth-test

* Update the links for rocminfo and rocm-bandwidth-test

* Update the links for rocminfo and rocm-bandwidth-test

* Update links to intersphinx links

---------

Co-authored-by: Peter Jun Park <peter.park@amd.com>
2024-06-04 11:01:51 -04:00
Peter Park
e84f060342 Merge pull request #3224 from peterjunpark/docs/6.1.0
docs/6.1.0: Update fine-tuning guide: title, improve readability in code blocks, …
2024-06-04 07:02:27 -07:00
Peter Park
3515950310 Update fine-tuning guide: title, improve readability in code blocks, fix typos (#3222)
* Fix typo

* Add torchtune link

* Add newlines before comments in code blocks for readability

* Update title
2024-06-03 22:12:56 -04:00
Peter Park
df1d0e4de2 Merge pull request #3220 from peterjunpark/docs/6.1.0
Disable epub in 6.1.0
2024-06-03 13:11:32 -07:00
Sam Wu
cd1c0460b4 Disable epub 2024-06-03 15:46:57 -04:00
Peter Park
e8e64ff6f6 Merge pull request #3218 from peterjunpark/docs/6.1.0
docs/6.1.0: Add "Fine Tuning LLMs" how to guide (#3124)
2024-06-03 11:15:53 -07:00
Peter Park
018f10a5ff Add "Fine Tuning LLMs" how to guide (#3124)
* Add Fine Tuning LLMs how to guide

* Reorg and refactor Fine-tuning LLMs with ROCm

Update index and headings

Fix formatting and update toc

Split out content from index to overview.rst

Add metadata

Clean up overview

Add inference sections, fix rst errors, clean up single-gpu-fine-tuning

Combine fine-tuning and inference guides

Fix some links and formatting

Update toc and add formatting fixes

Add ck kernel fusion content

Update toc

Clean up model quantization and acceleration

Add CK images

Clean up profiling

Update triton kernel performance optimization

Update llm inference frameworks guide

Disable automatic number of figures and tables in Sphinx conf

Change tabs to spaces

Change heading to end with -ing

Add link fixes and heading updates

Add rocprof/Omniperf/Omnitrace section

Update profiling and debugging guide

Add formatting fixes

Satisfy spellcheck

Fix words

Delete unused file

Finish overview

Clean up first 4 sections

Multi-gpu fine-tuning guide: slight fixes

Update toc

Remove tabs

Formatting fixes

* Minor wording updates

* Add some clean-up

* Update profiling and debugging guide

* Fix Omnitrace link

* Update ck kernel fusion with latest

* Update CK formatting

* Fix perfetto link syntax

* Fix typos and add blurbs

* Add fixes to Triton optimization doc

* Tabify saving adapters / models section

* Fix linting errors - spellcheck

Fix spelling and grammar

Satisfy linter

Update wording in profiling guide

Add fixes to satisfy linter

More fixes for linting in Triton guide

More linting fixes

Spellcheck in CK guide

* Improve triton guide

Fix linting errors and optics

* Add occupancy / vgpr table

Change some wording

* Re-add tunableop

* Add missing indent in _toc.yml

* Remove ckProfiler references

* Add links to resources

* Add refs in CK optimization guide

* Rename files and fix internal links

* Organize tuning guides

Reorg triton

* Add compute unit diagram

* Remove AutoAWQ

* Add higher res image for Perfetto trace example

* Update link text

* Update fig nums

* Update some formatting

* Update "Inductor"

* Change "Inductor" to TorchInductor

* Add link to official TorchInductor docs
2024-06-03 14:06:54 -04:00
Peter Park
cb7a6ea81e Remove unused images and add link to usage in Deep Learning install guide (#3196) (#3200) 2024-05-30 20:19:59 -04:00
Sam Wu
4e0b8a8678 Disable PDF on 6.1.0 (#3198) 2024-05-30 19:14:11 -04:00
Peter Park
9728462eb5 Merge pull request #3194 from peterjunpark/docs/6.1.0
docs/6.1.0: Add "How to use ROCm for AI" (#3117)
2024-05-30 13:57:24 -07:00
Peter Park
3c06011847 Add "How to use ROCm for AI" (#3117)
* Add Using ROCm for AI:wq

Add PyTorch Docker installation images

Split doc into subtopics

Add metadata

Clean up index

Clean up hugging face guide

Clean up installation guide

Fix rST formatting

Clean up install and train-a-model

Clean up MAD

Delete unused file

Add ref anchors and clean up MAD doc

Add formatting fixes

Update toc and section index

Format some code blocks

Remove install guide and update toc

Chop installation guide

Clean up deployment and hugging face sections

Change headings to end in -ing

Fix spelling in Training a model

Delete MAD and split out install content

Fix formatting

Change words to satisfy spellcheck linter

* Add review suggestions and add helpful links

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

Add helpful links and add review suggestions

Remove fine-tuning link and links to D5 and MAGMA

Update docs/how-to/rocm-for-ai/deploy-your-model.rst

Co-authored-by: Young Hui - AMD <145490163+yhuiYH@users.noreply.github.com>

Update DeepSpeed link

Add subheading to ML framework installation and closing blurb to hugging face models guide

* Reorder topics
2024-05-30 16:20:52 -04:00
Sam Wu
64c2ef8b13 Update documentation requirements for docs/6.1.0 branch (#3179)
* docs(conf.py): Update date info for 6.1.0 release notes

* build(requirements.txt): Update documentation requirements
2024-05-29 15:13:03 -04:00
Sam Wu
d5b0748697 docs(conf.py): Update PDF version to 6.1.0 (#3182) 2024-05-29 15:11:08 -04:00
Peter Park
872d901976 Merge pull request #3178 from peterjunpark/docs/6.1.0
docs/6.1.0: Reorg 'Deep learning' and 'Tuning guides' docs (#3153)
2024-05-29 11:24:40 -07:00
Peter Park
5ca030e9c1 Reorg 'Deep learning' and 'Tuning guides' docs (#3153)
* Rename 'Tuning guides' to 'Hardware optimization'

* Move deep learning to Install section

* Change 'Hardware' to 'System' to align with index.md

* Satisfy spellcheck linter

* adding new framework install graphic with JAX

* Fix link to ROCm libraries list

* crop framework_install graphic

* Reset .wordlist.txt update

* Prettify deep learning framework installation page

* Change spacing in list of frameworks

---------

Co-authored-by: Young Hui <young.hui@amd.com>
2024-05-29 14:15:05 -04:00
Peter Park
13ba59ba72 Merge pull request #3164 from peterjunpark/docs/6.1.0
docs: 6.1.0: Re-add glossary to hardware specification table (#3073)
2024-05-27 14:15:32 -07:00
MKKnorr
04af867946 Re-add glossary to hardware specification table (#3073) 2024-05-27 15:57:44 -04:00
Peter Park
3b965035f7 Merge pull request #3149 from peterjunpark/docs/6.1.0
Fix links to component docs in What is ROCm? (6.1.0)
2024-05-22 13:04:53 -07:00
Peter Park
e7d9e3895e Change ROCR-Runtime to relative link (#3143) 2024-05-22 13:14:38 -04:00
Peter Park
9b4cd992d3 Update What is ROCm?: remove RCP from components and change repo links to ROCm docs (#3129)
* Remove Radeon Compute Profiler from list of components

* Update ROCm CMake and ROCr links
2024-05-22 13:14:14 -04:00
randyh62
de992e63db update for swdev-459863 (#3095)
* update for swdev-459863

* Asan to ASan

* update wordlist for spelling

* card for compilers

* spaces around bulleted list

* single trailing newline

* leo requests

* correct compiler

* remove box
2024-05-08 15:40:13 -07:00
Young Hui - AMD
ea56398c89 Compatibility Matrix - include AMDSMI (#3090) (#3092) 2024-05-06 19:13:27 -04:00
Young Hui - AMD
475aa0b88e Add compatibility matrix (#3082) (#3086)
* Add compatibility matrix (#3082)

* add compatibility matrix and custom css

* fix toc

* reorder some components in matrix, add missing tools to reference page

* Update docs/compatibility/compatibility-matrix.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

---------

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* update OS strings to be more readable and searchable (#3088)

---------

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
2024-05-06 16:45:04 -04:00
Sam Wu
58d3fa9227 Downgrade rocm-docs-core 2024-05-02 11:59:29 -06:00
Sam Wu
c1e111d420 Downgrade rocm-docs-core 2024-05-02 11:53:17 -06:00
Sam Wu
9b7da5a60d Downgrade rocm-docs-core 2024-05-02 11:46:46 -06:00
peter
b912d52148 Fix broken link on hardware specs page (#3075) (#3077)
* Fix broken link

Fix broken link on hardware specs page to HIP programming model due to
refactoring of HIP docs.

* Update link anchor
2024-05-02 12:03:00 -04:00
Sam Wu
ed6954c41b Update documentation requirements 2024-05-02 09:42:49 -06:00
Sam Wu
d699681bd3 Update documentation requirements 2024-05-02 09:23:36 -06:00
Sam Wu
a3a6c17b3d Merge roc-6.1.x into docs/6.1.0 (#3055)
* update manifest file for ROCm6.1 (#3024)

Co-authored-by: Wang, Yanyao <yanyao.wang@amd.com>

* Add ROCm version 6.1.0 to version list (#3023) (#3025)

* Merge develop into roc-6.1.x (#3048)

* Add ROCm version 6.1.0 to version list (#3023)

* Update CHANGELOG.md

Added GitHub links to Changelog

* Update CHANGELOG.md

* Update manifest for ROCm 6.1.0 (#3022)

* Reorganize default.xml by group and alphabetically

* Add rocDecode to default.xml

* Add rocDecode to included names in tag script

* update tag to 6.1.0

---------

Co-authored-by: Young Hui - AMD <145490163+yhuiYH@users.noreply.github.com>

* Update CHANGELOG.md

Updated ROCm Compiler with fixed issue

* docs(tools/autotag/README.md): Add additional note to avoid duplicating data in changelog template (#3018)

* Bump rocm-docs-core from 0.38.1 to 1.0.0 in /docs/sphinx

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.38.1 to 1.0.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.38.1...v1.0.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* Use Ubuntu 22.04 and Python 3.10 in RTD config

* Update README.md (#3043)

* Update README.md

Fix rocSPARSE build link

* Update link to just general page, instead of anchor

* Add 'JAX for ROCm' link to index.md (#3034)

* Add JAX for ROCm link to index.md

* Reorder third-party libraries installation guides in index

* Update links to rocAL component (#3033)

* Update links to rocAL component

* Change absolute rocm docs links to relative

* Update compatibility/precision-support links (#3030)

* Change links to component data type support pages from absolute to relative

* Fix rocPRIM data type support links

* Empty commit to trigger demo rebuild.

* Update excluded and included projects

* Separate templates into a module; Fix MIVisionX template

* Add hipfort changelog processor

* Add rpp custom processor

* Add custom processor for rvs

* update the code-owner list (#3046)

* Update default.xml (#3038)

* Remove HIPCC from default.xml

HIPCC moved into llvm-project

* Remove ROCm-Device-Libs from default.xml

ROCm-Device-Libs was moved into llvm-project

* Remove ROCm-CompilerSupport from default.xml

ROCm-CompilerSupport was moved into llvm-project

* Add rocprofiler-register to default.xml

Added in 6.1 manifest

* Apply mathlibs group to projects in manifest

* Bump rocm-docs-core from 0.38.1 to 1.0.0 in /docs/sphinx (#3047)

* Bump rocm-docs-core from 0.38.1 to 1.0.0 in /docs/sphinx

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.38.1 to 1.0.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.38.1...v1.0.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* Set Ubuntu 22.04 and Python 3.10 in ReadtheDocs config

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com>

* Add 6.1.0.md template

* Add AMD SMI to 6.1.0 template

* Add ROCm Compiler to 6.1.0 template

* Add RDC to 6.1.0 template

* Add ROCgdb to 6.1.0 template

* Add ROCm SMI to 6.1.0 template

* Add ROCProfiler to 6.1.0 template

* Add MI200 SR-IOV known issue to 6.1.0 template

* Add MI300 RAS fixed defect to 6.1.0 template

* docs(6.1.0.md): Add more changelog notes for 6.1.0

* Update 6.1.0.md

Added links to GitHub for known issues and ROCm Compiler fixed defect

* Test autotag script

* Add ck template

* Add HIPIFY to included names for tag script

* Remove rocprofiler from tag_script

* Remove RVS template

Determine cause of missing later

* Add HIPIFY to template for 6.1.0

* Add extra line to top of template for formatting changelog

* Update 5.7.1.md

Fixing the broken link for rocBLAS programmer's guide in 5.7.1 Changelog.

* Regenerate changelog with new 5.7.1 link fix

* Add note for tag_script included_names

* Improve readability of GPU architecture hardware specs (#3009)

* move units of measurement to table headers

* add glossary explaining table headers

* add missed units and update h1

* toc listing to say indicate Accelerators & GPUs

* fix typo

* update meta description and keywords

* Update title in toc to fit in sidebar

* update title, toc, and filename

* Fix broken link to HIP programming guide

* Revert "update title, toc, and filename"

This reverts commit 6b9e687805.

* Revert glossary; slight fixes

* Change 'Pro' to 'PRO' for consistency

* Add references to programming and hardware architecture guides

* Change 'warp' to 'wavefront'

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com>
Co-authored-by: Young Hui - AMD <145490163+yhuiYH@users.noreply.github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: peter <peter.park@amd.com>
Co-authored-by: amitkumar-amd <120512306+amitkumar-amd@users.noreply.github.com>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Yanyao Wang <yanywang@amd.com>
Co-authored-by: Wang, Yanyao <yanyao.wang@amd.com>
Co-authored-by: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com>
Co-authored-by: Young Hui - AMD <145490163+yhuiYH@users.noreply.github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: peter <peter.park@amd.com>
Co-authored-by: amitkumar-amd <120512306+amitkumar-amd@users.noreply.github.com>
2024-04-22 14:38:25 -06:00
peter
d5c24f30ee Pick link fixes into 6.1.0 (#3045)
* Update compatibility/precision-support links (#3030)

* Change links to component data type support pages from absolute to relative

* Fix rocPRIM data type support links

* Empty commit to trigger demo rebuild.

* Update links to rocAL component (#3033)

* Update links to rocAL component

* Change absolute rocm docs links to relative

* Add 'JAX for ROCm' link to index.md (#3034)

* Add JAX for ROCm link to index.md

* Reorder third-party libraries installation guides in index

* Update README.md (#3043)

* Update README.md

Fix rocSPARSE build link

* Update link to just general page, instead of anchor

---------

Co-authored-by: Young Hui - AMD <145490163+yhuiYH@users.noreply.github.com>
2024-04-18 16:31:23 -04:00
Sam Wu
d45af8cfc6 Merge roc-6.1.x into docs/6.1.0 (#3026)
* update manifest file for ROCm6.1 (#3024)

Co-authored-by: Wang, Yanyao <yanyao.wang@amd.com>

* Add ROCm version 6.1.0 to version list (#3023) (#3025)

---------

Co-authored-by: Yanyao Wang <yanywang@amd.com>
Co-authored-by: Wang, Yanyao <yanyao.wang@amd.com>
2024-04-17 09:11:56 -07:00
Sam Wu
fcf3037953 Merge branch 'develop' into docs/6.1.0 2024-04-16 16:06:48 -06:00
Sam Wu
de6b23da83 Sync develop branches 2024-04-16 15:56:14 -06:00
Sam Wu
04a314180f Add rocDecode version 2024-04-16 15:55:29 -06:00
Roopa Malavally
46e34bef8d Update CHANGELOG.md 2024-04-16 15:55:29 -06:00
Sam Wu
6d7daee9af Remove duplicate entry for Tensile 2024-04-16 15:55:29 -06:00
Lisa Delaney
2ea7ac694e Manually update release notes and changelog
Added known issue for ROCm compiler

https://ontrack-internal.amd.com/browse/SWDEV-454778

Added known issue for RVS

Added known issue for MI200 SRIOV

Updated PEBB test known issue for RVS

Added expansion for PEBB

Added PBQT known issue

expanded P2P Benchmark and Qualification Tool

Edited RVS known issue description based on Leo's input

Added MI300A fixed defect

Removed PEBB and Babel Stream from RVS known issue

Updated RCCL

Added rocm-cmake

Added rocRAND

Added rocWMMA

Added Tensile

Alan's change 1

Alan change to HIPIFY

Alan's edit 3 for MIOpen

OpenMP 2nd bullet fix - Alan edit

Alan's edit - ROCm Compiler

ROCm Validation Suite edits

Alan's edit rocSOLVER

Alan's edit to ROCTracer

Updated hipSPARSELt

Added hipTensor 1.2.0

Added hipTensor

data type correction

updated the RCCL version

Added bullets to known issues for consistency

Changed RAS to Fixed defect
2024-04-16 15:55:29 -06:00
peter
d5a5bd4da5 Cherry-pick: add rocDecode (#3020)
* Add rocDecode to What is ROCm? components list (#3016)

* Add rocDecode to What is ROCm? components list

* Fix typo -> 'Common Language Runtime'

* Change 'compute' to 'common'

* Add rocDecode to API libraries (#3019)
2024-04-16 16:20:00 -04:00
peter
3ffd2f78e9 Add rocDecode to API libraries (#3019) 2024-04-16 16:08:03 -04:00
peter
4b1574cbe2 Add rocDecode to What is ROCm? components list (#3016)
* Add rocDecode to What is ROCm? components list

* Fix typo -> 'Common Language Runtime'

* Change 'compute' to 'common'
2024-04-16 15:48:17 -04:00
Sam Wu
df6dcac677 Add best practice for updating changelog (#3013) 2024-04-15 14:11:59 -06:00
Young Hui - AMD
b9502cbf97 Update requirements.txt (#3012) 2024-04-12 17:18:02 -04:00
Young Hui - AMD
a29a457691 Merge develop into docs/6.1.0 (#3011)
* added Getting ROCm Source Files (#2952)

* added Accessing ROCm Source Files

* changed per comments

* Update README.md

implement dgaliffi suggestions

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* Update README.md

implement dgailifi suggestion

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* Update README.md

implement dgailifi suggestion

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* Update README.md

implement dgailifi suggestion

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* add default.xml link

* update README

---------

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* Bump rocm-docs-core from 0.35.1 to 0.36.0 in /docs/sphinx

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.35.1 to 0.36.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.35.1...v0.36.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update rocAL link

* Add Radeon and Radeon Pro specifications to the architecture reference (#2960)

* Expand architecture hardware specifications overview

Add supported Radeon and Radeon Pro GPUs

* Remove glossary from gpu architecture hardware specifications

* add reredirect extension
add redirection for openmp documentation

* Bump rocm-docs-core from 0.36.0 to 0.37.0 in /docs/sphinx

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.36.0 to 0.37.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.36.0...v0.37.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update using-gpu-sanitizer.md (#2970)

* Update using-gpu-sanitizer.md

added the link text

Added the example

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>

* Update autotag README

* Add check for empty string in prev lib ver; also fix typo in ROCm

* Update what-is-rocm.rst (#2984)

* Bump rocm-docs-core from 0.37.0 to 0.38.0 in /docs/sphinx (#2986)

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.37.0 to 0.38.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.37.0...v0.38.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* minor update to the gpu-mpi section (#2983)

provide the precise parameters required to run Open MPI with libfabric and rocm
support.

* Update using-gpu-sanitizer.md (#2991)

* Update using-gpu-sanitizer.md

Minor OpenMP update

* Update using-gpu-sanitizer.md

Updated note with additional information.

* Update using-gpu-sanitizer.md

* Update using-gpu-sanitizer.md

Moved the note to another section

* Update using-gpu-sanitizer.md

* Bump rocm-docs-core from 0.38.0 to 0.38.1 in /docs/sphinx (#3004)

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.38.0 to 0.38.1.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.38.0...v0.38.1)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* Bump idna from 3.4 to 3.7 in /docs/sphinx (#3007)

Bumps [idna](https://github.com/kjd/idna) from 3.4 to 3.7.
- [Release notes](https://github.com/kjd/idna/releases)
- [Changelog](https://github.com/kjd/idna/blob/master/HISTORY.rst)
- [Commits](https://github.com/kjd/idna/compare/v3.4...v3.7)

---
updated-dependencies:
- dependency-name: idna
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* Reorganize "What is ROCm?" page (#3006)

* add rocm software stack diagram to What is ROCm landing page

* restructure ROCm project list table

* clean up unnecessary hyphenation

* update What is ROCm stack diagram filename

* reorder rocm project list to reflect diagram

* update "What is ROCm?" image metadata

* change 'project list' to 'components'

* change 'project' to 'component'

* Update codeowners (#3008)

* Update links (#2992)

* Update links

* table cleanup

* cross-refs

* wordlist update

* add temp hard links

* verbiage

* docs(index.md): Disable MD051 for Sphinx Markdown anchor point

In general this rule should be followed to avoid broken links

* revert gpu-arch table, remove dropdowns, quick start hyphen removed on index.md

* revise opening text as per PR comment

---------

Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
Co-authored-by: Young Hui <young.hui@amd.com>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: randyh62 <42045079+randyh62@users.noreply.github.com>
Co-authored-by: David Galiffi <dgaliffi@amd.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
Co-authored-by: MKKnorr <matthias@streamhpc.com>
Co-authored-by: Bence Parajdi <bence@streamhpc.com>
Co-authored-by: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com>
Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Edgar Gabriel <edgargabriel@users.noreply.github.com>
Co-authored-by: peter <peter.park@amd.com>
Co-authored-by: Lisa <lisajdelaney@gmail.com>
2024-04-12 17:05:14 -04:00
Lisa
a29b54a453 Update links (#2992)
* Update links

* table cleanup

* cross-refs

* wordlist update

* add temp hard links

* verbiage

* docs(index.md): Disable MD051 for Sphinx Markdown anchor point

In general this rule should be followed to avoid broken links

* revert gpu-arch table, remove dropdowns, quick start hyphen removed on index.md

* revise opening text as per PR comment

---------

Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
Co-authored-by: Young Hui <young.hui@amd.com>
2024-04-12 15:36:23 -04:00
Sam Wu
5ea5d1d3f1 Update codeowners (#3008) 2024-04-12 14:54:09 -04:00
peter
da18980f63 Reorganize "What is ROCm?" page (#3006)
* add rocm software stack diagram to What is ROCm landing page

* restructure ROCm project list table

* clean up unnecessary hyphenation

* update What is ROCm stack diagram filename

* reorder rocm project list to reflect diagram

* update "What is ROCm?" image metadata

* change 'project list' to 'components'

* change 'project' to 'component'
2024-04-12 14:01:41 -04:00
dependabot[bot]
a6cffe5963 Bump idna from 3.4 to 3.7 in /docs/sphinx (#3007)
Bumps [idna](https://github.com/kjd/idna) from 3.4 to 3.7.
- [Release notes](https://github.com/kjd/idna/releases)
- [Changelog](https://github.com/kjd/idna/blob/master/HISTORY.rst)
- [Commits](https://github.com/kjd/idna/compare/v3.4...v3.7)

---
updated-dependencies:
- dependency-name: idna
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-04-12 09:28:13 -06:00
dependabot[bot]
18c4cb3ab5 Bump rocm-docs-core from 0.38.0 to 0.38.1 in /docs/sphinx
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.38.0 to 0.38.1.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.38.0...v0.38.1)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-11 09:37:08 -06:00
dependabot[bot]
01c91ac2ff Bump rocm-docs-core from 0.38.0 to 0.38.1 in /docs/sphinx (#3004)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.38.0 to 0.38.1.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.38.0...v0.38.1)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-04-11 09:24:20 -06:00
Roopa Malavally
13ad427c8e Update using-gpu-sanitizer.md (#2991)
* Update using-gpu-sanitizer.md

Minor OpenMP update

* Update using-gpu-sanitizer.md

Updated note with additional information.

* Update using-gpu-sanitizer.md

* Update using-gpu-sanitizer.md

Moved the note to another section

* Update using-gpu-sanitizer.md
2024-04-09 11:10:55 -07:00
Edgar Gabriel
00907151a2 minor update to the gpu-mpi section (#2983)
provide the precise parameters required to run Open MPI with libfabric and rocm
support.
2024-04-04 17:44:17 -04:00
dependabot[bot]
75da6927fc Bump rocm-docs-core from 0.37.0 to 0.38.0 in /docs/sphinx
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.37.0 to 0.38.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.37.0...v0.38.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-27 11:21:21 -06:00
dependabot[bot]
5bb25f62ed Bump rocm-docs-core from 0.37.0 to 0.38.0 in /docs/sphinx (#2986)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.37.0 to 0.38.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.37.0...v0.38.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-03-26 16:35:30 -06:00
Lisa
645e7a26aa Update what-is-rocm.rst (#2984) 2024-03-26 16:04:00 -06:00
Sam Wu
2cc67e9a4c Sync ROCm 2024-03-26 10:35:27 -06:00
Sam Wu
6ee6dd32f5 Add check for empty string in prev lib ver; also fix typo in ROCm 2024-03-22 16:50:57 -06:00
Sam Wu
40c69baf30 Update autotag README 2024-03-22 16:50:57 -06:00
Roopa Malavally
f298d60976 Update using-gpu-sanitizer.md (#2970)
* Update using-gpu-sanitizer.md

added the link text

Added the example

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2024-03-22 15:33:50 -06:00
Sam Wu
1425bd269c Merge pull request #2968 from ROCm/dependabot/pip/docs/sphinx/rocm-docs-core-0.37.0
Bump rocm-docs-core from 0.36.0 to 0.37.0 in /docs/sphinx
2024-03-21 09:17:09 -06:00
Sam Wu
870e6b4a40 update 2024-03-20 17:11:01 -06:00
dependabot[bot]
22121a9511 Bump rocm-docs-core from 0.36.0 to 0.37.0 in /docs/sphinx
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.36.0 to 0.37.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.36.0...v0.37.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-20 22:44:50 +00:00
Sam Wu
e6d74a7da2 update 2024-03-20 16:21:48 -06:00
Sam Wu
f39af205f0 Merge pull request #2966 from StreamHPC/redirect
redirect openmp links to cover the move
2024-03-20 14:37:22 -06:00
Bence Parajdi
e7865ebe89 add reredirect extension
add redirection for openmp documentation
2024-03-20 14:47:35 +01:00
MKKnorr
cac5df504c Add Radeon and Radeon Pro specifications to the architecture reference (#2960)
* Expand architecture hardware specifications overview

Add supported Radeon and Radeon Pro GPUs

* Remove glossary from gpu architecture hardware specifications
2024-03-18 13:34:20 -04:00
Sam Wu
e6b4715b4f Merge pull request #2962 from samjwu/rocal-link
Update rocAL link
2024-03-13 14:14:52 -06:00
Sam Wu
a9e4678d8b Update rocAL link 2024-03-13 11:03:00 -06:00
Sam Wu
75baa9fd18 Merge pull request #2959 from ROCm/dependabot/pip/docs/sphinx/rocm-docs-core-0.36.0
Bump rocm-docs-core from 0.35.1 to 0.36.0 in /docs/sphinx
2024-03-12 09:17:41 -06:00
dependabot[bot]
c84e22937f Bump rocm-docs-core from 0.35.1 to 0.36.0 in /docs/sphinx
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.35.1 to 0.36.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.35.1...v0.36.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-11 22:53:06 +00:00
randyh62
47192a92ba added Getting ROCm Source Files (#2952)
* added Accessing ROCm Source Files

* changed per comments

* Update README.md

implement dgaliffi suggestions

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* Update README.md

implement dgailifi suggestion

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* Update README.md

implement dgailifi suggestion

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* Update README.md

implement dgailifi suggestion

Co-authored-by: David Galiffi <dgaliffi@amd.com>

* add default.xml link

* update README

---------

Co-authored-by: David Galiffi <dgaliffi@amd.com>
2024-03-07 14:41:39 -08:00
87 changed files with 7415 additions and 1019 deletions

.github/CODEOWNERS

@@ -1,4 +1,4 @@
-* @saadrahim @Rmalavally @amd-aakash @zhang2amd @jlgreathouse @samjwu @MathiasMagnus @LisaDelaney
+* @amd-aakash @jlgreathouse @samjwu @ROCm/rocm-documentation
 # Documentation files
 docs/* @ROCm/rocm-documentation
 *.md @ROCm/rocm-documentation


@@ -3,16 +3,19 @@
version: 2
sphinx:
configuration: docs/conf.py
formats: [htmlzip, pdf]
build:
os: ubuntu-22.04
tools:
python: "3.10"
apt_packages:
- "doxygen"
- "graphviz" # For dot graphs in doxygen
python:
install:
- requirements: docs/sphinx/requirements.txt
build:
os: ubuntu-20.04
tools:
python: "3.8"
sphinx:
configuration: docs/conf.py
formats: []


@@ -18,6 +18,7 @@ APU
ASIC
ASICs
ASan
ASAN
ASm
ATI
AddressSanitizer
@@ -168,6 +169,7 @@ LLM
LLMs
LLVM
LM
LSan
LSAN
LTS
LoRA
@@ -356,6 +358,7 @@ VSkipped
Vanhoucke
Vulkan
WGP
WGPs
WX
WikiText
Wojna

File diff suppressed because it is too large.


@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal


@@ -19,6 +19,49 @@ ROCm supports programming models, such as OpenMP and OpenCL, and includes all ne
source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
(ML) frameworks, such as PyTorch and TensorFlow.
## Getting the ROCm Source Code
AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.
### Installing the repo tool
The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:
```bash
mkdir -p ~/bin/
curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
chmod a+x ~/bin/repo
```
**Note:** The ```~/bin/``` folder is used as an example. You can specify a different folder to install the repo tool into if you desire.
### Installing git-lfs
Some ROCm projects use the Git Large File Storage (LFS) format that may require you to install git-lfs. Refer to [Git Large File Storage](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md) for more information. For example, to install git-lfs for Ubuntu, use the following command:
```bash
sudo apt-get install git-lfs
```
### Downloading the ROCm source code
The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:
```bash
mkdir -p ~/ROCm/
cd ~/ROCm/
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.0.x
~/bin/repo sync
```
**Note:** Using this sample code will cause the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have ssh-keys configured on your machine for your GitHub ID prior to the download as explained at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).
### Building the ROCm source code
Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.
Each release of the ROCm software supports specific hardware and software configurations. Refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for the current supported hardware and OS.
## ROCm documentation
This repository contains the [manifest file](https://gerrit.googlesource.com/git-repo/+/HEAD/docs/manifest-format.md)
@@ -32,16 +75,14 @@ Source code for our documentation is located in the `/docs` folder of most ROCm
The ROCm documentation homepage is [rocm.docs.amd.com](https://rocm.docs.amd.com).
### Building our documentation
### Building the documentation
For a quick-start build, use the following code. For more options and detail, refer to
[Building documentation](./docs/contribute/building.md).
```bash
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
@@ -49,7 +90,6 @@ Alternatively, CMake build is supported.
```bash
cmake -B build
cmake --build build --target=doc
```


@@ -1,4 +1,4 @@
# Release notes
# ROCm 6.1 release highlights
<!-- Disable lints since this is an auto-generated file. -->
<!-- markdownlint-disable blanks-around-headers -->
<!-- markdownlint-disable no-duplicate-header -->
@@ -8,47 +8,245 @@
<!-- spellcheck-disable -->
This page contains the release notes for AMD ROCm Software.
The ROCm™ 6.1 release consists of new features and fixes to improve the stability and
performance of AMD Instinct™ MI300 GPU applications. Notably, we've added:
-------------------
* Full support for Ubuntu 22.04.4.
## ROCm 6.0.2
* **rocDecode**, a new ROCm component that provides high-performance video decode support for
AMD GPUs. With rocDecode, you can decode compressed video streams while keeping the resulting
YUV frames in video memory. With decoded frames in video memory, you can run video
post-processing using ROCm HIP, avoiding unnecessary data copies via the PCIe bus.
The ROCm 6.0.2 point release consists of minor bug fixes to improve the stability of MI300 GPU applications. This release introduces several new driver features for system qualification on our partner server offerings.
To learn more, refer to the rocDecode
[documentation](https://rocm.docs.amd.com/projects/rocDecode/en/latest/).
### Library changes in ROCm 6.0.2
## OS and GPU support changes
| Library | Version |
|---------|---------|
| AMDMIGraphX | ⇒ [2.8](https://github.com/ROCm/AMDMIGraphX/releases/tag/rocm-6.0.2) |
| hipBLAS | ⇒ [2.0.0](https://github.com/ROCm/hipBLAS/releases/tag/rocm-6.0.2) |
| hipBLASLt | ⇒ [0.6.0](https://github.com/ROCm/hipBLASLt/releases/tag/rocm-6.0.2) |
| hipCUB | ⇒ [3.0.0](https://github.com/ROCm/hipCUB/releases/tag/rocm-6.0.2) |
| hipFFT | ⇒ [1.0.13](https://github.com/ROCm/hipFFT/releases/tag/rocm-6.0.2) |
| hipRAND | ⇒ [2.10.17](https://github.com/ROCm/hipRAND/releases/tag/rocm-6.0.2) |
| hipSOLVER | ⇒ [2.0.0](https://github.com/ROCm/hipSOLVER/releases/tag/rocm-6.0.2) |
| hipSPARSE | ⇒ [3.0.0](https://github.com/ROCm/hipSPARSE/releases/tag/rocm-6.0.2) |
| hipSPARSELt | ⇒ [0.1.0](https://github.com/ROCm/hipSPARSELt/releases/tag/rocm-6.0.2) |
| hipTensor | ⇒ [1.1.0](https://github.com/ROCm/hipTensor/releases/tag/rocm-6.0.2) |
| MIOpen | ⇒ [2.19.0](https://github.com/ROCm/MIOpen/releases/tag/rocm-6.0.2) |
| rccl | ⇒ [2.15.5](https://github.com/ROCm/rccl/releases/tag/rocm-6.0.2) |
| rocALUTION | ⇒ [3.0.3](https://github.com/ROCm/rocALUTION/releases/tag/rocm-6.0.2) |
| rocBLAS | ⇒ [4.0.0](https://github.com/ROCm/rocBLAS/releases/tag/rocm-6.0.2) |
| rocFFT | ⇒ [1.0.25](https://github.com/ROCm/rocFFT/releases/tag/rocm-6.0.2) |
| rocm-cmake | ⇒ [0.11.0](https://github.com/ROCm/rocm-cmake/releases/tag/rocm-6.0.2) |
| rocPRIM | ⇒ [3.0.0](https://github.com/ROCm/rocPRIM/releases/tag/rocm-6.0.2) |
| rocRAND | ⇒ [3.0.0](https://github.com/ROCm/rocRAND/releases/tag/rocm-6.0.2) |
| rocSOLVER | ⇒ [3.24.0](https://github.com/ROCm/rocSOLVER/releases/tag/rocm-6.0.2) |
| rocSPARSE | ⇒ [3.0.2](https://github.com/ROCm/rocSPARSE/releases/tag/rocm-6.0.2) |
| rocThrust | ⇒ [3.0.0](https://github.com/ROCm/rocThrust/releases/tag/rocm-6.0.2) |
| rocWMMA | ⇒ [1.3.0](https://github.com/ROCm/rocWMMA/releases/tag/rocm-6.0.2) |
| Tensile | ⇒ [4.39.0](https://github.com/ROCm/Tensile/releases/tag/rocm-6.0.2) |
ROCm 6.1 adds the following operating system support:
#### hipFFT 1.0.13
* MI300A: Ubuntu 22.04.4 and RHEL 9.3
* MI300X: Ubuntu 22.04.4
hipFFT 1.0.13 for ROCm 6.0.2
Future releases will add additional operating systems to match the general offering. For older
generations of supported AMD Instinct products, we've added Ubuntu 22.04.4 support.
##### Changes
```{tip}
To view the complete list of supported GPUs and operating systems, refer to the system requirements
page for
[Linux](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html)
and
[Windows](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html).
```
* Removed the Git submodule for shared files between rocFFT and hipFFT; instead, just copy the files
over (this should help simplify downstream builds and packaging)
## Installation packages
This release includes a new set of packages for every module (all libraries and binaries default to
`DT_RPATH`). Package names have the suffix `rpath`; for example, the `rpath` variant of `rocminfo` is
`rocminfo-rpath`.
```{warning}
The new `rpath` packages will conflict with the default packages; they are meant to be used only in
environments where legacy `DT_RPATH` is the preferred form of linking (instead of `DT_RUNPATH`). We
do **not** recommend installing both sets of packages.
```
## ROCm components
The following sections highlight select component-specific changes. For additional details, refer to the
[Changelog](https://rocm.docs.amd.com/en/develop/about/CHANGELOG.html).
### AMD System Management Interface (SMI) Tool
* **New monitor command for GPU metrics**.
Use the monitor command to customize, capture, collect, and observe GPU metrics on
target devices. A brief illustrative invocation follows this list.
* **Integration with E-SMI**.
The EPYC™ System Management Interface In-band Library is a Linux C-library that provides in-band
user space software APIs to monitor and control your CPU's power, energy, performance, and other
system management functionality. This integration enables access to CPU metrics and telemetry
through the AMD SMI API and CLI tools.
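As a rough illustration of the monitor command mentioned in the first bullet above (the subcommand comes from this release note; the device-selection flag is an assumption, so check `amd-smi monitor --help` on your system):

```bash
# Stream the default set of GPU metrics with the new AMD SMI monitor command.
amd-smi monitor

# Assumed flag for limiting output to a single device; verify with --help.
amd-smi monitor -g 0
```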
### Composable Kernel (CK)
* **New architecture support**.
CK now supports the following architectures, enabling efficient image denoising on AMD GPUs:
gfx1030, gfx1100, gfx1031, gfx1101, gfx1032, gfx1102, gfx1034, gfx1103, gfx1035, gfx1036.
* **FP8 rounding logic is replaced with stochastic rounding**.
Stochastic rounding mimics a more realistic data behavior and improves model convergence.
### HIP
* **New environment variable to enable kernel run serialization**.
The default `HIP_LAUNCH_BLOCKING` value is `0` (disable), which causes kernels to run as defined in
the queue. When set to `1` (enable), the HIP runtime serializes the kernel queue, which behaves the
same as `AMD_SERIALIZE_KERNEL`.
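A minimal shell sketch of toggling this variable; `./my_hip_app` is a placeholder binary:

```bash
# Default: kernels run asynchronously, as defined in the queue.
HIP_LAUNCH_BLOCKING=0 ./my_hip_app

# Serialize the kernel queue (same effect as AMD_SERIALIZE_KERNEL).
HIP_LAUNCH_BLOCKING=1 ./my_hip_app
```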
### hipBLASLt
* **New GemmTuning extension parameter**. GemmTuning allows you to set a split-k value for each solution, which is more feasible for
performance tuning.
### hipFFT
* **New multi-GPU support for single-process transforms**. Multiple GPUs can be used to perform a transform in a single process. Note that this initial
implementation is a functional preview.
### HIPIFY
* **Skipped code blocks**: Code blocks that are skipped by the preprocessor are no longer hipified under the
`--default-preprocessor` option. To hipify everything, despite conditional preprocessor directives
(`#if`, `#ifdef`, `#ifndef`, `#elif`, or `#else`), don't use the `--default-preprocessor` or `--amap` options.
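A hedged sketch of the two modes, assuming the `hipify-clang` front end and a placeholder source file (a real run may also need include paths and a `--cuda-path` argument):

```bash
# With --default-preprocessor, blocks excluded by #if/#ifdef/#ifndef/#elif/#else
# are left alone and are not hipified.
hipify-clang --default-preprocessor kernel.cu

# To hipify everything regardless of conditional preprocessor directives,
# omit --default-preprocessor (and --amap).
hipify-clang kernel.cu
```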
### hipSPARSELt
* **Structured sparsity matrix support extensions**.
Structured sparsity matrices help speed up deep-learning workloads. We now support `B` as the
sparse matrix and `A` as the dense matrix in Sparse Matrix-Matrix Multiplication (SPMM). Prior to this
release, we only supported sparse (matrix A) x dense (matrix B) matrix multiplication.
### hipTensor
* **4D tensor permutation and contraction support**.
You can now perform tensor permutation on 4D tensors and 4D contractions for F16, BF16, and
Complex F32/F64 datatypes.
### MIGraphX
* **Improved performance for transformer-based models**.
We added support for FlashAttention, which benefits models like BERT, GPT, and Stable Diffusion.
* **New Torch-MIGraphX driver**.
This driver calls MIGraphX directly from PyTorch. It provides an `mgx_module` object that you can
invoke like any other Torch module, but which utilizes the MIGraphX inference engine internally.
Torch-MIGraphX supports FP32, FP16, and INT8 datatypes.
* **FP8 support**. We now offer functional support for inference in the FP8E4M3FNUZ datatype. You
can load an ONNX model in FP8E4M3FNUZ using C++ or Python APIs, or `migraphx-driver`.
You can quantize a floating point model to FP8 format by using the `--fp8` flag with `migraphx-driver`.
To accelerate inference, MIGraphX uses hardware acceleration on MI300 for FP8 by leveraging FP8
support in various backend kernel libraries.
### MIOpen
* **Improved performance for inference and convolutions**.
Inference support is now provided for Find 2.0 fusion plans. Additionally, we've enhanced the Number of
samples, Height, Width, and Channels (NHWC) convolution kernels for heuristics. NHWC stores data
in a format where the height and width dimensions come first, followed by channels.
### OpenMP
* **Implicit Zero-copy is triggered automatically in XNACK-enabled MI300A systems**.
Implicit Zero-copy behavior in `non unified_shared_memory` programs is triggered automatically in
XNACK-enabled MI300A systems (for example, when using the `HSA_XNACK=1` environment
variable). OpenMP supports the 'requires `unified_shared_memory`' directive to support programs
that don't want to copy data explicitly between the CPU and GPU. However, this requires that you add
these directives to every translation unit of the program. A brief run-time sketch follows this list.
* **New MI300 FP atomics**. Application performance can now improve by leveraging fast floating-point atomics on MI300 (gfx942).
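A minimal run-time sketch of the zero-copy scenario from the first bullet above; `./my_openmp_offload_app` is a placeholder for an OpenMP target-offload binary:

```bash
# Enable XNACK on an MI300A system so a non unified_shared_memory OpenMP
# program gets implicit zero-copy behavior at run time.
HSA_XNACK=1 ./my_openmp_offload_app
```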
### RCCL
* **NCCL 2.18.6 compatibility**.
RCCL is now compatible with NCCL 2.18.6, which includes increasing the maximum IB network interfaces to 32 and fixing network device ordering when creating communicators with only one GPU
per node.
* **Doubled simultaneous communication channels**.
We improved MI300X performance by increasing the maximum number of simultaneous
communication channels from 32 to 64.
### rocALUTION
* **New multiple node and GPU support**.
Unsmoothed and smoothed aggregations and Ruge-Stueben AMG now work with multiple nodes
and GPUs. For more information, refer to the
[API documentation](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/usermanual/solvers.html#unsmoothed-aggregation-amg).
### rocDecode
* **New ROCm component**.
rocDecode is ROCm's newest component, providing high-performance video decode support for AMD
GPUs. To learn more, refer to the
[documentation](https://rocm.docs.amd.com/projects/rocDecode/en/latest/).
### ROCm Compiler
* **Combined projects**. ROCm Device-Libs, ROCm Compiler Support, and hipCC are now located in
the `llvm-project/amd` subdirectory of AMD's fork of the LLVM project. Previously, these projects
were maintained in separate repositories. Note that the projects themselves will continue to be
packaged separately.
* **Split the 'rocm-llvm' package**. This package has been split into a required and an optional package:
* **rocm-llvm** (required): A package containing the essential binaries needed for compilation.
* **rocm-llvm-dev** (optional): A package containing binaries for compiler and application developers.
### ROCm Data Center Tool (RDC)
* **C++ upgrades**.
RDC was upgraded from C++11 to C++17 to enable a more modern C++ standard when writing RDC plugins.
### ROCm Performance Primitives (RPP)
* **New backend support**.
Audio processing support was added for the `HOST` backend, and 3D Voxel kernel support
for the `HOST` and `HIP` backends.
### ROCm Validation Suite
* **New datatype support**.
Added BF16 and FP8 datatypes based on General Matrix Multiply (GEMM) operations in the GPU Stress Test (GST) module. This provides additional performance benchmarking and stress testing based on the newly supported datatypes.
### rocSOLVER
* **New EigenSolver routine**.
Based on the Jacobi algorithm, a new EigenSolver routine was added to the library. This routine computes the eigenvalues and eigenvectors of a matrix with improved performance.
### ROCTracer
* **New versioning and callback enhancements**.
Improved to match versioning changes in HIP Runtime and supports runtime API callbacks and activity record logging. The APIs of different runtimes at different levels are considered different API domains with assigned domain IDs.
## Upcoming changes
* ROCm SMI will be deprecated in a future release. We advise **migrating to AMD SMI** now to
prevent future workflow disruptions.
* hipCC supports, by default, the following compiler invocation flags:
* `-mllvm -amdgpu-early-inline-all=true`
* `-mllvm -amdgpu-function-calls=false`
In a future ROCm release, hipCC will no longer add these flags. Instead, it will use the Clang
defaults:
* `-mllvm -amdgpu-early-inline-all=false`
* `-mllvm -amdgpu-function-calls=true`
To evaluate the impact of this change, include `--hipcc-func-supp` in your hipCC invocation.
For information on these flags, and the differences between hipCC and Clang, refer to
[ROCm Compiler Interfaces](https://rocm.docs.amd.com/en/latest/reference/rocmcc.html#rocm-compiler-interfaces).
* Future ROCm releases will not provide `clang-ocl`. For more information, refer to the
[`clang-ocl` README](https://github.com/ROCm/clang-ocl).
* The following operating systems will be supported in a future ROCm release. They are currently
only available in beta.
* RHEL 9.4
* RHEL 8.10
* SLES 15 SP6
* As of ROCm 6.2, we plan to **end support** for:
* Ubuntu 20.04.5
* SLES 15 SP4
* RHEL/CentOS 7.9

View File

@@ -2,69 +2,68 @@
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<remote name="KhronosGroup" fetch="https://github.com/KhronosGroup/" />
<default revision="refs/tags/rocm-6.0.2"
<default revision="refs/tags/rocm-6.1.0"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCm-->
<project path="ROCm-OpenCL-Runtime/api/opencl/khronos/icd" name="OpenCL-ICD-Loader" remote="KhronosGroup" />
<project name="ROCK-Kernel-Driver" />
<project name="ROCT-Thunk-Interface" />
<project name="ROCR-Runtime" />
<project name="ROCT-Thunk-Interface" />
<project name="amdsmi" />
<project name="rocm_smi_lib" />
<project name="rocm-core" />
<project name="rocm-cmake" />
<project name="rocminfo" />
<project name="rocm_bandwidth_test" />
<project name="rocprofiler" />
<project name="roctracer" />
<project path="ROCm-OpenCL-Runtime/api/opencl/khronos/icd" name="OpenCL-ICD-Loader" remote="KhronosGroup" revision="6c03f8b58fafd9dd693eaac826749a5cfad515f8" />
<project name="clang-ocl" />
<project name="rdc" />
<project name="rocm_bandwidth_test" />
<project name="rocm_smi_lib" />
<project name="rocm-core" />
<project name="rocminfo" />
<project name="rocprofiler" />
<project name="rocprofiler-register" />
<project name="roctracer" />
<!--HIP Projects-->
<project name="HIP" />
<project name="HIP-Examples" />
<project name="HIPIFY" />
<project name="clr" />
<project name="hipother" />
<project name="HIPIFY" />
<project name="HIPCC" />
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
<project name="half" />
<project name="llvm-project" />
<project name="ROCm-Device-Libs" />
<project name="ROCm-CompilerSupport" />
<project name="half" revision="37742ce15b76b44e4b271c1e66d13d2fa7bd003e" />
<!-- gdb projects -->
<project name="ROCgdb" />
<project name="ROCdbgapi" />
<project name="ROCgdb" />
<project name="rocr_debug_agent" />
<!-- ROCm Libraries -->
<project groups="mathlibs" name="rocBLAS" />
<project groups="mathlibs" name="AMDMIGraphX" />
<project groups="mathlibs" name="MIOpen" />
<project groups="mathlibs" name="MIVisionX" />
<project groups="mathlibs" name="ROCmValidationSuite" />
<project groups="mathlibs" name="Tensile" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="composable_kernel" />
<project groups="mathlibs" name="hipBLAS" />
<project groups="mathlibs" name="hipBLASLt" />
<project groups="mathlibs" name="rocFFT" />
<project groups="mathlibs" name="hipCUB" />
<project groups="mathlibs" name="hipFFT" />
<project groups="mathlibs" name="rocRAND" />
<project groups="mathlibs" name="hipRAND" />
<project groups="mathlibs" name="rocSPARSE" />
<project groups="mathlibs" name="hipSPARSELt" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipSPARSE" />
<project groups="mathlibs" name="rocALUTION" />
<project groups="mathlibs" name="rocThrust" />
<project groups="mathlibs" name="hipCUB" />
<project groups="mathlibs" name="rocPRIM" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="hipSPARSELt" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipfort" />
<project groups="mathlibs" name="rccl" />
<project name="MIOpen" />
<project name="composable_kernel" />
<project name="MIVisionX" />
<project name="rpp" />
<project name="hipfort" />
<project name="AMDMIGraphX" />
<project name="ROCmValidationSuite" />
<project groups="mathlibs" name="rocALUTION" />
<project groups="mathlibs" name="rocBLAS" />
<project groups="mathlibs" name="rocDecode" />
<project groups="mathlibs" name="rocFFT" />
<project groups="mathlibs" name="rocPRIM" />
<project groups="mathlibs" name="rocRAND" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="rocSPARSE" />
<project groups="mathlibs" name="rocThrust" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rocm-cmake" />
<project groups="mathlibs" name="rpp" />
<!-- Projects for OpenMP-Extras -->
<project name="aomp" path="openmp-extras/aomp" />
<project name="aomp-extras" path="openmp-extras/aomp-extras" />

View File

@@ -17,7 +17,7 @@ following section.
## ROCm component licenses
ROCm is released by Advanced Micro Devices, Inc. and is licensed per component separately.
ROCm is released by Advanced Micro Devices, Inc. (AMD) and is licensed per component separately.
The following table is a list of ROCm components with links to their respective license
terms. These components may include third party components subject to
additional licenses. Please review individual repositories for more information.
@@ -25,66 +25,71 @@ additional licenses. Please review individual repositories for more information.
<!-- spellcheck-disable -->
| Component | License |
|:---------------------|:-------------------------|
| [AMDMIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
| [HIPCC](https://github.com/ROCm/HIPCC/blob/develop/LICENSE.txt) | [MIT](https://github.com/ROCm/HIPCC/blob/develop/LICENSE.txt) |
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
| [MIOpenGEMM](https://github.com/ROCm/MIOpenGEMM/) | [MIT](https://github.com/ROCm/MIOpenGEMM/blob/master/LICENSE.txt) |
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/master/LICENSE.txt) |
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/master/LICENSE.txt) |
| [RCP](https://github.com/GPUOpen-Tools/radeon_compute_profiler/) | [MIT](https://github.com/GPUOpen-Tools/radeon_compute_profiler/blob/master/LICENSE) |
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/master/LICENSE.txt) |
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
| [ROCclr](https://github.com/ROCm/ROCclr/) | [MIT](https://github.com/ROCm/ROCclr/blob/develop/LICENSE.txt) |
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-master/LICENSE.txt) |
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
| [ROCm-CompilerSupport](https://github.com/ROCm/ROCm-CompilerSupport/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCm-CompilerSupport/blob/amd-stg-open/LICENSE.txt) |
| [ROCm-Device-Libs](https://github.com/ROCm/ROCm-Device-Libs/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCm-Device-Libs/blob/amd-stg-open/LICENSE.TXT) |
| [ROCm-OpenCL-Runtime/api/opencl/khronos/icd](https://github.com/KhronosGroup/OpenCL-ICD-Loader/) | [Apache 2.0](https://github.com/KhronosGroup/OpenCL-ICD-Loader/blob/main/LICENSE) |
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/ROCm-OpenCL-Runtime/) | [MIT](https://github.com/ROCm/ROCm-OpenCL-Runtime/blob/develop/LICENSE.txt) |
| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
| [AMDMIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
| [AMD Common Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/develop/LICENCE) |
| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
| [hipamd](https://github.com/ROCm/clr/tree/develop/hipamd) | [MIT](https://github.com/ROCm/clr/blob/develop/hipamd/LICENSE.txt) |
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/develop/opencl) | [MIT](https://github.com/ROCm/clr/blob/develop/opencl/LICENSE.txt) |
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
| [atmi](https://github.com/ROCm/atmi/) | [MIT](https://github.com/ROCm/atmi/blob/master/LICENSE.txt) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
| [clang-ocl](https://github.com/ROCm/clang-ocl/) | [MIT](https://github.com/ROCm/clang-ocl/blob/master/LICENSE) |
| [flang](https://github.com/ROCm/flang/) | [Apache 2.0](https://github.com/ROCm/flang/blob/master/LICENSE.txt) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/master/LICENSE.txt) |
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/master/LICENSE.txt) |
| [ROCR Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
| [hipFORT](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
| [hipamd](https://github.com/ROCm/hipamd/) | [MIT](https://github.com/ROCm/hipamd/blob/develop/LICENSE.txt) |
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/master/LICENSE) |
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/main/LICENSE.TXT) |
| [rccl](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
| [rdc](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/master/LICENSE) |
| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
| [rocm-cmake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
| [rocm_bandwidth_test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
| [rocm_smi_lib](https://github.com/ROCm/rocm_smi_lib/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_smi_lib/blob/master/License.txt) |
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/master/License.txt) |
| [rocprofiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
| [rocr_debug_agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/master/LICENSE.txt) |
| [roctracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
| rocm-llvm-alt | [AMD Proprietary License](https://www.amd.com/en/support/amd-software-eula)
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/develop/LICENSE) |
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/develop/License.txt) |
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/develop/LICENSE) |
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |
| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
| hsa-amd-aqlprofile | [AMD Software EULA](https://www.amd.com/en/legal/eula/amd-software-eula.html)
Open sourced ROCm components are released via public GitHub
repositories, packages on https://repo.radeon.com and other distribution channels.
Proprietary products are only available on https://repo.radeon.com. Currently, only
one component of ROCm, rocm-llvm-alt is governed by a proprietary license.
repositories, packages on [https://repo.radeon.com](https://repo.radeon.com) and other distribution channels.
Proprietary products are only available on [https://repo.radeon.com](https://repo.radeon.com). Currently, only
one component of ROCm, `rocm-llvm-alt` is governed by a proprietary license.
Proprietary components are organized in a proprietary subdirectory in the package
repositories to distinguish from open sourced packages.
@@ -92,7 +97,7 @@ repositories to distinguish from open sourced packages.
The following additional terms and conditions apply to your use of ROCm technical documentation.
```
©2023 Advanced Micro Devices, Inc. All rights reserved.
©2023 - 2024 Advanced Micro Devices, Inc. All rights reserved.
The information presented in this document is for informational purposes only
and may contain technical inaccuracies, omissions, and typographical errors. The
@@ -125,8 +130,8 @@ companies.
:::{attention}
AQL Profiler and AOCC CPU optimization are both provided in binary form, each
subject to the license agreement enclosed in the directory for the binary and is
available here: `/opt/rocm/share/doc/rocm-llvm-alt/EULA`. By using, installing,
subject to the license agreement enclosed in the directory for the binary available
in `/opt/rocm/share/doc/hsa-amd-aqlprofile/EULA`. By using, installing,
copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
the terms and conditions of this license agreement. If you do not agree to the
terms of this agreement, do not install, copy or use the AQL Profiler and/or the
@@ -134,9 +139,8 @@ AOCC CPU Optimizations.
:::
For the rest of the ROCm packages, you can find the licensing information at the
following location: `/opt/rocm/share/doc/<component-name>/`
following location: `/opt/rocm/share/doc/<component-name>/` or in the locations
specified in the preceding table.
For example, you can fetch the licensing information of the `_amd_comgr_`
component (Code Object Manager) from the `amd_comgr` folder. A file named
`LICENSE.txt` contains the license details at:
`/opt/rocm-5.4.3/share/doc/amd_comgr/LICENSE.txt`
For example, you can fetch the licensing information of the `amd_comgr`
component (Code Object Manager) from the `/opt/rocm/share/doc/amd_comgr/LICENSE.txt` file.

View File

@@ -0,0 +1,127 @@
.. meta::
:description: ROCm compatibility matrix
:keywords: AMD, GPU, architecture, hardware, compatibility, requirements
**************************************************************************************
Compatibility matrix
**************************************************************************************
Use this matrix to view the ROCm compatibility across successive major and minor releases.
.. container:: format-big-table
.. csv-table::
:header: "ROCm Version", "6.1.0", "6.0.0"
:stub-columns: 1
:doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`, "Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 9.4 [#red-hat94]_, 9.3, 9.2","RHEL 9.3, 9.2"
,"RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,"SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,CentOS 7.9,CentOS 7.9
,,
:doc:`GFX Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3
,CDNA2,CDNA2
,CDNA,CDNA
,RDNA3,RDNA3
,RDNA2,RDNA2
,,
:doc:`GFX Card <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100
,gfx1030,gfx1030
,gfx942 [#]_, gfx942 [#]_
,gfx90a,gfx90a
,gfx908,gfx908
,,
ECOSYSTEM SUPPORT:,,
:doc:`PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`,"2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`,"2.15, 2.14, 2.13","2.14, 2.13, 2.12"
:doc:`JAX <rocm-install-on-linux:how-to/3rd-party/jax-install>`,0.4.26,0.4.26
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.14.1
,,
3RD PARTY COMMUNICATION LIBS:,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.2.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.14.1,>=1.14.1
,,
3RD PARTY ALGORITHM LIBS:,,
Thrust,2.1.0,2.0.1
CUB,2.1.0,2.0.1
,,
ML & COMPUTER VISION LIBS:,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.9.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.1.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,2.5.0,2.5.0
:doc:`rocDecode <rocdecode:index>`,0.5.0,N/A
:doc:`ROCm Performance Primitives (RPP) <rpp:index>`,1.5.0,1.4.0
,,
COMMUNICATION:,,
:doc:`RCCL <rccl:index>`,2.18.6,2.18.3
,,
MATH LIBS:,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.1.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,0.7.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.14,1.0.13
:doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,2.1.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,3.0.1,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,3.1.1,3.0.3
:doc:`rocBLAS <rocblas:index>`,4.1.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.27,1.0.23
:doc:`rocRAND <rocrand:index>`,3.0.1,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.25.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,3.1.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,1.4.0,1.3.0
`Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.39.0
,,
PRIMITIVES:,,
:doc:`hipCUB <hipcub:index>`,3.1.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,1.2.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,3.1.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.0
,,
SUPPORT LIBS:,,
`hipother <https://github.com/ROCm/hipother>`_,6.1.40091,6.0.32830
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.12.0,0.11.0
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.1.0,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240125.3.30,20231016.2.245
,,
TOOLS:,,
:doc:`AMD SMI <amdsmi:index>`,24.4.1,23.4.2
:doc:`HIPIFY <hipify:index>`,17.0.0,17.0.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.71.0,0.71.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60100,2.0.0
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.3.0,N/A
:doc:`ROCTracer <roctracer:index>`,4.1.60100,4.1.0
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,14.1.0,13.2.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.0.0,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.1.0,rocm-6.0.0
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3
:doc:`TransferBench <transferbench:index>`,1.48,1.46
,,
COMPILERS:,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,0.5.0,0.5.0
`Flang <https://github.com/ROCm/flang>`_,17.0.0.24103,17.0.0.23483
`llvm-project <https://github.com/ROCm/llvm-project>`_,17.0.0.24103,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,17.0.0.24103,17.0.0.23483
,,
RUNTIMES:,,
:doc:`HIP <hip:index>`,6.1.40091,6.0.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0
:doc:`ROCR-Runtime <rocr-runtime:index>`,1.13.0,1.12.0
.. rubric:: Footnotes
.. [#red-hat94] **For ROCm 6.1** - RHEL 9.4 is supported only on AMD Instinct MI300A.
.. [#] **For ROCm 6.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#] **For ROCm 6.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9 and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.

View File

@@ -1,17 +1,18 @@
.. meta::
:description: Supported data types in ROCm
:keywords: int8, float8, float8 (E4M3), float8 (E5M2), bfloat8, float16, half, bfloat16, tensorfloat32, float, float32, float64, double, AMD, ROCm, AMDGPU
.. _rocm-supported-data-types:
:keywords: int8, float8, float8 (E4M3), float8 (E5M2), bfloat8, float16, half, bfloat16, tensorfloat32, float,
float32, float64, double, AMD, ROCm, AMDGPU
*************************************************************
ROCm data type specifications
Precision support
*************************************************************
Use the following sections to identify data types and HIP types ROCm™ supports.
Integral types
==========================================
The signed and unsigned integral types that are supported by ROCm are listed in the following table,
The signed and unsigned integral types that are supported by ROCm are listed in the following table,
together with their corresponding HIP type and a short description.
@@ -46,7 +47,7 @@ Floating-point types
The floating-point types that are supported by ROCm are listed in the following table, together with
their corresponding HIP type and a short description.
.. image:: ../../data/about/compatibility/floating-point-data-types.png
.. image:: ../data/about/compatibility/floating-point-data-types.png
:alt: Supported floating-point types
.. list-table::
@@ -403,37 +404,37 @@ description, refer to the corresponding library data type support page.
- int32
- int64
*
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
- ✅/✅
- ❌/❌
- ❌/❌
- ❌/❌
*
- rocRAND (:doc:`details<rocrand:data-type-support>`)
- rocRAND (:doc:`details <rocrand:data-type-support>`)
- -/✅
- -/✅
- -/✅
- -/✅
*
- hipRAND (:doc:`details<hiprand:data-type-support>`)
- hipRAND (:doc:`details <hiprand:data-type-support>`)
- -/✅
- -/✅
- -/✅
- -/✅
*
- rocPRIM (:doc:`details<rocprim:data-type-support>`)
- rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
*
- hipCUB (:doc:`details<hipcub:data-type-support>`)
- hipCUB (:doc:`details <hipcub:data-type-support>`)
- ✅/✅
- ✅/✅
- ✅/✅
- ✅/✅
*
- rocThrust (:doc:`details<rocthrust:data-type-support>`)
- rocThrust (:doc:`details <rocthrust:data-type-support>`)
- ✅/✅
- ✅/✅
- ✅/✅
@@ -455,7 +456,7 @@ description, refer to the corresponding library data type support page.
- float32
- float64
*
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
- ❌/❌
- ❌/❌
- ✅/✅
@@ -464,7 +465,7 @@ description, refer to the corresponding library data type support page.
- ❌/❌
- ❌/❌
*
- rocRAND (:doc:`details<rocrand:data-type-support>`)
- rocRAND (:doc:`details <rocrand:data-type-support>`)
- -/❌
- -/❌
- -/✅
@@ -473,7 +474,7 @@ description, refer to the corresponding library data type support page.
- -/✅
- -/✅
*
- hipRAND (:doc:`details<hiprand:data-type-support>`)
- hipRAND (:doc:`details <hiprand:data-type-support>`)
- -/❌
- -/❌
- -/✅
@@ -482,7 +483,7 @@ description, refer to the corresponding library data type support page.
- -/✅
- -/✅
*
- rocPRIM (:doc:`details<rocprim:data-type-support>`)
- rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
- ❌/❌
- ❌/❌
- ✅/✅
@@ -491,7 +492,7 @@ description, refer to the corresponding library data type support page.
- ✅/✅
- ✅/✅
*
- hipCUB (:doc:`details<hipcub:data-type-support>`)
- hipCUB (:doc:`details <hipcub:data-type-support>`)
- ❌/❌
- ❌/❌
- ✅/✅
@@ -500,7 +501,7 @@ description, refer to the corresponding library data type support page.
- ✅/✅
- ✅/✅
*
- rocThrust (:doc:`details<rocthrust:data-type-support>`)
- rocThrust (:doc:`details <rocthrust:data-type-support>`)
- ❌/❌
- ❌/❌
- ⚠️/⚠️
@@ -531,7 +532,7 @@ description, refer to the corresponding library data type support page.
- int32
- int64
*
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
- ❌
- ❌
- ✅
@@ -554,7 +555,7 @@ description, refer to the corresponding library data type support page.
- float32
- float64
*
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
- ❌
- ❌
- ❌

View File

@@ -0,0 +1,14 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="AMD ROCm documentation">
<meta name="keywords" content="documentation, guides, installation, compatibility, support,
reference, ROCm, AMD">
</head>
# Using compiler features
The following topics describe using specific features of the compilation tools:
* [Using AddressSanitizer](./using-gpu-sanitizer.md)
* [Compiler disambiguation](./compiler-disambiguation.md)
* [OpenMP support in ROCm](../about/compatibility/openmp.md)

View File

@@ -33,8 +33,8 @@ Units (CU). The MI250 GCD has 104 active CUs. Each compute unit is further
subdivided into four SIMD units that process SIMD instructions of 16 data
elements per instruction (for the FP64 data type). This enables the CU to
process 64 work items (a so-called “wavefront”) at a peak clock frequency of 1.7
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 45.3
TFLOPS for vector instructions. The MI250 compute units also provide specialized
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 22.6
TFLOPS for vector instructions. This equates to 45.3 TFLOPS for vector instructions for both GCDs together. The MI250 compute units also provide specialized
execution units (also called matrix cores), which are geared toward executing
matrix operations like matrix-matrix multiplications. For FP64, the peak
performance of these units amounts to 90.5 TFLOPS.

View File

@@ -5,13 +5,13 @@
libraries, instrumented applications, AMD, ROCm">
</head>
# Using the LLVM ASan on a GPU (beta release)
# Using the AddressSanitizer on a GPU (beta release)
The LLVM AddressSanitizer (ASan) provides a process that allows developers to detect runtime addressing errors in applications and libraries. The detection is achieved using a combination of compiler-added instrumentation and runtime techniques, including function interception and replacement.
Until now, the LLVM ASan process was only available for traditional purely CPU applications. However, ROCm has extended this mechanism to additionally allow the detection of some addressing errors on the GPU in heterogeneous applications. Ideally, developers should treat heterogeneous HIP and OpenMP applications exactly like pure CPU applications. However, this simplicity has not been achieved yet.
This document provides documentation on using ROCm ASan.
For information about LLVM ASan, see the [LLVM documentation](https://clang.llvm.org/docs/AddressSanitizer.html).
:::{note}
@@ -26,17 +26,28 @@ Recommendations for doing this are:
* Compile as many application and dependent library sources as possible using an AMD-built clang-based compiler such as `amdclang++`.
* Add the following options to the existing compiler and linker options:
* `-fsanitize=address` - enables instrumentation
* `-shared-libsan` - use shared version of runtime
* `-g` - add debug info for improved reporting
* Explicitly use `xnack+` in the offload architecture option. For example, `--offload-arch=gfx90a:xnack+`
Other architectures are allowed, but their device code will not be instrumented and a warning will be emitted.
* `-shared-libsan` - use shared version of runtime
* `-g` - add debug info for improved reporting
* Explicitly use `xnack+` in the offload architecture option. For example, `--offload-arch=gfx90a:xnack+`
Other architectures are allowed, but their device code will not be instrumented, and a warning will be issued.
:::{tip}
It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
:::
:::{note}
When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
:::
### About compilation time
When `-fsanitize=address` is used, the LLVM compiler adds instrumentation code around every memory operation. This added code must be handled by all of the downstream components of the compiler toolchain and results in increased overall compilation time. This increase is especially evident in the AMDGPU device compiler and has in a few instances raised the compile time to an unacceptable level.
When `-fsanitize=address` is used, the LLVM compiler adds instrumentation code around every memory operation. This added code must be handled by all downstream components of the compiler toolchain, and results in increased overall compilation time. This increase is especially evident in the AMDGPU device compiler and has in a few instances increased compile time to an unacceptable level.
There are a few options if the compile time becomes unacceptable:
@@ -56,9 +67,9 @@ For a complete ROCm GPU Sanitizer installation, including packages, instrumented
## Using AMD-supplied ASan instrumented libraries
ROCm releases have optional packages that contain additional ASan instrumented builds of the ROCm libraries (usually found in `/opt/rocm-<version>/lib`). The instrumented libraries have identical names to the regular uninstrumented libraries, and are located in `/opt/rocm-<version>/lib/asan`.
These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with g++. The preexisting build options are used but, as described above, additional options are used: `-fsanitize=address`, `-shared-libsan` and `-g`.
These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with `g++`. The preexisting build options are used but, as described above, additional options are used: `-fsanitize=address`, `-shared-libsan` and `-g`.
These additional libraries avoid additional developer effort to locate repositories, identify the correct branch, check out the correct tags, and other efforts needed to build the libraries from the source. And they extend the ability of the process to detect addressing errors into the ROCm libraries themselves.
These instrumented libraries avoid additional developer effort to locate repositories, identify the correct branch, check out the correct tags, and other efforts needed to build the libraries from the source. And they extend the ability of the process to detect addressing errors into the ROCm libraries themselves.
When adjusting an application build to add instrumentation, linking against these instrumented libraries is unnecessary. For example, any `-L` `/opt/rocm-<version>/lib` compiler options need not be changed. However, the instrumented libraries should be used when the application is run. It is particularly important that the instrumented language runtimes, like `libamdhip64.so` and `librocm-core.so`, are used; otherwise, device invalid access detections may not be reported.
@@ -86,16 +97,25 @@ If it does not appear, when executed the application will quickly output an ASan
* Ensure that the application `llvm-symbolizer` can be executed, and that it is located in `/opt/rocm-<version>/llvm/bin`. This executable is not strictly required, but if found is used to translate ("symbolize") a host-side instruction address into a more useful function name, file name, and line number (assuming the application has been built to include debug information).
There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASAN runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. It must be noted that these options only affect the host ASAN runtime. The device runtime only currently supports the default settings for the few relevant options.
There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASan runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. It must be noted that these options only affect the host ASan runtime. The device runtime only currently supports the default settings for the few relevant options.
There are two `ASAN_OPTION` flags of particular note.
There are three `ASAN_OPTION` flags of note.
* `halt_on_error=0/1 default 1`.
This tells the ASAN runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option and if an error is detected within one of them, but halt_on_error is set to 0, more undefined behavior will occur.
This tells the ASan runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option and if an error is detected within one of them, but halt_on_error is set to 0, more undefined behavior will occur.
* `detect_leaks=0/1 default 1`.
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSAN). Unfortunately, for heterogeneous applications, this default will result in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSAN suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSan). For heterogeneous applications, this default results in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSan suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
* `quarantine_size_mb=N default 256`
This option defines the number of megabytes (MB) `N` of memory that the ASan runtime will hold after it is `freed` to detect use-after-free situations. This memory is unavailable for other purposes. The default of 256 MB may be too small to detect some use-after-free situations, especially given that the large size of many GPU memory allocations may push `freed` allocations out of quarantine before the attempted use.
:::{note}
Setting the value of `quarantine_size_mb` larger may enable more problematic uses to be detected, but at the cost of reducing memory available for other purposes.
:::
## Runtime overhead
@@ -110,11 +130,12 @@ before the address is actually accessed by a load, store, or atomic
instruction.
This checking involves an additional load to "shadow" memory which
records whether the address is "poisoned" or not, and additional logic
that decides whether to produce an detection report or not.
that decides whether to produce a detection report or not.
This extra runtime work can cause the application to slow down by
a factor of three or more, depending on how many memory accesses are
executed.
For heterogeneous applications, the shadow memory must be accessible by all devices
and this can mean that shadow accesses from some devices may be more costly
than non-shadow accesses.
@@ -134,7 +155,7 @@ instrumentation.
## Runtime reporting
It is not the intention of this document to provide a detailed explanation of all of the types of reports that can be output by the ASan runtime. Instead, the focus is on the differences between the standard reports for CPU issues, and reports for GPU issues.
It is not the intention of this document to provide a detailed explanation of all types of reports that can be output by the ASan runtime. Instead, the focus is on the differences between the standard reports for CPU issues, and reports for GPU issues.
An invalid address detection report for the CPU always starts with
@@ -181,7 +202,7 @@ or
currently may include one or two surprising CPU side tracebacks mentioning :`hostcall`". This is due to how `malloc` and `free` are implemented for GPU code and these call stacks can be ignored.
### Running with `rocgdb`
## Running ASan with `rocgdb`
`rocgdb` can be used to further investigate ASan detected errors, with some preparation.
@@ -198,7 +219,7 @@ This is solved by setting environment variable `LD_PRELOAD` to the path to the A
amdclang++ -print-file-name=libclang_rt.asan-x86_64.so
```
It is also recommended to set the environment variable `HIP_ENABLE_DEFERRED_LOADING=0` before debugging HIP applications.
You should also set the environment variable `HIP_ENABLE_DEFERRED_LOADING=0` before debugging HIP applications.
After starting `rocgdb` breakpoints can be set on the ASan runtime error reporting entry points of interest. For example, if an ASan error report includes
@@ -233,18 +254,180 @@ $ rocgdb <path to application>
(gdb) c
```
### Using ASan with a short HIP application
## Using ASan with a short HIP application
Refer to the following example to use ASan with a short HIP application,
Consider the following simple and short demo of using the Address Sanitizer with a HIP application:
https://github.com/Rmalavally/rocm-examples/blob/Rmalavally-patch-1/LLVM_ASAN/Using-Address-Sanitizer-with-a-Short-HIP-Application.md
```C++
### Known issues with using GPU sanitizer
#include <cstdlib>
#include <hip/hip_runtime.h>
* Red zones must have limited size and it is possible for an invalid access to completely miss a red zone and not be detected.
__global__ void
set1(int *p)
{
int i = blockDim.x*blockIdx.x + threadIdx.x;
p[i] = 1;
}
int
main(int argc, char **argv)
{
int m = std::atoi(argv[1]);
int n1 = std::atoi(argv[2]);
int n2 = std::atoi(argv[3]);
int c = std::atoi(argv[4]);
int *dp;
hipMalloc(&dp, m*sizeof(int));
hipLaunchKernelGGL(set1, dim3(n1), dim3(n2), 0, 0, dp);
int *hp = (int*)malloc(c * sizeof(int));
hipMemcpy(hp, dp, m*sizeof(int), hipMemcpyDeviceToHost);
hipDeviceSynchronize();
hipFree(dp);
free(hp);
std::puts("Done.");
return 0;
}
```
This application will attempt to access invalid addresses for certain command line arguments. In particular, if `m < n1 * n2` some device threads will attempt to access
unallocated device memory.
Or, if `c < m`, the `hipMemcpy` function will copy past the end of the `malloc` allocated memory.
**Note**: The `hipcc` compiler is used here for simplicity.
Compiling without XNACK results in a warning.
```bash
$ hipcc -g --offload-arch=gfx90a:xnack- -fsanitize=address -shared-libsan mini.hip -o mini
clang++: warning: ignoring '-fsanitize=address' option for offload arch 'gfx90a:xnack-' as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored]
```
The binary compiled above will run, but the GPU code will not be instrumented and the `m < n1 * n2` error will not be detected. Switching to `--offload-arch=gfx90a:xnack+` in the command above results in a warning-free compilation and an instrumented application. After setting `PATH`, `LD_LIBRARY_PATH` and `HSA_XNACK` as described earlier, a check of the binary with `ldd` yields the following,
```bash
$ ldd mini
linux-vdso.so.1 (0x00007ffd1a5ae000)
libclang_rt.asan-x86_64.so => /opt/rocm-6.1.0-99999/llvm/lib/clang/17.0.0/lib/linux/libclang_rt.asan-x86_64.so (0x00007fb9c14b6000)
libamdhip64.so.5 => /opt/rocm-6.1.0-99999/lib/asan/libamdhip64.so.5 (0x00007fb9bedd3000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fb9beba8000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fb9bea59000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fb9bea3e000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fb9be84a000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fb9be844000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fb9be821000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fb9be817000)
libamd_comgr.so.2 => /opt/rocm-6.1.0-99999/lib/asan/libamd_comgr.so.2 (0x00007fb9b4382000)
libhsa-runtime64.so.1 => /opt/rocm-6.1.0-99999/lib/asan/libhsa-runtime64.so.1 (0x00007fb9b3b00000)
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007fb9b3af3000)
/lib64/ld-linux-x86-64.so.2 (0x00007fb9c2027000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fb9b3ad7000)
libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fb9b3aa7000)
libelf.so.1 => /lib/x86_64-linux-gnu/libelf.so.1 (0x00007fb9b3a89000)
libdrm.so.2 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm.so.2 (0x00007fb9b3a70000)
libdrm_amdgpu.so.1 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1 (0x00007fb9b3a62000)
```
This confirms that the address sanitizer runtime is linked in, and the ASan instrumented version of the runtime libraries are used.
Checking the `PATH` yields
```bash
$ which llvm-symbolizer
/opt/rocm-6.1.0-99999/llvm/bin/llvm-symbolizer
```
Lastly, a check of the OS kernel version yields
```bash
$ uname -rv
5.15.0-73-generic #80~20.04.1-Ubuntu SMP Wed May 17 14:58:14 UTC 2023
```
which indicates that the required HMM support (kernel version > 5.6) is available. This completes the necessary setup. Running with `m = 100`, `n1 = 11`, `n2 = 10` and `c = 100` should produce
a report for an invalid access by the last 10 threads.
```bash
=================================================================
==3141==ERROR: AddressSanitizer: heap-buffer-overflow on amdgpu device 0 at pc 0x7fb1410d2cc4
WRITE of size 4 in workgroup id (10,0,0)
#0 0x7fb1410d2cc4 in set1(int*) at /home/dave/mini/mini.cpp:0:10
Thread ids and accessed addresses:
00 : 0x7fb14371d190 01 : 0x7fb14371d194 02 : 0x7fb14371d198 03 : 0x7fb14371d19c 04 : 0x7fb14371d1a0 05 : 0x7fb14371d1a4 06 : 0x7fb14371d1a8 07 : 0x7fb14371d1ac
08 : 0x7fb14371d1b0 09 : 0x7fb14371d1b4
0x7fb14371d190 is located 0 bytes after 400-byte region [0x7fb14371d000,0x7fb14371d190)
allocated by thread T0 here:
#0 0x7fb151c76828 in hsa_amd_memory_pool_allocate /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:692:3
#1 ...
#12 0x7fb14fb99ec4 in hipMalloc /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:568:3
#13 0x226630 in hipError_t hipMalloc<int>(int**, unsigned long) /opt/rocm-6.1.0-99999/include/hip/hip_runtime_api.h:8367:12
#14 0x226630 in main /home/dave/mini/mini.cpp:19:5
#15 0x7fb14ef02082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
Shadow bytes around the buggy address:
0x7fb14371cf00: ...
=>0x7fb14371d180: 00 00[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa
0x7fb14371d200: ...
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
...
==3141==ABORTING
```
Running with `m = 100`, `n1 = 10`, `n2 = 10` and `c = 99` should produce a report for an invalid copy.
```shell
=================================================================
==2817==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x514000150dcc at pc 0x7f5509551aca bp 0x7ffc90a7ae50 sp 0x7ffc90a7a610
WRITE of size 400 at 0x514000150dcc thread T0
#0 0x7f5509551ac9 in __asan_memcpy /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3
#1 ...
#9 0x7f5507462a28 in hipMemcpy_common(void*, void const*, unsigned long, hipMemcpyKind, ihipStream_t*) /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:637:10
#10 0x7f5507464205 in hipMemcpy /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:642:3
#11 0x226844 in main /home/dave/mini/mini.cpp:22:5
#12 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
#13 0x22605d in _start (/home/dave/mini/mini+0x22605d)
0x514000150dcc is located 0 bytes after 396-byte region [0x514000150c40,0x514000150dcc)
allocated by thread T0 here:
#0 0x7f5509553dcf in malloc /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
#1 0x226817 in main /home/dave/mini/mini.cpp:21:21
#2 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
SUMMARY: AddressSanitizer: heap-buffer-overflow /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3 in __asan_memcpy
Shadow bytes around the buggy address:
0x514000150b00: ...
=>0x514000150d80: 00 00 00 00 00 00 00 00 00[04]fa fa fa fa fa fa
0x514000150e00: ...
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
...
==2817==ABORTING
```
## Known issues with using GPU sanitizer
* Red zones must have limited size. It is possible for an invalid access to completely miss a red zone and not be detected.
* Lack of detection or false reports can be caused by the runtime not properly maintaining red zone shadows.
* Lack of detection on the GPU might also be due to the implementation not instrumenting accesses to all GPU specific address spaces. For example, in the current implementation accesses to "private" or "stack" variables on the GPU are not instrumented, and accesses to HIP shared variables (also known as "local data store" or "LDS") are also not instrumented.
* It can also be the case that a memory fault is hit for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside of any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
* It can also be the case that a memory fault is reported for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
* There is currently a bug which can result in memory faults being reported when running instrumented device code which makes use of `malloc`, `free`, `new`, or `delete`.
* There is currently a bug which can result in undefined symbols being reported at compile time when instrumented device code makes use of `new` and `delete`.

View File

@@ -38,8 +38,8 @@ latex_elements = {
project = "ROCm Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = "6.0.1"
release = "6.0.1"
version = "6.1.0"
release = "6.1.0"
setting_all_article_info = True
all_article_info_os = ["linux", "windows"]
all_article_info_author = ""
@@ -49,12 +49,12 @@ article_pages = [
{
"file":"about/release-notes",
"os":["linux", "windows"],
"date":"2024-01-31"
"date":"2024-04-16"
},
{
"file":"about/CHANGELOG",
"os":["linux", "windows"],
"date":"2024-01-31"
"date":"2024-04-16"
},
{"file":"install/windows/install-quick", "os":["windows"]},
@@ -84,22 +84,47 @@ article_pages = [
{"file":"how-to/system-debugging", "os":["linux"]},
{"file":"how-to/tuning-guides", "os":["linux", "windows"]},
{"file":"rocm-a-z", "os":["linux", "windows"]},
{"file":"how-to/rocm-for-ai/index", "os":["linux"]},
{"file":"how-to/rocm-for-ai/install", "os":["linux"]},
{"file":"how-to/rocm-for-ai/train-a-model", "os":["linux"]},
{"file":"how-to/rocm-for-ai/deploy-your-model", "os":["linux"]},
{"file":"how-to/rocm-for-ai/hugging-face-models", "os":["linux"]},
{"file":"how-to/rocm-for-hpc/index", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/index", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/overview", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/fine-tuning-and-inference", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/llm-inference-frameworks", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/model-acceleration-libraries", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/model-quantization", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/optimizing-triton-kernel", "os":["linux"]},
{"file":"how-to/llm-fine-tuning-optimization/profiling-and-debugging", "os":["linux"]},
]
exclude_patterns = ['temp']
external_toc_path = "./sphinx/_toc.yml"
extensions = ["rocm_docs"]
extensions = ["rocm_docs", "sphinx_reredirects"]
external_projects_current_project = "rocm"
html_theme = "rocm_docs_theme"
html_theme_options = {"flavor": "rocm-docs-home"}
html_static_path = ["sphinx/static/css"]
html_css_files = ["rocm_custom.css"]
html_title = "ROCm Documentation"
html_theme_options = {
"link_main_doc": False
}
redirects = {
"reference/openmp/openmp": "../../about/compatibility/openmp.html"
}

View File

@@ -36,7 +36,7 @@ To make edits to our documentation via PR, follow these steps:
git clone git@github.com:ROCm/ROCm.git
```
* Add your fork to this local copy of the repository. Run:
* Optionally add your fork to this local copy of the repository by running:
```bash
git remote add <name-of-my-fork> <git@github.com:my-username/ROCm.git>
@@ -45,43 +45,33 @@ To make edits to our documentation via PR, follow these steps:
To get the URL of your fork, go to your GitHub profile, select the fork and click the green 'Code'
button (the same process you followed to get the main GitHub repository URL).
4. Check out the **develop** branch and run 'git pull' (and/or 'git pull origin develop' to ensure your
local version has the most recent content.
4. Change directory into your local copy of the repository, and run ``git pull`` (or ``git pull origin develop``) to ensure your local copy has the most recent content.
5. Create a new branch.
5. Create and checkout a new branch using the following command:
```bash
git checkout -b my-new-branch
git checkout -b <branch_name>
```
6. Make your changes locally using your preferred code editor. Follow the guidelines listed on the
6. Change directory into the `./docs` folder and make any documentation changes locally using your preferred code editor. Follow the guidelines listed on the
[documentation structure](./doc-structure.md) page.
7. (optional) We recommend running a local test build to ensure the content looks the way you expect.
In your terminal, run the following commands from within your cloned repository:
7. Optionally run a local test build of the documentation to ensure the content builds and looks as expected. In your terminal, run the following commands from within the `./docs` folder of your cloned repository:
```bash
cd docs/ # The other commands are run from within the ./docs folder
pip3 install -r sphinx/requirements.txt # You only need to run this command once
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
The build files are located in the `docs/_build` folder. To preview your build, open the index file
(`docs/_build/html/index.html`) file. For more information, see
[Building documentation](building.md). To learn
more about our build tools, see
[Documentation toolchain](toolchain.md).
The build output files are located in the `docs/_build` folder. To preview your build, open the index file
(`docs/_build/html/index.html`) file. For more information, see [Building documentation](building.md). To learn
more about our build tools, see [Documentation toolchain](toolchain.md).
8. Commit your changes and push them to GitHub. Run:
8. Commit your changes and push them to GitHub by running:
```bash
git add <path-to-my-modified-file> # To add all modified files, you can use: git add .
git commit -m "my-updates"
git push <name-of-my-fork>
```

View File

@@ -12,8 +12,7 @@ There are four standard ways to provide feedback on this repository.
All contributions to ROCm documentation should arrive via the
[GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)
targeting the develop branch of the repository. If you are unable to contribute
via the GitHub Flow, feel free to email us at [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
targeting the develop branch of the repository.
For more in-depth information on creating a pull request (PR), see
[Contributing](./contributing.md).
@@ -30,7 +29,3 @@ and follow along on via public announcements.
Issues on existing or absent documentation can be filed in
[GitHub Issues](https://github.com/ROCm/ROCm/issues).
## Email
Send other feedback or questions to [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).

Binary image files changed (previews not shown): multiple images used by the documentation were added or updated, and one file diff was suppressed because its lines are too long.

View File

@@ -1,22 +0,0 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="Deep learning using ROCm">
<meta name="keywords" content="deep learning, frameworks, installation, PyTorch, TensorFlow,
MAGMA, AMD, ROCm">
</head>
# Deep learning guide
The following sections cover the different framework installations for ROCm and
deep-learning applications. The following image provides
the sequential flow for the use of each framework. Refer to the ROCm Compatible
Frameworks Release Notes for each framework's most current release notes at
{doc}`Third-party support<rocm-install-on-linux:reference/3rd-party-support-matrix>`.
![ROCm Compatible Frameworks Flowchart](../data/how-to/magma005.png "ROCm Compatible Frameworks")
## Frameworks installation
* {doc}`PyTorch for ROCm<rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
* {doc}`TensorFlow for ROCm<rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
* {doc}`MAGMA for ROCm<rocm-install-on-linux:how-to/3rd-party/magma-install>`

View File

@@ -0,0 +1,68 @@
.. meta::
:description: How to install deep learning frameworks for ROCm
:keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI
********************************************
Installing deep learning frameworks for ROCm
********************************************
ROCm provides a comprehensive ecosystem for deep learning development, including
:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
deep learning frameworks and libraries such as PyTorch, TensorFlow, JAX, and MAGMA. ROCm works closely with these
frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
The following guides cover installation processes for ROCm-aware deep learning frameworks.
.. grid::
.. grid-item::
:columns: 3
:doc:`PyTorch for ROCm <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
.. grid-item::
:columns: 3
:doc:`TensorFlow for ROCm <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
:doc:`JAX for ROCm <rocm-install-on-linux:how-to/3rd-party/jax-install>`
.. grid-item::
:columns: 3
:doc:`MAGMA for ROCm <rocm-install-on-linux:how-to/3rd-party/magma-install>`
.. grid-item::
:columns: 3
.. grid-item::
:columns: 3
The following chart steps through typical workflows for installing deep learning frameworks for ROCm.
.. image:: ../data/how-to/framework_install_2024_05_23.png
:alt: Flowchart for installing ROCm-aware machine learning frameworks
:align: center
Find information on version compatibility and framework release notes in :doc:`Third-party support matrix
<rocm-install-on-linux:reference/3rd-party-support-matrix>`.
.. note::
For guidance on installing ROCm itself, refer to :doc:`ROCm installation for Linux <rocm-install-on-linux:index>`.
Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
through the following guides.
* :doc:`rocm-for-ai/index`
* :doc:`llm-fine-tuning-optimization/index`

View File

@@ -260,5 +260,5 @@ To run an OSU benchmark using multiple nodes, use the following code:
.. code-block:: shell
export LD_LIBRARY_PATH=$OMPI_DIR/lib:$OFI_DIR/lib64:/opt/rocm/lib
$OMPI_DIR/bin/mpirun -np 2 \
$OMPI_DIR/bin/mpirun --mca pml ob1 --mca btl_ofi_mode 2 -np 2 \
./c/mpi/pt2pt/standard/osu_bw D D

View File

@@ -0,0 +1,20 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, inference, usage, tutorial
*************************
Fine-tuning and inference
*************************
Fine-tuning using ROCm involves leveraging AMD's GPU-accelerated :doc:`libraries <rocm:reference/api-libraries>` and
:doc:`tools <rocm:reference/rocm-tools>` to optimize and train deep learning models. ROCm provides a comprehensive
ecosystem for deep learning development, including open-source libraries for optimized deep learning operations and
ROCm-aware versions of :doc:`deep learning frameworks <../deep-learning-rocm>` such as PyTorch, TensorFlow, and JAX.
Single-accelerator systems, such as a machine equipped with a single accelerator or GPU, are commonly used for
smaller-scale deep learning tasks, including fine-tuning pre-trained models and running inference on moderately
sized datasets. See :doc:`single-gpu-fine-tuning-and-inference`.
Multi-accelerator systems, on the other hand, consist of multiple accelerators working in parallel. These systems are
typically used in LLMs and other large-scale deep learning tasks where performance, scalability, and the handling of
massive datasets are crucial. See :doc:`multi-gpu-fine-tuning-and-inference`.

View File

@@ -0,0 +1,37 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial
*******************************************
Fine-tuning LLMs and inference optimization
*******************************************
ROCm empowers the fine-tuning and optimization of large language models, making them accessible and efficient for
specialized tasks. ROCm supports the broader AI ecosystem to ensure seamless integration with open frameworks,
models, and tools.
For more information, see `What is ROCm? <https://rocm.docs.amd.com/en/latest/what-is-rocm.html>`_
Throughout the following topics, this guide discusses the goals and :ref:`challenges of fine-tuning a large language
model <fine-tuning-llms-concept-challenge>` like Llama 2. Then, it introduces :ref:`common methods of optimizing your
fine-tuning <fine-tuning-llms-concept-optimizations>` using techniques like LoRA with libraries like PEFT. In the
sections that follow, you'll find practical guides on libraries and tools to accelerate your fine-tuning.
- :doc:`Conceptual overview of fine-tuning LLMs <overview>`
- :doc:`Fine-tuning and inference <fine-tuning-and-inference>` using a
:doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` or
:doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` system.
- :doc:`Model quantization <model-quantization>`
- :doc:`Model acceleration libraries <model-acceleration-libraries>`
- :doc:`LLM inference frameworks <llm-inference-frameworks>`
- :doc:`Optimizing with Composable Kernel <optimizing-with-composable-kernel>`
- :doc:`Optimizing Triton kernels <optimizing-triton-kernel>`
- :doc:`Profiling and debugging <profiling-and-debugging>`

View File

@@ -0,0 +1,211 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, inference, vLLM, TGI, text generation inference
************************
LLM inference frameworks
************************
This section discusses how to implement `vLLM <https://docs.vllm.ai/en/latest>`_ and `Hugging Face TGI
<https://huggingface.co/docs/text-generation-inference/en/index>`_ using
:doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` and
:doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` systems.
.. _fine-tuning-llms-vllm:
vLLM inference
==============
vLLM is renowned for its paged attention algorithm, which can reduce memory consumption and increase throughput. Instead
of allocating GPU high-bandwidth memory (HBM) for the maximum output token length of a model, vLLM's paged attention
allocates GPU HBM dynamically for the actual decoding lengths. This paged attention is also effective when multiple
requests share the same key and value contents, such as with large beam search widths or many parallel requests.
vLLM also incorporates many modern LLM acceleration and quantization algorithms, such as Flash Attention, HIP and CUDA
graphs, tensor parallel multi-GPU, GPTQ, AWQ, and token speculation.
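Beyond the API server workflow described below, vLLM also exposes an offline Python API that is useful as a quick sanity check. The following is a minimal sketch; the model ID and sampling settings are illustrative assumptions, so substitute the model you intend to serve.
.. code-block:: python
# Offline batched inference with vLLM (illustrative sketch).
# Assumes vLLM is installed as described below; the model ID is a placeholder.
from vllm import LLM, SamplingParams
llm = LLM(model="NousResearch/Llama-2-7b-chat-hf", dtype="float16")
sampling_params = SamplingParams(temperature=0.0, max_tokens=80)
outputs = llm.generate(["What is AMD Instinct?"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)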
Installing vLLM
---------------
.. _fine-tuning-llms-vllm-rocm-docker-image:
1. Run the following commands to build a Docker image ``vllm-rocm``.
.. code-block:: shell
git clone https://github.com/vllm-project/vllm.git
cd vllm
docker build -f Dockerfile.rocm -t vllm-rocm .
.. tab-set::
.. tab-item:: vLLM on a single-accelerator system
:sync: single
2. To use vLLM as an API server to serve inference requests, first start a container using the :ref:`vllm-rocm
Docker image <fine-tuning-llms-vllm-rocm-docker-image>`.
.. code-block:: shell
docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
3. Inside the container, start the API server to run on a single accelerator on port 8000 using the following command.
.. code-block:: shell
python -m vllm.entrypoints.api_server --model /app/model --dtype float16 --port 8000 &
The following log message displayed in your command line indicates that the server is listening for requests.
.. image:: ../../data/how-to/llm-fine-tuning-optimization/vllm-single-gpu-log.png
:alt: vLLM API server log message
:align: center
4. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "What is AMD Instinct?", "max_tokens": 80, "temperature": 0.0 }'
You should receive a response like the following.
.. code-block:: text
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
.. tab-item:: vLLM on a multi-accelerator system
:sync: multi
2. To use vLLM as an API server to serve inference requests, first start a container using the :ref:`vllm-rocm
Docker image <fine-tuning-llms-vllm-rocm-docker-image>`.
.. code-block:: shell
docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
bash
3. To run the API server on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` parameter. For example, to use two
GPUs, start the API server using the following command.
.. code-block:: shell
python -m vllm.entrypoints.api_server --model /app/model --dtype float16 -tp 2 --port 8000 &
4. To run multiple API server instances, specify a different port for each server, and use ``ROCR_VISIBLE_DEVICES`` to
isolate each instance to a different accelerator.
For example, to run two API servers, one on port 8000 using GPUs 0 and 1 and one on port 8001 using GPUs 2 and 3, use
commands like the following.
.. code-block:: shell
ROCR_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8000 &
ROCR_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model /data/llama-2-7b-chat-hf --dtype float16 -tp 2 --port 8001 &
5. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate -H "Content-Type: application/json" -d '{"prompt": "What is AMD Instinct?", "max_tokens": 80, "temperature": 0.0 }'
You should receive a response like the following.
.. code-block:: text
{"text":["What is AMD Instinct?\nAmd Instinct is a brand new line of high-performance computing (HPC) processors from Advanced Micro Devices (AMD). These processors are designed to deliver unparalleled performance for HPC workloads, including scientific simulations, data analytics, and machine learning.\nThe Instinct lineup includes a range of processors, from the entry-level Inst"]}
.. _fine-tuning-llms-tgi:
Hugging Face TGI
================
Text Generation Inference (TGI) is an LLM serving framework from Hugging
Face. It supports most high-performance LLM
acceleration algorithms, such as Flash Attention, Paged Attention,
CUDA/HIP graph, tensor parallel multi-GPU, GPTQ, AWQ, and token
speculation.
.. tip::
In addition to LLM serving capability, TGI also provides the `Text Generation Inference benchmarking tool
<https://github.com/huggingface/text-generation-inference/blob/main/benchmark/README.md>`_.
Install TGI
-----------
1. Launch the TGI Docker container in the host machine.
.. code-block:: shell
docker run --name tgi --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 256g \
--net host -v $PWD:/data \
--entrypoint "/bin/bash" \
--env HUGGINGFACE_HUB_CACHE=/data \
ghcr.io/huggingface/text-generation-inference:latest-rocm
.. tab-set::
.. tab-item:: TGI on a single-accelerator system
:sync: single
2. Inside the container, launch a model using TGI server on a single accelerator.
.. code-block:: shell
export ROCM_USE_FLASH_ATTN_V2_TRITON=True
text-generation-launcher --model-id NousResearch/Meta-Llama-3-70B --dtype float16 --port 8000 &
3. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate_stream -X POST -d '{"inputs":"What is AMD Instinct?","parameters":{"max_new_tokens":20}}' -H 'Content-Type: application/json'
You should receive a response like the following.
.. code-block:: shell
data:{"index":20,"token":{"id":304,"text":" in","logprob":-1.2822266,"special":false},"generated_text":" AMD Instinct is a new family of data center GPUs designed to accelerate the most demanding workloads in","details":null}
.. tab-item:: TGI on a multi-accelerator system
2. Inside the container, launch a model using TGI server on multiple accelerators (4 in this case).
.. code-block:: shell
export ROCM_USE_FLASH_ATTN_V2_TRITON=True
text-generation-launcher --model-id NousResearch/Meta-Llama-3-8B --dtype float16 --port 8000 --num-shard 4 &
3. To test, send it a curl request containing a prompt.
.. code-block:: shell
curl http://localhost:8000/generate_stream -X POST -d '{"inputs":"What is AMD Instinct?","parameters":{"max_new_tokens":20}}' -H 'Content-Type: application/json'
You should receive a response like the following.
.. code-block:: shell
data:{"index":20,"token":{"id":304,"text":" in","logprob":-1.2773438,"special":false},"generated_text":" AMD Instinct is a new family of data center GPUs designed to accelerate the most demanding workloads in","details":null}

View File

@@ -0,0 +1,251 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, Flash Attention, Hugging Face, xFormers, vLLM, PyTorch
****************************
Model acceleration libraries
****************************
This section discusses model acceleration techniques and libraries to improve memory efficiency and performance.
Flash Attention 2
=================
Flash Attention is a technique designed to reduce memory movements between GPU SRAM and high-bandwidth memory (HBM). By
using a tiling approach, Flash Attention 2 improves memory locality in the nested loops of query, key, and value
computations within the Attention modules of LLMs. These modules include Multi-Head Attention (MHA), Group-Query
Attention (GQA), and Multi-Query Attention (MQA). This reduction in memory movements significantly decreases the
time-to-first-token (TTFT) latency for large batch sizes and long prompt sequences, thereby enhancing overall
performance.
.. image:: ../../data/how-to/llm-fine-tuning-optimization/attention-module.png
:alt: Attention module of a large language module utilizing tiling
:align: center
Installing Flash Attention 2
----------------------------
ROCm provides two different implementations of Flash Attention 2 modules. They can be deployed interchangeably:
* ROCm `Composable Kernel <https://github.com/ROCm/composable_kernel/tree/develop/example/01_gemm>`_
(CK) Flash Attention 2
* `OpenAI Triton <https://triton-lang.org/main/index.html>`_ Flash Attention 2
.. tab-set::
.. tab-item:: CK Flash Attention 2
To install CK Flash Attention 2, use the following commands.
.. code-block:: shell
# Install from source
git clone https://github.com/ROCm/flash-attention.git
cd flash-attention/
GPU_ARCHS=gfx942 python setup.py install #MI300 series
Hugging Face Transformers can easily deploy the CK Flash Attention 2 module by passing the argument
``attn_implementation="flash_attention_2"`` to the ``from_pretrained`` method.
.. code-block:: python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = "NousResearch/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype=torch.float16, use_fast=False)
inputs = tokenizer('Today is', return_tensors='pt').to(device)
model_eager = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="eager").cuda(device)
model_ckFAv2 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda(device)
print("eager GQA: ", tokenizer.decode(model_eager.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
print("ckFAv2 GQA: ", tokenizer.decode(model_ckFAv2.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
# eager GQA: Today is the day of the Lord, and we are the
# ckFAv2 GQA: Today is the day of the Lord, and we are the
.. tab-item:: Triton Flash Attention 2
The Triton Flash Attention 2 module is implemented in Python and uses OpenAI's JIT compiler. This module has been
upstreamed into the vLLM serving toolkit, discussed in :doc:`llm-inference-frameworks`.
1. To install Triton Flash Attention 2 and run the benchmark, use the following commands.
.. code-block:: shell
# Install from the source
pip uninstall pytorch-triton-rocm triton -y
git clone https://github.com/ROCm/triton.git
cd triton/python
GPU_ARCHS=gfx942 python setup.py install #MI300 series
pip install matplotlib pandas
2. To test, run the Triton Flash Attention 2 performance benchmark.
.. code-block:: shell
# Test the triton FA v2 kernel
python perf-kernels/flash-attention.py
# Sample results
fused-attention-fwd-d128:
BATCH HQ HK N_CTX_Q N_CTX_K TFLOPS
0 16.0 16.0 16.0 1024.0 1024.0 287.528411
1 8.0 16.0 16.0 2048.0 2048.0 287.490806
2 4.0 16.0 16.0 4096.0 4096.0 345.966031
3 2.0 16.0 16.0 8192.0 8192.0 361.369510
4 1.0 16.0 16.0 16384.0 16384.0 356.873720
5 2.0 48.0 48.0 1024.0 1024.0 216.916235
6 2.0 48.0 48.0 2048.0 1024.0 271.027578
7 2.0 48.0 48.0 4096.0 8192.0 337.367372
8 2.0 48.0 48.0 8192.0 4096.0 363.481649
9 2.0 48.0 48.0 16384.0 8192.0 375.013622
10 8.0 16.0 16.0 1989.0 15344.0 321.791333
11 4.0 16.0 16.0 4097.0 163.0 122.104888
12 2.0 16.0 16.0 8122.0 2159.0 337.060283
13 1.0 16.0 16.0 16281.0 7.0 5.234012
14 2.0 48.0 48.0 1021.0 1020.0 214.657425
15 2.0 48.0 48.0 2001.0 2048.0 314.429118
16 2.0 48.0 48.0 3996.0 9639.0 330.411368
17 2.0 48.0 48.0 8181.0 1021.0 324.614980
xFormers
========
xFormers also improves the performance of attention modules. Although xFormers attention performs very similarly to
Flash Attention 2 due to its tiling behavior of query, key, and value, it's widely used for LLMs and
Stable Diffusion models with the Hugging Face Diffusers library.
Installing CK xFormers
----------------------
Use the following commands to install CK xFormers.
.. code-block:: shell
# Install from source
git clone https://github.com/ROCm/xformers.git
cd xformers/
git submodule update --init --recursive
PYTORCH_ROCM_ARCH=gfx942 python setup.py install #Instinct MI300-series
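After installation, the memory-efficient attention operator can be called directly. The following is a minimal sketch with random tensors; shapes follow xFormers' expected ``(batch, sequence, heads, head_dim)`` layout.
.. code-block:: python
import torch
import xformers.ops as xops
# Query, key, and value in (batch, seq_len, num_heads, head_dim) layout.
q = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)
out = xops.memory_efficient_attention(q, k, v)
print(out.shape)  # torch.Size([2, 1024, 8, 64])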
PyTorch built-in acceleration
=============================
`PyTorch compilation
mode <https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html>`__
synthesizes the model into a graph and then lowers it to prime
operators. These operators are compiled using TorchInductor, which uses
OpenAI Triton as a building block for GPU acceleration. One advantage of
PyTorch compilation mode is that its GPU kernels are written in Python,
making modifying and extending them easier. PyTorch compilation mode
often delivers higher performance, as model operations are fused before
runtime, which allows for easy deployment of high-performance kernels.
PyTorch compilation
-------------------
To utilize the PyTorch compilation mode, specific layers of the model
must be explicitly assigned as compilation targets. In the case of LLM,
where autoregressive token decoding generates dynamically changing
key/value sizes, limiting the key/value size to a static dimension,
``max_cache_length``, is necessary to utilize the performance benefits
of the PyTorch compilation.
.. code-block:: python
# Sample script to run LLM with the static key-value cache and PyTorch compilation
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional
import os
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name = "NousResearch/Meta-Llama-3-8B"
prompts = []
for b in range(1):
prompts.append("New york city is where "
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device).eval()
inputs = tokenizer(prompts, return_tensors="pt").to(model.device)
def decode_one_tokens(model, cur_token, input_pos, cache_position):
logits = model(cur_token, position_ids=input_pos, cache_position=cache_position, return_dict=False, use_cache=True)[0]
new_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
return new_token
batch_size, seq_length = inputs["input_ids"].shape
# Static key-value cache
max_cache_length = 1024
max_new_tokens = 10
model._setup_cache(StaticCache, batch_size, max_cache_len=max_cache_length)
cache_position = torch.arange(seq_length, device=device)
generated_ids = torch.zeros(batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=device)
generated_ids[:, cache_position] = inputs["input_ids"].to(device).to(torch.int)
logits = model(**inputs, cache_position=cache_position, return_dict=False, use_cache=True)[0]
next_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
# torch compilation
decode_one_tokens = torch.compile(decode_one_tokens, mode="max-autotune-no-cudagraphs",fullgraph=True)
generated_ids[:, seq_length] = next_token[:, 0]
cache_position = torch.tensor([seq_length + 1], device=device)
with torch.no_grad():
for _ in range(1, max_new_tokens):
with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
next_token = decode_one_tokens(model, next_token.clone(), None, cache_position)
generated_ids[:, cache_position] = next_token.int()
cache_position += 1
.. _fine-tuning-llms-pytorch-tunableop:
PyTorch TunableOp
------------------
ROCm PyTorch (2.2.0 and later) allows users to use high-performance ROCm
GEMM kernel libraries through PyTorch's built-in TunableOp options.
This enables users to automatically pick up the best-performing GEMM
kernels from :doc:`rocBLAS <rocblas:index>` and :doc:`hipBLASLt <hipblaslt:index>` libraries during runtime.
During warm-up runs or offline profiling steps, users can create a GEMM Table
that enumerates the kernel information. During the model's run, the best-performing kernel from the GEMM table is
substituted for ``torch.nn.functional.linear(input, weight, bias=None)``. The
`TunableOp README <https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md>`_
describes the options.
.. code-block:: python
# In the shell: turn on TunableOp by setting this environment variable before launching Python
export PYTORCH_TUNABLEOP_ENABLED=1
# In Python: run a GEMM so that a tuning result is recorded
import torch
import torch.nn as nn
import torch.nn.functional as F
A = torch.rand(100, 20, device="cuda")
W = torch.rand(200, 20, device="cuda")
Out = F.linear(A, W)
print(Out.size())
# Sample contents of the resulting tunableop_results0.csv file
Validator,PT_VERSION,2.4.0
Validator,ROCM_VERSION,6.1.0.0-82-5fabb4c
Validator,HIPBLASLT_VERSION,0.7.0-1549b021
Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack-
Validator,ROCBLAS_VERSION,4.1.0-cefa4a9b-dirty
GemmTunableOp_float_TN,tn_200_100_20,Gemm_Rocblas_32323,0.00669595
.. image:: ../../data/how-to/llm-fine-tuning-optimization/tunableop.png
:alt: GEMM and TunableOp
:align: center
Learn more about optimizing kernels with TunableOp in
:ref:`Optimizing Triton kernels <fine-tuning-llms-triton-tunableop>`.

View File

@@ -0,0 +1,259 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, quantization, GPTQ, transformers, bitsandbytes
*****************************
Model quantization techniques
*****************************
Quantization reduces the model size compared to its native full-precision version, making it easier to fit large models
onto accelerators or GPUs with limited memory usage. This section explains how to perform LLM quantization using GPTQ
and bitsandbytes on AMD Instinct hardware.
.. _fine-tune-llms-gptq:
GPTQ
====
GPTQ is a post-training quantization technique where each row of the weight matrix is quantized independently to find a
version of the weights that minimizes error. These weights are quantized to ``int4`` but are restored to ``fp16`` on the
fly during inference. This can reduce memory usage by roughly a factor of four. A speedup in inference is also expected because
GPTQ models use a lower bit width, so less data needs to be transferred.
Before setting up the GPTQ configuration in Transformers, ensure the `AutoGPTQ <https://github.com/AutoGPTQ/AutoGPTQ>`_ library
is installed.
Installing AutoGPTQ
-------------------
The AutoGPTQ library implements the GPTQ algorithm.
#. Use the following command to install the latest stable release of AutoGPTQ from pip.
.. code-block:: shell
# This installs a pre-built wheel for a specific ROCm version.
pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm573/
Or, install AutoGPTQ from source for the appropriate ROCm version (for example, ROCm 6.1).
.. code-block:: shell
# Clone the source code.
git clone https://github.com/AutoGPTQ/AutoGPTQ.git
cd AutoGPTQ
# Speed up the compilation by specifying PYTORCH_ROCM_ARCH to target device.
PYTORCH_ROCM_ARCH=gfx942 ROCM_VERSION=6.1 pip install .
# Show the package after the installation
#. Run ``pip show auto-gptq`` to print information for the installed ``auto-gptq`` package. Its output should look like
this:
.. code-block:: shell
Name: auto-gptq
Version: 0.8.0.dev0+rocm6.1
...
Using GPTQ with AutoGPTQ
------------------------
#. Run the following code snippet.
.. code-block:: python
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
base_model_name = "NousResearch/Llama-2-7b-hf"
quantized_model_name = "llama-2-7b-hf-gptq"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
)
]
print(examples)
The resulting examples should be a list of dictionaries whose keys are ``input_ids`` and ``attention_mask``.
#. Set up the quantization configuration using the following snippet.
.. code-block:: python
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False,
)
#. Load the non-quantized model using the AutoGPTQ class and run the quantization.
.. code-block:: python
# Import auto_gptq class.
from auto_gptq import AutoGPTQForCausalLM
# Load non-quantized model.
base_model = AutoGPTQForCausalLM.from_pretrained(base_model_name, quantize_config, device_map = "auto")
base_model.quantize(examples)
# Save quantized model.
base_model.save_quantized(quantized_model_name)
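To reload the quantized checkpoint for inference, AutoGPTQ provides a ``from_quantized`` method. The following is a brief sketch that reuses the names defined above.
.. code-block:: python
# Reload the quantized checkpoint saved above and run a short generation (sketch).
from auto_gptq import AutoGPTQForCausalLM
quantized_model = AutoGPTQForCausalLM.from_quantized(quantized_model_name, device="cuda:0")
inputs = tokenizer("AMD Instinct accelerators are", return_tensors="pt").to("cuda:0")
print(tokenizer.decode(quantized_model.generate(**inputs, max_new_tokens=20)[0]))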
Using GPTQ with Hugging Face Transformers
------------------------------------------
#. To perform a GPTQ quantization using Hugging Face Transformers, create a ``GPTQConfig`` instance, set the
number of bits to quantize to, and provide a dataset to calibrate the weights.
.. code-block:: python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
base_model_name = " NousResearch/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)
#. Load a model to quantize using ``AutoModelForCausalLM`` and pass the
``gptq_config`` to its ``from_pretrained`` method. Set ``device_map="auto"`` to
automatically offload the model to available GPU resources.
.. code-block:: python
quantized_model = AutoModelForCausalLM.from_pretrained(
base_model_name,
device_map="auto",
quantization_config=gptq_config)
#. Once the model is quantized, you can push the model and tokenizer to the Hugging Face Hub for easy sharing and access.
.. code-block:: python
quantized_model.push_to_hub("llama-2-7b-hf-gptq")
tokenizer.push_to_hub("llama-2-7b-hf-gptq")
Or, you can save the model locally using the following snippet.
.. code-block:: python
quantized_model.save_pretrained("llama-2-7b-gptq")
tokenizer.save_pretrained("llama-2-7b-gptq")
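A locally saved GPTQ checkpoint can later be reloaded like any other Transformers model; the quantization configuration is read from the saved files. A short sketch:
.. code-block:: python
# Reload the locally saved GPTQ checkpoint (sketch).
from transformers import AutoModelForCausalLM, AutoTokenizer
reloaded_model = AutoModelForCausalLM.from_pretrained("llama-2-7b-gptq", device_map="auto")
reloaded_tokenizer = AutoTokenizer.from_pretrained("llama-2-7b-gptq")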
ExLlama-v2 support
------------------
ExLlama is a Python/C++/CUDA implementation of the Llama model that is
designed for faster inference with 4-bit GPTQ weights. The ExLlama
kernel is activated by default when users create a ``GPTQConfig`` object. To
boost inference speed even further on Instinct accelerators, use the ExLlama-v2
kernels by configuring the ``exllama_config`` parameter as follows.
.. code-block:: python
from transformers import AutoModelForCausalLM, GPTQConfig
base_model_name = "meta-llama/Llama-2-7b"
gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
quantized_model = AutoModelForCausalLM.from_pretrained(
base_model_name,
device_map="auto",
quantization_config=gptq_config)
bitsandbytes
============
The `ROCm-aware bitsandbytes <https://github.com/ROCm/bitsandbytes>`_ library is
a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizer, matrix multiplication, and
8-bit and 4-bit quantization functions. The library includes quantization primitives for 8-bit and 4-bit operations
through ``bitsandbytes.nn.Linear8bitLt`` and ``bitsandbytes.nn.Linear4bit`` and 8-bit optimizers through the
``bitsandbytes.optim`` module. These modules are supported on AMD Instinct accelerators.
Installing bitsandbytes
-----------------------
#. To install bitsandbytes for ROCm 6.0 (and later), use the following commands.
.. code-block:: shell
# Clone the github repo
git clone --recurse https://github.com/ROCm/bitsandbytes.git
cd bitsandbytes
git checkout rocm_enabled
# Install dependencies
pip install -r requirements-dev.txt
# Use -DBNB_ROCM_ARCH to specify target GPU arch
cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
# Install
python setup.py install
#. Run ``pip show bitsandbytes`` to show the information about the installed bitsandbytes package. Its output should
look like the following.
.. code-block:: shell
Name: bitsandbytes
Version: 0.44.0.dev0
...
Using bitsandbytes primitives
-----------------------------
To get started with bitsandbytes primitives, use the following code as reference.
.. code-block:: python
import bitsandbytes as bnb
# Use Int8 Matrix Multiplication
bnb.matmul(..., threshold=6.0)
# Use bitsandbytes 8-bit Optimizers
adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))
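As a slightly more complete illustration, the 8-bit optimizer can stand in for ``torch.optim.Adam`` in an ordinary training step. The following is a minimal sketch with a toy model.
.. code-block:: python
import torch
import bitsandbytes as bnb
# Toy model and data; Adam8bit is used as a drop-in replacement for torch.optim.Adam.
model = torch.nn.Linear(1024, 1024).cuda()
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3, betas=(0.9, 0.995))
x = torch.randn(16, 1024, device="cuda")
loss = model(x).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()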
Using bitsandbytes with Hugging Face Transformers
-------------------------------------------------
To load a Transformers model in 4-bit, set ``load_in_4bit=True`` in ``BitsAndBytesConfig``.
.. code-block:: python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
base_model_name = "NousResearch/Llama-2-7b-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
bnb_model_4bit = AutoModelForCausalLM.from_pretrained(
base_model_name,
device_map="auto",
quantization_config=quantization_config)
# Check the memory footprint with get_memory_footprint method
print(bnb_model_4bit.get_memory_footprint())
To load a model in 8-bit for inference, use the ``load_in_8bit`` option.
.. code-block:: python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
base_model_name = "NousResearch/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
bnb_model_8bit = AutoModelForCausalLM.from_pretrained(
base_model_name,
device_map="auto",
quantization_config=quantization_config)
prompt = "What is a large language model?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = bnb_model_8bit.generate(**inputs)
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

View File

@@ -0,0 +1,236 @@
.. meta::
:description: Model fine-tuning and inference on a multi-GPU system
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, multi-GPU, distributed, inference
*****************************************************
Fine-tuning and inference using multiple accelerators
*****************************************************
This section explains how to fine-tune a model on a multi-accelerator system. See
:doc:`Single-accelerator fine-tuning <single-gpu-fine-tuning-and-inference>` for a single accelerator or GPU setup.
.. _fine-tuning-llms-multi-gpu-env:
Environment setup
=================
This section was tested using the following hardware and software environment.
.. list-table::
:stub-columns: 1
* - Hardware
- 4 AMD Instinct MI300X accelerators
* - Software
- ROCm 6.1, Ubuntu 22.04, PyTorch 2.1.2, Python 3.10
* - Libraries
- ``transformers`` ``datasets`` ``accelerate`` ``huggingface-hub`` ``peft`` ``trl`` ``scipy``
* - Base model
- ``meta-llama/Llama-2-7b-chat-hf``
.. _fine-tuning-llms-multi-gpu-env-setup:
Setting up the base implementation environment
----------------------------------------------
#. Install PyTorch for ROCm. Refer to the
:doc:`PyTorch installation guide <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`. For consistent
installation, it's recommended to use official ROCm prebuilt Docker images with the framework pre-installed.
#. In the Docker container, check the availability of ROCm-capable accelerators using the following command.
.. code-block:: shell
rocm-smi --showproductname
#. Check that your accelerators are available to PyTorch.
.. code-block:: python
import torch
print("Is a ROCm-GPU detected? ", torch.cuda.is_available())
print("How many ROCm-GPUs are detected? ", torch.cuda.device_count())
If successful, your output should look like this:
.. code-block:: shell
>>> print("Is a ROCm-GPU detected? ", torch.cuda.is_available())
Is a ROCm-GPU detected? True
>>> print("How many ROCm-GPUs are detected? ", torch.cuda.device_count())
How many ROCm-GPUs are detected? 4
.. tip::
During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
This tool helps you see which accelerators or GPUs are involved.
.. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
Hugging Face Accelerate for fine-tuning and inference
===========================================================
`Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_ is a library that simplifies turning raw
PyTorch code for a single accelerator into code for multiple accelerators for LLM fine-tuning and inference. It is
integrated with `Transformers <https://huggingface.co/docs/transformers/en/index>`_ allowing you to scale your PyTorch
code while maintaining performance and flexibility.
As a brief example of model fine-tuning and inference using multiple GPUs, let's use Transformers and load in the Llama
2 7B model.
Here, let's reuse the code in :ref:`Single-accelerator fine-tuning <fine-tuning-llms-single-gpu-download-model-dataset>`
to load the base model and tokenizer.
Now, it's important to adjust how you load the model. Add the ``device_map`` parameter to your base model configuration.
.. code-block:: python
...
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load base model to GPU memory
base_model = AutoModelForCausalLM.from_pretrained(
base_model_name,
device_map = "auto"
trust_remote_code = True)
...
# Run training
sft_trainer.train()
.. note::
You can let Accelerate handle the device map computation by setting ``device_map`` to one of the supported options
(``"auto"``, ``"balanced"``, ``"balanced_low_0"``, ``"sequential"``).
It's recommended to set the ``device_map`` parameter to ``"auto"`` to allow Accelerate to automatically and
efficiently allocate the model given the available resources (4 accelerators in this case).
When you have more GPU memory available than the model size, here is the difference between each ``device_map``
option:
* ``"auto"`` and ``"balanced"`` evenly split the model on all available GPUs, making it possible for you to use a
batch size greater than 1.
* ``"balanced_low_0"`` evenly splits the model on all GPUs except the first
one, and only puts on GPU 0 what does not fit on the others. This
option is great when you need to use GPU 0 for some processing of the
outputs, like when using the generate function for Transformers
models.
* ``"sequential"`` will fit what it can on GPU 0, then move on GPU 1 and so forth. Not all GPUs might be used.
After loading the model in this way, the model is fully ready to use the resources available to it.
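To verify how Accelerate distributed the model, you can inspect the device map attached to the loaded model, as in the following brief sketch.
.. code-block:: python
# Inspect how Accelerate placed the model's modules across devices (sketch).
# `base_model` is the model loaded with device_map="auto" above.
print(base_model.hf_device_map)
# Prints a mapping of module names to device indices, for example {'model.embed_tokens': 0, ...}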
.. _fine-tuning-llms-multi-gpu-torchtune:
torchtune for fine-tuning and inference
=============================================
`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-accelerator or
GPU model fine-tuning and inference with LLMs.
#. Install torchtune using pip.
.. code-block:: shell
# Install torchtune with PyTorch release 2.2.2+
pip install torchtune
# To confirm that the package is installed correctly
tune --help
The output should look like this:
.. code-block:: shell
usage: tune [-h] {download,ls,cp,run,validate} ...
Welcome to the TorchTune CLI!
options:
-h, --help show this help message and exit
subcommands:
{download,ls,cp,run,validate}
#. torchtune recipes are designed around easily composable components and workable training loops, with minimal abstraction
getting in the way of fine-tuning. Run ``tune ls`` to show built-in torchtune configuration recipes.
.. code-block:: shell
RECIPE CONFIG
full_finetune_single_device llama2/7B_full_low_memory
llama3/8B_full_single_device
mistral/7B_full_low_memory
full_finetune_distributed llama2/7B_full
llama2/13B_full
llama3/8B_full
mistral/7B_full
gemma/2B_full
lora_finetune_single_device llama2/7B_lora_single_device
llama2/7B_qlora_single_device
llama3/8B_lora_single_device
llama3/8B_qlora_single_device
llama2/13B_qlora_single_device
mistral/7B_lora_single_device
The ``RECIPE`` column shows the easy-to-use and workable fine-tuning and inference recipes for popular fine-tuning
techniques (such as LoRA). The ``CONFIG`` column lists the YAML configurations for easily configuring training,
evaluation, quantization, or inference recipes.
The following snippet shows the structure of a model's YAML configuration file:
.. code-block:: yaml
# Model arguments
model:
_component_: torchtune.models.llama2.lora_llama2_7b
lora_attn_modules: ['q_proj', 'v_proj']
apply_lora_to_mlp: False
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/Llama-2-7b-hf/tokenizer.model
# Dataset and sampler
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
train_on_input: True
#. This configuration file defines the fine-tuning base model path, data set, hyper-parameters for optimizer and scheduler,
and training data type. To download the base model for fine-tuning, run the following command:
.. code-block:: shell
tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --hf-token
The output directory argument for ``--output-dir`` should match the model path specified in the YAML config file.
#. To launch ``lora_finetune_distributed`` on four devices, run the following
command:
.. code-block:: shell
tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config llama2/7B_lora
If successful, you should see something like the following output:
.. code-block:: shell
INFO:torchtune.utils.logging:FSDP is enabled. Instantiating Model on CPU for Rank 0 ...
INFO:torchtune.utils.logging:Model instantiation took 7.32 secs
INFO:torchtune.utils.logging:Memory Stats after model init:
{'peak_memory_active': 9.478172672, 'peak_memory_alloc': 8.953868288, 'peak_memory_reserved': 11.112808448}
INFO:torchtune.utils.logging:Optimizer and loss are initialized.
INFO:torchtune.utils.logging:Dataset and Sampler are initialized.
INFO:torchtune.utils.logging:Learning rate scheduler is initialized.
1|111|Loss: 1.5790324211120605: 7%|| 114/1618
Read more about inference frameworks in :doc:`LLM inference frameworks <llm-inference-frameworks>`.

View File

@@ -0,0 +1,383 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, Triton, kernel, performance, optimization
*************************
Optimizing Triton kernels
*************************
This section introduces the general steps for `Triton <https://openai.com/index/triton/>`_ kernel optimization. Broadly,
Triton kernel optimization is similar to HIP and CUDA kernel optimization.
.. _fine-tuning-llms-triton-memory-access-efficiency:
Memory access efficiency
========================
The accelerator or GPU contains global memory, local data share (LDS), and registers. Global memory has high access
latency, but is large. LDS access has much lower latency, but is smaller. Register access is the fastest yet smallest
among the three.
Data in global memory should therefore be loaded and stored as few times as possible. If different threads in a
workgroup need to access the same data, that data should first be transferred from global memory to LDS and then
accessed by the individual threads from there.
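For reference, the following minimal Triton kernel sketch shows the basic pattern of masked, contiguous loads from and stores to global memory that the rest of this section builds on; it is illustrative only.
.. code-block:: python
import torch
import triton
import triton.language as tl
@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance handles one contiguous block of elements.
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)        # global memory -> registers
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)  # registers -> global memory
x = torch.rand(4096, device="cuda")
y = torch.rand(4096, device="cuda")
out = torch.empty_like(x)
grid = lambda meta: (triton.cdiv(x.numel(), meta["BLOCK_SIZE"]),)
add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=1024)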
.. _fine-tuning-llms-triton-hardware-resource-utilization:
Hardware resource utilization
=============================
Each accelerator or GPU has multiple Compute Units (CUs), and the CUs do computation in parallel. So, to how many CUs
can a compute kernel distribute its work? For the :doc:`AMD MI300X accelerator <../../reference/gpu-arch-specs>`, the
grid should have at least 1024 thread blocks or workgroups.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/compute-unit.png
Schematic representation of a CU in the CDNA2 or CDNA3 architecture.
To increase hardware utilization and maximize parallelism, it is necessary to design algorithms that can exploit more
parallelism. One approach to achieving this is by using larger split-K techniques for General Matrix Multiply (GEMM)
operations, which can further distribute the computation across more CUs, thereby enhancing performance.
.. tip::
You can query hardware resources with the command ``rocminfo`` (in the ``/opt/rocm/bin`` directory). For instance,
query the number of CUs, number of SIMD, and wavefront size using the following commands.
.. code-block:: shell
rocminfo | grep "Compute Unit"
rocminfo | grep "SIMD"
rocminfo | grep "Wavefront Size"
On an MI300X device, there are 304 CUs, 4 SIMD per CU, and the wavefront size (warp size) is 64. See :doc:`Hardware
specifications <../../reference/gpu-arch-specs>` for a full list of AMD accelerators and GPUs.
.. _fine-tuning-llms-triton-ir-analysis:
IR analysis
===========
In Triton, there are several layouts including *blocked*, *shared*, *sliced*, and *MFMA*.
From the Triton GPU IR (intermediate representation), you can determine in which memory space each computation is
performed. The following is a snippet of IR from the Flash Attention decode ``int4`` key-value program. It
dequantizes the key-value data from the ``int4`` data type to ``fp16``.
.. code-block::
%190 = tt.load %189 {cache = 1 : i32, evict = 1 : i32, isVolatile =
false} : tensor<1x64xi32, #blocked6> loc(#loc159)
%266 = arith.andi %190, %cst_28 : tensor<1x64xi32, #blocked6>
loc(#loc250)
%267 = arith.trunci %266 : tensor<1x64xi32, #blocked6> to
tensor<1x64xi16, #blocked6> loc(#loc251)
%268 = tt.bitcast %267 : tensor<1x64xi16, #blocked6> -> tensor<1x64xf16,
#blocked6> loc(#loc252)
%269 = triton_gpu.convert_layout %268 : (tensor<1x64xf16, #blocked6>) ->
tensor<1x64xf16, #shared1> loc(#loc252)
%270 = tt.trans %269 : (tensor<1x64xf16, #shared1>) -> tensor<64x1xf16,
#shared2> loc(#loc194)
%276 = triton_gpu.convert_layout %270 : (tensor<64x1xf16, #shared2>) ->
tensor<64x1xf16, #blocked5> loc(#loc254)
%293 = arith.mulf %276, %cst_30 : tensor<64x1xf16, #blocked5>
loc(#loc254)
%295 = arith.mulf %292, %294 : tensor<64x32xf16, #blocked5> loc(#loc264)
%297 = arith.addf %295, %296 : tensor<64x32xf16, #blocked5> loc(#loc255)
%298 = triton_gpu.convert_layout %297 : (tensor<64x32xf16, #blocked5>)
-> tensor<64x32xf16, #shared1> loc(#loc255)
%299 = tt.trans %298 : (tensor<64x32xf16, #shared1>) ->
tensor<32x64xf16, #shared2> loc(#loc196)
%300 = triton_gpu.convert_layout %299 : (tensor<32x64xf16, #shared2>) ->
tensor<32x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth
= 4}>> loc(#loc197)
From the IR, you can see that ``i32`` data is loaded from global memory to registers. After a few element-wise operations in
registers, it is stored in shared memory for the transpose operation, which requires data movement across different
threads. Once the transpose is done, the data is loaded from LDS into registers again, and after a few more element-wise
operations, it is stored in LDS once more. The last step loads from LDS to registers and converts to the dot-operand layout.
In total, the IR uses LDS twice: once for the transpose, and once to convert the blocked layout
to a dot-operand layout.
Assembly analysis
=================
In the ISA, ensure ``global_load_dwordx4`` is used, especially when the
load happens in a loop.
In most cases, the LDS load and store should use ``_b128`` as well to
minimize the number of LDS access instructions. Note that the upstream compiler backend might not emit ``_b128`` LDS reads and
writes, in which case it uses ``_b64``. In most cases, whether you use the fork or upstream,
the LDS accesses should have at least a ``_b64`` vector width.
The AMD ISA has the ``s_waitcnt`` instruction to synchronize the dependency
of memory access and computations. The ``s_waitcnt`` instruction can
have two signals, typically in the context of Triton:
* ``lgkmcnt(n):`` `lgkm` stands for LDS, GDS, Constant, and Message.
In this context, it is most often related to LDS access. The number ``n`` is the number of such accesses that may still be
outstanding before execution continues. For example, 0 means all ``lgkm`` accesses must finish before continuing, and 1 means
only 1 ``lgkm`` access can still be running asynchronously before proceeding.
* ``vmcnt(n):`` `vm` means vector memory.
This counter applies when vector memory is accessed, for example, when a global load moves data from global memory to vector memory.
Again, the number ``n`` is the number of accesses that may still be outstanding before execution continues.
Generally recommended guidelines are as follows.
* Vectorize memory access as much as possible.
* Ensure synchronization is done efficiently.
* Overlap instructions to hide latency; this requires thoughtful
analysis of the algorithms.
* If you find inefficiencies, you can trace them back to LLVM IR, TTGIR,
and even TTIR to see where the problem comes from. If the problem is introduced
during compiler optimization, activate the MLIR dump and check which
optimization pass caused it.
.. _fine-tuning-llms-triton-kernel-occupancy:
Kernel occupancy
================
1. Get the VGPR count: search for ``.vgpr_count`` in the ISA (call this value ``N``).
2. Get the LDS allocated for the kernel using the following steps (call this value ``L``).
a. ``export MLIR_ENABLE_DUMP=1``
b. ``rm -rf ~/.triton/cache``
c. ``python kernel.py | grep "triton_gpu.shared = " | tail -n 1``
d. You should see something like ``triton_gpu.shared = 65536``, indicating 65536 bytes of LDS are allocated for the
kernel.
3. Get the number of waves per workgroup using the following steps (call this value ``nW``).
a. ``export MLIR_ENABLE_DUMP=1``
b. ``rm -rf ~/.triton/cache``
c. ``python kernel.py | grep "triton_gpu.num-warps " | tail -n 1``
d. You should see something like ``"triton_gpu.num-warps" = 8``, indicating 8 waves per workgroup.
4. Compute the occupancy limited by VGPR usage based on ``N``, according to the following table. Call the resulting waves per EU
``occ_vgpr``.
.. _fine-tuning-llms-occupancy-vgpr-table:
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/occupancy-vgpr.png
:alt: Occupancy related to VGPR usage in an Instinct MI300X accelerator.
:align: center
5. Compute the occupancy limited by LDS usage based on ``L``: ``occ_lds = floor(65536 / L)``.
6. Then the occupancy is ``occ = min(floor(occ_vgpr * 4 / nW), occ_lds) * nW / 4``. A small Python helper that follows these
steps is sketched after this list.
a. ``occ_vgpr * 4`` gives the total number of waves on all 4 execution units (SIMDs)
per CU.
b. ``floor(occ_vgpr * 4 / nW)`` gives the occupancy of workgroups per CU
regarding VGPR usage.
c. The true ``occ`` is the minimum of the two.
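The following is a small helper that encodes the calculation above; it is a sketch, and ``occ_vgpr`` must still be read from the VGPR table.
.. code-block:: python
def kernel_occupancy(occ_vgpr: int, lds_bytes: int, waves_per_wg: int) -> float:
    """Waves per EU following the steps above.
    occ_vgpr: waves per EU allowed by VGPR usage (from the table)
    lds_bytes: LDS allocated per workgroup (L)
    waves_per_wg: waves per workgroup (nW)
    """
    occ_lds = 65536 // lds_bytes                           # workgroups per CU limited by LDS
    occ_wg = min((occ_vgpr * 4) // waves_per_wg, occ_lds)  # workgroups per CU
    return occ_wg * waves_per_wg / 4                       # waves per EU
# Example: occ_vgpr = 2, 65536 bytes of LDS, 8 waves per workgroup
print(kernel_occupancy(2, 65536, 8))  # 2.0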
.. _fine-tuning-llms-triton-kernel-configs-env-vars:
Auto-tunable kernel configurations and environment variables
============================================================
This section relates to the amount of :ref:`memory access <fine-tuning-llms-triton-memory-access-efficiency>` and
computation assigned to each CU. It is related to the usage of LDS, registers and the scheduling of different tasks on
a CU.
The following is a list of kernel arguments used for tuning.
``num_stages=n``
Adjusts the number of pipeline stages for different types of kernels. On AMD accelerators, set ``num_stages``
according to the following rules:
* For kernels with a single GEMM, set to ``0``.
* For kernels with two GEMMs fused (Flash Attention, or any other kernel
that fuses 2 GEMMs), set to ``1``.
* For kernels that fuse a single GEMM with another non-GEMM operator
(for example ReLU activation), set to ``0``.
* For kernels that have no GEMMs, set to ``1``.
``waves_per_eu=n``
Helps to manage Vector General Purpose Registers (VGPR) usage to achieve desired occupancy levels. This argument
hints to the compiler to reduce VGPR to achieve ``n`` occupancy. See
:ref:`Kernel occupancy <fine-tuning-llms-triton-kernel-occupancy>` for more information about how to compute
occupancy.
This argument is useful if:
* The occupancy of the kernel is limited by VGPR usage.
* The current VGPR usage is only a few above a boundary in
:ref:`Occupancy related to VGPR usage in an Instinct MI300X accelerator <fine-tuning-llms-occupancy-vgpr-table>`.
For example, according to the table, the available VGPR is 512 per Execution Unit (EU), and VGPR is allocated in
units of 16. If the current VGPR usage is 170, the actual requested VGPR will be 176, so the
occupancy is only 2 waves per EU since :math:`176 \times 3 > 512`. If you set
``waves_per_eu`` to 3, the LLVM backend tries to bring VGPR usage down so
that it might fit 3 waves per EU.
``BLOCK_M``, ``BLOCK_N``, ``BLOCK_K``
Tile sizes to be tuned to balance the memory-to-computation ratio. You want tiles large enough to
maximize compute efficiency per memory access, but small enough to parallelize the greatest number of
workgroups at the grid level.
``matrix_instr_nonkdim``
Experimental feature for Flash Attention-like kernels that determines the size of the Matrix Fused Multiply-Add
(MFMA) instruction used.
- ``matrix_instr_nonkdim = 16``: ``mfma_16x16`` is used.
- ``matrix_instr_nonkdim = 32``: ``mfma_32x32`` is used.
For GEMM kernels on an AMD MI300X accelerator, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
tile/GEMM sizes. A short configuration sketch using these kernel arguments follows below.
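The following is a minimal sketch (not from the original document) of how these arguments are commonly attached to a
Triton GEMM kernel through ``triton.autotune``. The block sizes and values are illustrative only, and whether
``waves_per_eu`` and ``matrix_instr_nonkdim`` are accepted inside the configuration dictionary depends on your
Triton and ROCm versions.

.. code-block:: python

   import torch
   import triton
   import triton.language as tl

   @triton.autotune(
       configs=[
           # Single GEMM: num_stages=0, mfma_16x16, modest VGPR pressure.
           triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64,
                          "waves_per_eu": 2, "matrix_instr_nonkdim": 16},
                         num_warps=8, num_stages=0),
           triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64,
                          "waves_per_eu": 3, "matrix_instr_nonkdim": 16},
                         num_warps=4, num_stages=0),
       ],
       key=["M", "N", "K"],
   )
   @triton.jit
   def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                     stride_am, stride_ak, stride_bk, stride_bn,
                     stride_cm, stride_cn,
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                     BLOCK_K: tl.constexpr):
       # Each program computes one BLOCK_M x BLOCK_N tile of C = A @ B.
       # For brevity, M, N, and K are assumed to be multiples of the block sizes.
       pid_m = tl.program_id(0)
       pid_n = tl.program_id(1)
       offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
       offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
       offs_k = tl.arange(0, BLOCK_K)
       acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
       for k in range(0, K, BLOCK_K):
           a = tl.load(a_ptr + offs_m[:, None] * stride_am + (k + offs_k)[None, :] * stride_ak)
           b = tl.load(b_ptr + (k + offs_k)[:, None] * stride_bk + offs_n[None, :] * stride_bn)
           acc += tl.dot(a, b)
       tl.store(c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn, acc)

   def matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
       M, K = a.shape
       _, N = b.shape
       c = torch.empty((M, N), device=a.device, dtype=torch.float32)
       grid = lambda meta: (triton.cdiv(M, meta["BLOCK_M"]), triton.cdiv(N, meta["BLOCK_N"]))
       matmul_kernel[grid](a, b, c, M, N, K,
                           a.stride(0), a.stride(1), b.stride(0), b.stride(1),
                           c.stride(0), c.stride(1))
       return c

At launch, the autotuner benchmarks each configuration for a new ``(M, N, K)`` key and caches the fastest one.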
The following is an environment variable used for tuning.
``OPTIMIZE_EPILOGUE``
Setting this variable to ``1`` can improve performance by removing the ``convert_layout`` operation in the epilogue.
It should be turned on (set to ``1``) in most cases. Setting ``OPTIMIZE_EPILOGUE=1`` stores the MFMA instruction
results in the MFMA layout directly; this comes at the cost of reduced global store efficiency, but the impact on
kernel execution time is usually minimal.
By default (``0``), the results of the MFMA instructions are converted to a blocked layout, which leads to ``global_store``
instructions with the maximum vector length, that is, ``global_store_dwordx4``.
This is done implicitly with LDS as the intermediate buffer to achieve
data exchange between threads. Padding is used in LDS to avoid bank
conflicts. This usually leads to extra LDS usage, which might reduce
occupancy.
.. note::
This variable is not turned on by default because it only
works with ``tt.store`` but not ``tt.atomic_add``, which is used in split-k and
stream-k GEMM kernels. In the future, it might be enabled with
``tt.atomic_add`` and turned on by default.
See :ref:`IR analysis <fine-tuning-llms-triton-ir-analysis>`.
TorchInductor with Triton tuning knobs
===========================================
The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (``conv``) operations in PyTorch
using ``inductor``, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better
performance.
Learn more about TorchInductor environment variables and usage in
`PyTorch documentation <https://pytorch.org/docs/2.3/torch.compiler_inductor_profiling.html>`_.
To enable lowering of ``gemm``/``conv`` operations to Triton, you must use ``inductor``'s ``max_autotune`` mode. This benchmarks a
static list of Triton configurations (``conv`` configurations for max auto-tune and ``matmul`` configurations for max
auto-tune) and uses the fastest for each shape. Note that Triton is not used if regular :doc:`MIOpen <miopen:index>`
or :doc:`rocBLAS <rocblas:index>` is faster for a specific operation. A short configuration sketch follows the list below.
* Set ``torch._inductor.config.max_autotune = True`` or ``TORCHINDUCTOR_MAX_AUTOTUNE=1``.
* Or, for more fine-grained control:
``torch._inductor.config.max_autotune_pointwise = True``
To enable tuning for ``pointwise``/``reduction`` ops.
``torch._inductor.config.max_autotune_gemm = True``
To enable tuning or lowering of ``mm``/``conv``\s.
``torch._inductor.config.max_autotune_gemm_backends`` / ``TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS``
To select the candidate backends for ``mm`` auto-tuning. Defaults to
``TRITON,ATEN,NV``. This also includes the ``CUTLASS`` tuning option. Limiting this to
``TRITON`` might improve performance by enabling more fused ``mm`` kernels
instead of going to rocBLAS.
* For ``mm`` tuning, enabling ``coordinate_descent`` tuning might improve performance.
``torch._inductor.config.coordinate_descent_tuning = True`` or ``TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1``
* Inference can see large improvements on AMD GPUs by utilizing
``torch._inductor.config.freezing=True`` or the ``TORCHINDUCTOR_FREEZING=1`` variable, which
in-lines weights as constants and enables constant folding optimizations.
* Enabling ``inductor``'s ``cpp_wrapper`` might reduce overhead. This generates
C++ code that launches Triton binaries directly with
``hipModuleLaunchKernel`` and relies on hipification.
* For NHWC convolution workloads,
``torch._inductor.config.layout_optimization=True`` or ``TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1``
can help by enforcing the ``channels_last`` format throughout the graph, avoiding
any additional transposes added by ``inductor``. Note that
``PYTORCH_MIOPEN_SUGGEST_NHWC=1`` is recommended if using this.
* To extract the generated Triton kernels, set ``TORCH_COMPILE_DEBUG=1``. This creates a
``torch_compile_debug/`` directory in the current path; the ``output_code.py`` file inside
contains the code strings for the generated Triton kernels. Manual work is
then required to strip out a kernel and set up its
compilation and launch via Triton.
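The following is a minimal configuration sketch, assuming PyTorch 2.x with the ``inductor`` backend; the model and
shapes are placeholders, not from the original document. It sets the knobs described above in-process before
compiling for inference.

.. code-block:: python

   import torch
   import torch._inductor.config as inductor_config

   # Enable Triton-based autotuning for GEMMs/convs and the related knobs discussed above.
   inductor_config.max_autotune = True
   inductor_config.max_autotune_gemm_backends = "TRITON"  # restrict mm tuning to Triton
   inductor_config.coordinate_descent_tuning = True
   inductor_config.freezing = True  # inference only: inline weights, enable constant folding

   model = torch.nn.Sequential(
       torch.nn.Linear(4096, 4096), torch.nn.ReLU(), torch.nn.Linear(4096, 4096)
   ).cuda().half().eval()

   compiled = torch.compile(model, mode="max-autotune")
   with torch.no_grad():
       out = compiled(torch.randn(8, 4096, device="cuda", dtype=torch.float16))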
Other guidelines
================
* HIP provides an environment variable, ``export HIP_FORCE_DEV_KERNARG=1``,
that places HIP kernel arguments directly in
device memory to reduce the latency of accessing kernel arguments. It
can shave 2 to 3 μs off some kernels. Setting this variable for the FA (Flash Attention)
decode ``splitK`` and reduce kernels can reduce the total time
by around 6 μs in the benchmark test.
* Set the clock to deterministic. Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed to
1900 MHz instead of the default 2100 MHz. Setting a lower cap reduces the chance of clock-speed drops caused by high chip
temperature. You can restore this setting to its default value with ``rocm-smi -r``.
* Set Non-Uniform Memory Access (NUMA) auto-balancing. Run the command ``cat /proc/sys/kernel/numa_balancing`` to check the
current setting. An output of ``0`` indicates auto-balancing is already disabled, which is the desired setting. If the output is
``1``, run ``sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'`` to disable it.
For these settings, the ``env_check.sh`` script automates the setting, resetting, and checking of such
environment settings. Find the script at `<https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh>`__.
.. _fine-tuning-llms-triton-tunableop:
TunableOp
---------
`TunableOp <https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/tunable/README.md>`_
is a feature used to define and optimize kernels that can have tunable parameters. This is useful in
optimizing the performance of custom kernels by exploring different parameter configurations to find the most efficient
setup. See more about PyTorch TunableOp :ref:`Model acceleration libraries <fine-tuning-llms-pytorch-tunableop>`.
You can easily manipulate the behavior of TunableOp through environment variables, though you could also use the C++ interface
``at::cuda::tunable::getTuningContext()``. A Python interface to the ``TuningContext`` does not yet exist.
The default tuning duration is ``0``, which means only one iteration is attempted per solution. Remember: there's an overhead to tuning. To
minimize that overhead, only a limited number of iterations of a given operation are attempted. If you set the duration to
``10``, each solution for a given operation can run as many iterations as fit within 10 ms. There is a hard-coded
upper limit of 100 iterations attempted per solution. This is a tuning parameter; if you want the tunings to be chosen
based on an average over multiple iterations, increase the allowed tuning duration.
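For example, the following sketch enables TunableOp and raises the per-solution tuning budget before running a GEMM.
This is a hedged example: the environment variable names come from the TunableOp README linked above, and setting them
in-process must happen before the first tuned operation; verify the names against your installed PyTorch version.

.. code-block:: python

   import os

   os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"   # turn TunableOp on
   os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"    # tune solutions for unseen GEMM shapes
   os.environ["PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS"] = "10"  # per-solution tuning budget

   import torch

   a = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
   b = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
   c = a @ b  # this GEMM is benchmarked across available solutions and the best one is cached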


@@ -0,0 +1,484 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="SmoothQuant model inference on AMD Instinct MI300X using Composable Kernel">
<meta name="keywords" content="Mixed Precision, Kernel, Inference, Linear Algebra">
</head>
# Optimizing with Composable Kernel
The AMD ROCm&trade; Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.
This article gives a high-level overview of the CK General Matrix Multiplication (GEMM) kernel based on the design example `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X accelerators using CK.
## High-level overview: a CK GEMM instance
GEMM is a fundamental block in linear algebra, machine learning, and deep neural networks. It is defined as the operation:
{math}`E = α \times (A \times B) + β \times (D)`, with A and B as matrix inputs, α and β as scalar inputs, and D as a pre-existing matrix.
Take the commonly used linear transformation in a fully connected layer as an example. These terms correspond to input activation (A), weight (B), bias (D), and output (E), respectively. The example employs a `DeviceGemmMultipleD_Xdl_CShuffle` struct from CK library as the fundamental instance to explore the compute capability of AMD Instinct accelerators for the computation of GEMM. The implementation of the instance contains two phases:
- [Template parameter definition](#template-parameter-definition)
- [Instantiating and running the templated kernel](#instantiating-and-running-the-templated-kernel)
### Template parameter definition
The template parameters of the instance are grouped into four parameter types:
- [Parameters for determining matrix data precision](matrix-data-precision)
- [Parameters for determining matrix data layout](matrix-data-layout)
- [Parameters for determining extra operations on matrix elements](matrix-element-operation)
- [Performance-oriented tunable parameters](tunable-parameters)
<!--
================
### Figure 2
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-template_parameters.jpg
The template parameters of the selected GEMM kernel are classified into four groups. These template parameter groups should be defined properly before running the instance.
```
(matrix-data-precision)=
#### Matrix data precision
A, B, D, and E are defined as half-precision floating-point datatypes. The multiply-add results of matrix A and B are added with a pre-existing matrix D (half-precision), and the final GEMM results are also half-precision floating-points.
```c++
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F16;
using DDataType = F16;
using EDataType = F16;
```
`ADataType` and `BDataType` denote the data precision of the A and B input matrices. `AccDataType` determines the data precision used for representing the multiply-add results of A and B elements. These results are stored for later use in a `CShuffle` module in the local data share (LDS), a low-latency, high-bandwidth, explicitly addressed memory used for synchronization within a workgroup.
`CShuffleDataType` denotes the data precision of `CShuffle` in LDS.
`DDataType` denotes the data precision of the pre-existing D matrix stored in GPU global memory, while `EDatatype` denotes the data precision of the final output. The CK kernel supports a fusion strategy so that `CShuffle` can be added with a single pre-existing matrix in the same GPU kernel for better performance.
(matrix-data-layout)=
#### Matrix data layout
```c++
using ALayout = Row;
using BLayout = Col;
using DLayout = Row;
using ELayout = Row;
```
Following the convention of various linear algebra libraries, CK assumes that the input matrix A is an M x K matrix, meaning the matrix has M rows and K columns. Similarly, matrix B is assumed to be K x N, meaning it has K rows and N columns. In computing, row-major order and column-major order are commonly used ways to store matrices in linear storage. After understanding the matrix storage pattern, the underlying optimized memory access manner can be applied to achieve better performance depending on the storage ordering of these matrices.
(matrix-element-operation)=
#### Matrix element operation
```c++
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddRelu;
```
CK supports the pre-processing of the matrices before calculating GEMM, that is, `C = AElementOp(A) * BElementOp(B)`. It also supports the post-processing of the GEMM results, that is, `E = CDEElementOp(C, D)`.
`AElementOp` and `BElementOp` determine the operation applied to matrix A and B separately before GEMM, which is achieved by binding the operation with a C++ struct function.
The above `PassThrough` denotes that no operations are performed on the target matrix. `CDEElementOp` determines the operations applied to the `CShuffle` output and matrix D. The following binding struct, `AddRelu`, shows an example of adding the `CShuffle` output and matrix D, applying a ReLU (Rectified Linear Unit) operation to the sum, and passing the result to matrix E.
```c++
struct AddRelu
{
__host__ __device__ void operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d) const
{
const ck::half_t x = c + d;
e = x > 0 ? x : 0;
}
};
```
(tunable-parameters)=
#### Tunable parameters
The CK instance includes a series of tunable template parameters to control the parallel granularity of the workload to achieve load balancing on different hardware platforms.
These parameters include Block Size, M/N/K Per Block, M/N per XDL, AK1, BK1, etc.
- Block Size determines the number of threads in the thread block.
- M/N/K Per Block determines the size of tile that each thread block is responsible for calculating.
- M/N Per XDL refers to M/N size for Instinct accelerator Matrix Fused Multiply Add (MFMA) instructions operating on a per-wavefront basis.
- A/B K1 is related to the data type. It can be any value ranging from 1 to K Per Block. To achieve optimal load/store performance, 128-bit per load is suggested. In addition, the A/B loading parameters must be changed accordingly to match the A/B K1 value; otherwise, it results in compilation errors.
Conditions for achieving computational load balancing on different hardware platforms can vary.
### Instantiating and running the templated kernel
After determining the template parameters, we instantiate the kernel with actual arguments. Do one of the following:
- Use `GetDeviceBuffer` from CK's custom struct `DeviceMem` to pass the element values of the matrices that need to be calculated.
- Allocate device buffer via `hipMalloc`. Ensure the device buffer size can fit the matrix size.
- Pass matrix elements through the `data_ptr` method in the `Tensor` object if the matrix to be calculated is of `Tensor` type.
The row, column, and stride information of the input matrices is also passed to the instance. For batched GEMM, you must pass in additional batch count and batch stride values. The extra operations for pre- and post-processing are also passed as actual arguments; for example, α and β for GEMM scaling operations. Afterward, the instantiated kernel is launched by the invoker, as illustrated in Figure 3.
<!--
================
### Figure 3
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-kernel_launch.jpg
Templated kernel launching consists of kernel instantiation, making arguments by passing in actual application parameters, creating an invoker, and running the instance through the invoker.
```
## Developing fused INT8 kernels for SmoothQuant models
[SmoothQuant](https://github.com/mit-han-lab/smoothquant) (SQ) is a quantization algorithm that enables an INT8 quantization of both weights and activations for all the matrix multiplications in LLM. The required GPU kernel functionalities used to accelerate the inference of SQ models on Instinct accelerators are shown in the following table.
:::{table} Functionalities used to implement SmoothQuant model inference.
| Functionality descriptions | Corresponding wrappers |
|:-------------------------------------|-----------------------------------------|
| {math}`E = α \times (A \times B) + β \times (D)`, where A, B, D, E are INT8 2-D tensors; | E = Linear_ABDE_I8(A, B, D, {math}`\alpha`, {math}`\beta`) |
| {math}`E = RELU (α \times (A \times B) + β \times (D))`, where A, B, D, E are INT8 2-D tensors; | E = Linear_ReLU_ABDE_I8(A, B, D, {math}`\alpha`, {math}`\beta`) |
| {math}`E = α \times (A \times B) + β \times (D)`, where A, B are INT8 2-D tensors, D and E are FP32 2-D tensors; | E = Linear_AB_I8_DE_F32(A, B, D, {math}`\alpha`, {math}`\beta`) |
| {math}`E = α \times (A \times B)`, where A, B, E are INT8 3-D tensors; | E = BMM_ABE_I8(A, B, {math}`\alpha`) |
| {math}`E = α \times (A \times B)`, where A, B are INT8 3-D tensors, E is FP32 3-D tensor; | E = BMM_AB_I8_E_F32(A, B, {math}`\alpha`) |
:::
### Operation flow analysis
The following section discusses the analysis of the operation flow of `Linear_ReLU_ABDE_I8`. The rest of the wrappers in Table 1 can be analyzed similarly.
The first operation in the process is to perform the multiplication of input matrices A and B. The resulting matrix C is then scaled with α to obtain T1. At the same time, the process performs a scaling operation on D elements to obtain T2. Afterward, the process performs matrix addition between T1 and T2, element activation calculation using ReLU, and element rounding sequentially. The operations to generate E1, E2, and E are encapsulated and completed by a user-defined template function in CK (given in the next sub-section). This template function is integrated into the fundamental instance directly during the compilation phase so that all these steps can be fused in a single GPU kernel.
<!--
================
### Figure 4
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-operation_flow.jpg
Operation flow.
```
The CK library contains many fundamental instances that implement different functions. First, familiarize yourself with the names of the various CK instances and determine whether they meet the target functional requirements.
Second, consider whether the format of input data meets your actual calculation needs. For SQ models, the 8-bit integer data format (INT8) is applied for matrix calculations.
Third, consider the platform for implementing CK instances. The instances suffixed with `xdl` only run on AMD Instinct accelerators after being compiled and cannot run on Radeon-series GPUs. This is due to the underlying device-specific instruction sets for implementing these basic instances.
Here, we use [DeviceBatchedGemmMultiD_Xdl](https://github.com/ROCm/composable_kernel/tree/develop/example/24_batched_gemm) as the fundamental instance to implement the functionalities in the previous table.
<!--
================
### Figure 5
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-root_instance.jpg
Use the DeviceBatchedGemmMultiD_Xdl instance as a root.
```
The `DeviceBatchedGemmMultiD_Xdl` instance realizes the batched GEMM `BMM_ABE_I8` and `BMM_AB_I8_E_F32` kernels directly by using the proper input and output data precision types.
Based on the two batched GEMM kernels, the GEMM kernels `Linear_ABDE_I8` and `Linear_AB_I8_DE_F32` can be implemented by expanding their input 2-D tensors to 3-D tensors. The 3-D output tensors produced by the root instance are then squeezed back to 2-D output tensors before returning.
For example, unsqueeze A (M, K) to A (1, M, K) before passing it to the root instance, and squeeze E (1, M, N) to (M, N) after the root instance's calculation returns. `Linear_ReLU_ABDE_I8` is implemented by adding a ReLU operation on the output of `Linear_ABDE_I8`.
### Developing the complete function
Because the inference of SQ quantized models relies on the PyTorch and Transformers libraries, and a tensor type is used to represent matrices and vectors in `torch`, the C++ data types in CK need to be replaced with the `torch::Tensor` type. The data types of the input and output matrices should be a `tensor` type.
In GEMM, the A and B inputs are two-dimensional matrices, while the required input matrices of the selected fundamental CK instance are three-dimensional. Therefore, we must convert the input 2-D tensors to 3-D tensors by using `tensor`'s `unsqueeze()` method before passing these matrices to the instance. For batched GEMM in the preceding table, ignore this step.
```c++
// Function input and output
torch::Tensor linear_relu_abde_i8(
torch::Tensor A_,
torch::Tensor B_,
torch::Tensor D_,
float alpha,
float beta)
{
// Convert torch::Tensor A_ (M, K) to torch::Tensor A (1, M, K)
auto A = A_.unsqueeze(0);
// Convert torch::Tensor B_ (K, N) to torch::Tensor A (1, K, N)
auto B = B_.unsqueeze(0);
...
```
As shown in the following code block, we obtain the M, N, and K values from the input tensor sizes. These size values are used to initialize the stride information, reshape the input vector D, and allocate the storage space of tensor E. Stride describes the spacing, in elements, between consecutive rows or batches in memory, and these values are passed as important parameters to the fundamental instance for GPU kernel use.
```c++
// Return the batch count from the size of dimension 0
int batch_count = A.size(0);
// Return the M, N, K from the size of dimension 1 & 2
int M = A.size(1);
int N = B.size(1);
int K = A.size(2);
// Initialize the stride size for A, B, D and E
int stride_A = K;
int stride_B = K;
int stride_D0 = N;
int stride_E = N;
// Initialize the stride size for batched A, B, D and E
long long int batch_stride_A = M * K;
long long int batch_stride_B = K * N;
long long int batch_stride_D0 = M * N;
long long int batch_stride_E = M * N;
// Convert the tensor of 2-D to 3-D
auto D = D_.view({1,-1}).repeat({M, 1});
// Allocate memory for E
auto E = torch::empty({batch_count, M, N},
torch::dtype(torch::kInt8).device(A.device()));
```
In the following code block, `ADataType`, `BDataType` and `D0DataType` are used to denote the data precision of the input tensors A, B and D, respectively. `EDataType` is used to denote the data precision of output tensor E. These parameters are specified to `I8` data format (8-bit integer data format) to meet the kernel's design requirements.
`AccDataType` determines the data precision used to represent the multiply-add results of A and B elements. Generally, a larger range data type is applied to store the multiply-add results of A and B to avoid result overflow; `I32` is applied in this case. The `CShuffleDataType I32` data type indicates that the multiply-add results continue to be stored in LDS as an `I32` data format. All of this is implemented through the following code block.
```c++
// Data precision
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using D0DataType = I8;
using DsDataType = ck::Tuple<D0DataType>;
using EDataType = I8;
```
Following the convention of various linear algebra libraries, row-major and column-major orders are used to denote the ways of storing matrices in linear storage. The advantage of specifying matrix B as column major is that all the relevant matrix elements are stored continuously in GPU global memory when a row in A is multiplied by a column in B, which can help GPU achieve data consistency access to improve access performance.
```c++
// Specify tensor order
using ALayout = RowMajor;
using BLayout = ColumnMajor;
using D0Layout = RowMajor;
using DsLayout = ck::Tuple<D0Layout>;
using ELayout = RowMajor;
```
In CK, `PassThrough` is a struct denoting that no operation is applied to the tensor it binds to. To fuse the operations between E1, E2, and E introduced in the [Operation flow analysis](#operation-flow-analysis) section, we define a custom C++ struct, `ScaleScaleAddRelu`, and bind it to `CDEElementOp`. It determines the operations applied to `CShuffle` (the A×B results), tensor D, α, and β.
```c++
// No operations bound to the elements of A and B
using AElementOp = PassThrough;
using BElementOp = PassThrough;
// Operations bound to the elements of C, D and E
using CDEElementOp = ScaleScaleAddRelu;
```
In the binding struct, `operator()` performs an addition operation between `CShuffle` and matrix D, a ReLU operation on the addition results, and a rounding operation on the output elements. It then returns the results to E.
```c++
struct ScaleScaleAddRelu {
template <>
__host__ __device__ constexpr void
operator()<I8, I32, I8>(I8& e, const I32& c, const I8& d) const
{
// Scale AxB result with alpha
const F32 c_scale = ck::type_convert<F32>(c) * alpha;
// Scale D with beta
const F32 d_scale = ck::type_convert<F32>(d) * beta;
// Perform addition operation
F32 temp = c_scale + d_scale;
// Perform RELU operation
temp = temp > 0 ? temp : 0;
// Perform rounding operation
temp = temp > 127 ? 127 : temp;
// Return to E
e = ck::type_convert<I8>(temp);
}
F32 alpha;
F32 beta;
};
```
The original input tensors need to be padded to meet GPU tile-based parallelism.
```c++
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
```
The template parameters of the target fundamental instance are initialized with the above parameters and includes default tunable parameters. For specific tuning methods, see [Tunable parameters](#tunable-parameters).
```c++
using DeviceOpInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl<
// Tensor layout
ALayout, BLayout, DsLayout, ELayout,
// Tensor data type
ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,
// Tensor operation
AElementOp, BElementOp, CDEElementOp,
// Padding strategy
GemmDefault,
// Tunable parameters
tunable parameters>;
```
Return the address of the first element of tensors:
```c++
auto A_ref = A.data_ptr<ADataType>();
auto B_ref = B.data_ptr<BDataType>();
auto D0_ref = D.data_ptr<D0DataType>();
auto E_ref = E.data_ptr<EDataType>();
```
The fundamental instance is then initialized and run with actual arguments:
```c++
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument = device_op.MakeArgument(
A_ref, B_ref, {D0_ref}, E_ref,
M, N, K,
batch_count,
stride_A, stride_B, {stride_D0}, stride_E,
batch_stride_A, batch_stride_B, {batch_stride_D0}, batch_stride_E,
AElementOp{}, BElementOp{}, CDEElementOp{alpha, beta});
invoker.Run(argument, StreamConfig{nullptr, 0});
```
The output of the fundamental instance is a calculated batched matrix E (batch, M, N). Before the return, it needs to be converted to a 2-D matrix if a normal GEMM result is required.
```c++
// Convert (1, M, N) to (M, N)
return E.squeeze(0);
```
### Binding to Python
Since these functions are written in C++ using `torch::Tensor`, you can use `pybind11` to bind the functions and import them as Python modules. For this example, the necessary binding code for exposing the functions in the table spans only a few lines.
```c++
#include <torch/extension.h>
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
m.def("linear_ab_i8_de_f32", &linear_ab_i8_de_f32);
m.def("linear_relu_abde_i8", &linear_relu_abde_i8);
m.def("linear_abde_i8", &linear_abde_i8);
m.def("bmm_abe_i8", &bmm_abe_i8);
m.def("bmm_ab_i8_e_f32", &bmm_ab_i8_e_f32);
}
```
Build the C++ extension by writing a `setup.py` script that uses `setuptools` to compile the C++ code. A reference implementation of the `setup.py` script is as follows.
```python
import os
from setuptools import setup, find_packages
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension
os.environ["CC"] = "hipcc"
os.environ["CXX"] = "hipcc"
sources = [
'torch_int/kernels/linear.cpp',
'torch_int/kernels/bmm.cpp',
'torch_int/kernels/pybind.cpp',
]
include_dirs = ['torch_int/kernels/include']
extra_link_args = ['libutility.a']
extra_compile_args = ['-O3','-DNDEBUG', '-std=c++17', '--offload-arch=gfx942', '-DCK_ENABLE_INT8', '-D__HIP_PLATFORM_AMD__=1']
setup(
name='torch_int',
ext_modules=[
cpp_extension.CUDAExtension(
name='torch_int.rocm',
sources=sources,
include_dirs=include_dirs,
extra_link_args=extra_link_args,
extra_compile_args=extra_compile_args
),
],
cmdclass={
'build_ext': BuildExtension.with_options(use_ninja=False)
},
packages=find_packages(
exclude=['notebook', 'scripts', 'tests']),
)
```
Run `python setup.py install` to build and install the extension. It should look something like Figure 6:
<!--
================
### Figure 6
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-compilation.jpg
Compilation and installation of the INT8 kernels.
```
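After installation, the bound wrappers can be called on `torch` tensors from Python. The following is a hedged usage sketch: the module path follows the `setup.py` above, and the assumed shapes (A as (M, K), B holding the weight as (N, K) to match the column-major B layout and `stride_B = K`, and D as a length-N bias vector) should be verified against your own binding.

```python
import torch
import torch_int.rocm as ck_int8  # extension name from the setup.py above

M, N, K = 128, 4096, 4096
A = torch.randint(-128, 128, (M, K), dtype=torch.int8, device="cuda")  # activations
B = torch.randint(-128, 128, (N, K), dtype=torch.int8, device="cuda")  # weight, column-major B
D = torch.randint(-128, 128, (N,), dtype=torch.int8, device="cuda")    # bias vector
alpha, beta = 0.01, 0.5

# E = ReLU(alpha * (A x B) + beta * D), returned as an INT8 tensor of shape (M, N).
E = ck_int8.linear_relu_abde_i8(A, B, D, alpha, beta)
print(E.shape, E.dtype)  # expected: torch.Size([128, 4096]) torch.int8
```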
### INT8 model inference and performance
The implementation architecture of running SmoothQuant models on MI300X accelerators is illustrated in Figure 7, where (a) shows the components of the target model's decoder layer, (b) shows the major implementation classes for the decoder layer components, and \(c\) denotes the underlying GPU kernels implemented by the CK instance.
<!--
================
### Figure 7
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-inference_flow.jpg
The implementation architecture of running SmoothQuant models on AMD MI300X accelerators.
```
For the target [SQ quantized model](https://huggingface.co/mit-han-lab/opt-13b-smoothquant), each decoder layer contains three major components: attention calculation, layer normalization, and linear transformation in fully connected layers. The corresponding implementation classes for these components are:
- `Int8OPTAttention`
- `W8A8B8O8LinearReLU`
- `W8A8BF32OF32Linear`
These classes' underlying implementation logic harnesses the functions in the previous table. Note that for this example, the `LayerNormQ` module is implemented by the native torch module.
Testing environment:
The hardware platform used for testing is equipped with 256 AMD EPYC 9534 CPU cores, eight AMD Instinct MI300X accelerators, and 1.5 TB of memory. The testing was done in a publicly available Docker image from Docker Hub:
[`rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2`](https://hub.docker.com/layers/rocm/pytorch/rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2/images/sha256-f6ea7cee8aae299c7f6368187df7beed29928850c3929c81e6f24b34271d652b)
The tested models are the OPT-1.3B, 2.7B, 6.7B, and 13B FP16 models; the corresponding SmoothQuant INT8 OPT models were obtained from Hugging Face.
Note that since the default values were used for the tunable parameters of the fundamental instance, the performance of the INT8 kernel is suboptimal.
Figure 8 shows the performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator. The GPU memory footprints of SmoothQuant-quantized models are significantly reduced. It also indicates the per-sample inference latency is significantly reduced for all SmoothQuant-quantized OPT models (illustrated in (b)). Notably, the performance of the CK instance-based INT8 kernel steadily improves with an increase in model size.
<!--
================
### Figure 8
================ -->
```{figure} ../../data/how-to/llm-fine-tuning-optimization/ck-comparisons.jpg
Performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator.
```
For accuracy comparisons between the original FP16 and INT8 models, the evaluation is done by using the first 1,000 samples from the LAMBADA dataset's validation set. We employ the same Last Token Prediction Accuracy method introduced in [SmoothQuant Real-INT8 Inference for PyTorch](https://github.com/mit-han-lab/smoothquant/blob/main/examples/smoothquant_opt_real_int8_demo.ipynb) as our evaluation metric. The comparison results are shown in Table 2.
:::{table} The inference accuracy comparisons of SmoothQuant quantized models on Instinct MI300X.
| Models | Hugging Face FP16 model accuracy | SmoothQuant quantized INT8 model accuracy |
|:-----------------|----------------------------------------|---------------------------------------------|
| opt-1.3B | 0.72 | 0.70 |
| opt-2.7B | 0.76 | 0.75 |
| opt-6.7B | 0.80 | 0.79 |
| opt-13B | 0.79 | 0.77 |
:::
## Conclusion
CK provides a rich set of template parameters for generating flexible accelerated computing kernels for different application scenarios.
CK supports multiple instruction sets of AMD Instinct GPUs, operator fusion, and different data precisions. Its composability helps users quickly construct and verify operator performance.
With CK, you can build more effective AI applications with higher flexibility and better performance on different AMD accelerator platforms.


@@ -0,0 +1,104 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, optimization, LoRA, walkthrough
***************************************
Conceptual overview of fine-tuning LLMs
***************************************
Large language models (LLMs) are trained on massive amounts of text data to generate coherent and fluent text. The
underlying *transformer* architecture is the fundamental building block of all LLMs. Transformers
enable LLMs to understand and generate text by capturing contextual relationships and long-range dependencies. To better
understand the philosophy of the transformer architecture, review the foundational
`Attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_ paper.
By further training pre-trained LLMs, the fine-tuned model can gain knowledge related to specific fields or tasks,
thereby significantly improving its performance in that field or task. The core idea of fine-tuning is to use the
parameters of the pre-trained model as the starting point for new tasks and shape it through a small amount of
specific domain or task data, expanding the original model's capability to new tasks or datasets.
Fine-tuning can effectively improve the performance of existing pre-trained models in specific application scenarios.
Continuous training and adjustment of the parameters of the base model in the target domain or task can better capture
the semantic characteristics and patterns in specific scenarios, thereby significantly improving the key indicators of
the model in that domain or task. For example, by fine-tuning the Llama 2 model, its performance in certain applications
can be improved over the base model.
.. _fine-tuning-llms-concept-challenge:
The challenge of fine-tuning models
===================================
However, the computational cost of fine-tuning is still high, especially for complex models and large datasets, which
poses distinct challenges related to substantial computational and memory requirements. This might be a barrier for
accelerators or GPUs with low computing power or limited device memory resources.
For example, suppose we have a language model with 7 billion (7B) parameters, represented by a weight matrix :math:`W`.
During backpropagation, the model needs to learn a :math:`ΔW` matrix, which updates the original weights to minimize the
value of the loss function.
The weight update is as follows: :math:`W_{updated} = W + ΔW`.
If the weight matrix :math:`W` contains 7B parameters, then the weight update matrix :math:`ΔW` should also
contain 7B parameters. Therefore, the :math:`ΔW` calculation is computationally and memory intensive.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/weight-update.png
:alt: Weight update diagram
(a) Weight update in regular fine-tuning. (b) Weight update in LoRA where the product of matrix A (:math:`M\times K`)
and matrix B (:math:`K\times N`) is :math:`ΔW(M\times N)`; dimension K is a hyperparameter. By representing
:math:`ΔW` as the product of two smaller matrices (A and B) with a lower rank K, the number of trainable parameters
is significantly reduced.
.. _fine-tuning-llms-concept-optimizations:
Optimizations for model fine-tuning
===================================
Low-Rank Adaptation (LoRA) is a technique allowing fast and cost-effective fine-tuning of state-of-the-art LLMs that can
overcome this issue of high memory consumption.
LoRA accelerates the adjustment process and reduces related memory costs. To be precise, LoRA decomposes the portion of
weight changes :math:`ΔW` into high-precision low-rank representations, which do not require the calculations of all
:math:`ΔW`. It learns the decomposition representation of :math:`ΔW` during training, as shown in
the :ref:`weight update diagram <fine-tuning-llms-concept-challenge>`. This is how LoRA saves on
computing resources.
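As an illustrative calculation (not from the original text), consider a single :math:`4096 \times 4096` weight matrix
fine-tuned with LoRA rank :math:`K = 8`:

.. math::

   \underbrace{4096 \times 4096}_{\text{full } \Delta W} \approx 16.8\,\text{M}
   \quad \text{versus} \quad
   \underbrace{4096 \times 8 + 8 \times 4096}_{\text{LoRA factors } A,\, B} \approx 65.5\,\text{K}

trainable parameters for that layer, a reduction of roughly 256 times.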
LoRA is integrated into the `Hugging Face Parameter-Efficient Fine-Tuning (PEFT)
<https://huggingface.co/docs/peft/en/index>`_ library, as well as other computation and memory efficiency optimization
variants for model fine-tuning such as `AdaLoRA <https://huggingface.co/docs/peft/en/package_reference/adalora>`_. This
library efficiently adapts large pre-trained models to various downstream applications without fine-tuning all model
parameters. PEFT methods only fine-tune a few model parameters, significantly decreasing computational and storage
costs while yielding performance comparable to a fully fine-tuned model. PEFT is integrated with the `Hugging Face
Transformers <https://huggingface.co/docs/transformers/en/index>`_ library, providing a faster and easier way to load,
train, and use large models for inference.
To simplify running a fine-tuning implementation, the `Transformer Reinforcement Learning (TRL)
<https://huggingface.co/docs/trl/en/index>`_ library provides a set of tools to train transformer language models with
reinforcement learning, from the Supervised Fine-Tuning step (SFT), Reward Modeling step (RM), to the Proximal Policy
Optimization (PPO) step. The ``SFTTrainer`` API in TRL encapsulates these PEFT optimizations so you can easily import
their custom training configuration and run the training process.
.. _fine-tuning-llms-walkthrough-desc:
Walkthrough
===========
To demonstrate the benefits of LoRA and the ideal compute compatibility of using PEFT and TRL libraries on AMD
ROCm-compatible accelerators and GPUs, let's step through a comprehensive implementation of the fine-tuning process
using the Llama 2 7B model with LoRA tailored specifically for question-and-answer tasks on AMD MI300X accelerators.
Before starting, review and understand the key components of this walkthrough:
- `Llama 2 <https://huggingface.co/meta-llama>`_: a family of large language models developed and publicly released by
Meta. Its variants range in scale from 7 billion to 70 billion parameters.
- Fine-tuning: a critical process that refines LLMs for specialized tasks and optimizes performance.
- LoRA: a memory-efficient implementation of LLM fine-tuning that significantly reduces the number of trainable
parameters.
- `SFTTrainer <https://huggingface.co/docs/trl/v0.8.6/en/sft_trainer#supervised-fine-tuning-trainer>`_: an optimized
trainer with a simple interface to easily fine-tune pre-trained models with PEFT adapters, for example, LoRA, for
memory efficiency purposes on a custom dataset.
Continue the walkthrough in :doc:`Fine-tuning and inference <fine-tuning-and-inference>`.


@@ -0,0 +1,217 @@
.. meta::
:description: How to fine-tune LLMs with ROCm
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, profiling, debugging, performance, Triton
***********************
Profiling and debugging
***********************
This section discusses profiling and debugging tools and some of their common usage patterns with ROCm applications.
PyTorch Profiler
================
`PyTorch Profiler <https://pytorch.org/docs/stable/profiler.html>`_ can be invoked inside Python scripts, letting you
collect CPU and GPU performance metrics while the script is running. See the `PyTorch Profiler tutorial
<https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html>`_ for more information.
You can then visualize and view these metrics using an open-source profile visualization tool like
`Perfetto UI <https://ui.perfetto.dev>`_.
#. Use the following snippet to invoke PyTorch Profiler in your code.
.. code-block:: python
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
model = models.resnet18().cuda()
inputs = torch.randn(2000, 3, 224, 224).cuda()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
with record_function("model_inference"):
model(inputs)
prof.export_chrome_trace("resnet18_profile.json")
#. Profile results in ``resnet18_profile.json`` can be viewed by the Perfetto visualization tool. Go to
`<https://ui.perfetto.dev>`__ and import the file. In your Perfetto visualization, you'll see that the upper section
shows transactions denoting the CPU activities that launch GPU kernels while the lower section shows the actual GPU
activities where it processes the ``resnet18`` inferences layer by layer.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/perfetto-trace.svg
Perfetto trace visualization example.
ROCm profiling tools
====================
Heterogeneous systems, where programs run on both CPUs and GPUs, introduce additional complexities. Understanding the
critical path and kernel execution is all the more important; so, performance tuning is a necessary component in the
benchmarking process.
With AMD's profiling tools, developers are able to gain important insight into how efficiently their application is
using hardware resources and effectively diagnose potential bottlenecks contributing to poor performance. Developers
working with AMD Instinct accelerators have multiple tools depending on their specific profiling needs; these are:
* :ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>`
* :ref:`Omniperf <fine-tuning-llms-profiling-omniperf>`
* :ref:`Omnitrace <fine-tuning-llms-profiling-omnitrace>`
.. _fine-tuning-llms-profiling-rocprof:
ROCProfiler
-----------
:doc:`ROCProfiler <rocprofiler:index>` is primarily a low-level API for accessing and extracting GPU hardware performance
metrics, commonly called *performance counters*. These counters quantify the performance of the underlying architecture
showcasing which pieces of the computational pipeline and memory hierarchy are being utilized.
Your ROCm installation contains a script or executable command called ``rocprof`` which provides the ability to list all
available hardware counters for your specific accelerator or GPU, and run applications while collecting counters during
their execution.
This ``rocprof`` utility also depends on the :doc:`ROCTracer and ROC-TX libraries <roctracer:index>`, giving it the
ability to collect timeline traces of the accelerator software stack as well as user-annotated code regions.
.. note::
``rocprof`` is a CLI-only utility so input and output takes the format of ``.txt`` and CSV files. These
formats provide a raw view of the data and puts the onus on the user to parse and analyze. Therefore, ``rocprof``
gives the user full access and control of raw performance profiling data, but requires extra effort to analyze the
collected data.
.. _fine-tuning-llms-profiling-omniperf:
Omniperf
--------
`Omniperf <https://rocm.github.io/omniperf>`_ is a system performance profiler for high-performance computing (HPC) and
machine learning (ML) workloads using Instinct accelerators. Under the hood, Omniperf uses
:ref:`ROCProfiler <fine-tuning-llms-profiling-rocprof>` to collect hardware performance counters. The Omniperf tool performs
system profiling based on all approved hardware counters for Instinct
accelerator architectures. It provides high level performance analysis features including System Speed-of-Light, IP
block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more.
Omniperf takes the guesswork out of profiling by removing the need to provide text input files with lists of counters
to collect and analyze raw CSV output files as is the case with ROC-profiler. Instead, Omniperf automates the collection
of all available hardware counters in one command and provides a graphical interface to help users understand and
analyze bottlenecks and stressors for their computational workloads on AMD Instinct accelerators.
.. note::
Omniperf collects hardware counters in multiple passes, and will therefore re-run the application during each pass
to collect different sets of metrics.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/omniperf-analysis.png
Omniperf memory chart analysis panel.
In brief, Omniperf provides details about hardware activity for a particular GPU kernel. It also supports both
a web-based GUI or command-line analyzer, depending on your preference.
.. _fine-tuning-llms-profiling-omnitrace:
Omnitrace
---------
`Omnitrace <https://rocm.github.io/omnitrace>`_ is a comprehensive profiling and tracing tool for parallel applications,
including HPC and ML packages, written in C, C++, Fortran, HIP, OpenCL, and Python which execute on the CPU or CPU and
GPU. It is capable of gathering the performance information of functions through any combination of binary
instrumentation, call-stack sampling, user-defined regions, and Python interpreter hooks.
Omnitrace supports interactive visualization of comprehensive traces in the web browser in addition to high-level
summary profiles with ``mean/min/max/stddev`` statistics. Beyond runtime
information, Omnitrace supports the collection of system-level metrics such as CPU frequency, GPU temperature, and GPU
utilization. Process and thread level metrics such as memory usage, page faults, context switches, and numerous other
hardware counters are also included.
.. tip::
When analyzing the performance of an application, it is best not to assume you know where the performance
bottlenecks are and why they are happening. Omnitrace is the ideal tool for characterizing where optimization would
have the greatest impact on the end-to-end execution of the application and to discover what else is happening on the
system during a performance bottleneck.
.. figure:: ../../data/how-to/llm-fine-tuning-optimization/omnitrace-timeline.png
Omnitrace timeline trace example.
For detailed usage and examples of these tools, refer to the
`Introduction to profiling tools for AMD hardware <https://rocm.blogs.amd.com/software-tools-optimization/profilers/README.html>`_
developer blog.
Debugging with ROCr Debug Agent
===============================
:doc:`ROCr Debug Agent <rocr_debug_agent:index>` is a library that can be loaded by the ROCm platform
runtime (:doc:`ROCr <rocr-runtime:index>`) to provide the following functionalities for all AMD accelerators and GPUs
supported by the ROCm Debugger API (:doc:`ROCdbgapi <rocdbgapi:index>`).
* Print the state of all AMD accelerator or GPU wavefronts that caused a queue error; for example, causing a memory
violation, executing an ``s_trap2``, or executing an illegal instruction.
* Print the state of all AMD accelerator or GPU wavefronts by sending a ``SIGQUIT`` signal to the process in question;
for example, by pressing ``Ctrl + \`` while the process is executing.
Debugging memory access faults
------------------------------
Identifying a faulting kernel is often enough to triage a memory access fault. To that end, the
`ROCr Debug Agent <https://github.com/ROCm/rocr_debug_agent/>`_ can trap a memory access fault and provide a dump of all
active wavefronts that caused the error as well as the name of the kernel. The
`ROCr Debug Agent Library README <https://github.com/ROCm/rocr_debug_agent/blob/master/README.md>`_ provides full
instructions, but in brief:
* Compiling with ``-ggdb -O0`` is recommended but not required.
* ``HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2 HSA_ENABLE_DEBUG=1 ./my_program``
When the debug agent traps the fault, it will produce an extremely
verbose output of all wavefront registers and memory content.
Importantly, it also prints something like:
.. code-block:: shell
Disassembly for function vector_add_assert_trap(int*, int*, int*):
code object:
file:////rocm-debug-agent/build/test/rocm-debug-agent-test#offset=14309&size=31336
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
The kernel name and the code object file should be listed. In the
example above, the kernel name is ``vector_add_assert_trap``, but this might
also look like:
.. code-block:: shell
Disassembly for function memory:///path/to/codeobject#offset=1234&size=567:
In this case, it is an in-memory kernel that was generated at runtime.
Using the following environment variable, the debug agent will save all code objects to the current directory (use
``--save-code-objects=[DIR]`` to place them in another location). The code objects will be renamed from the URI format
with special characters replaced by ``_``.
.. code-block:: shell
ROCM_DEBUG_AGENT_OPTIONS="--all --save-code-objects"
Use the ``llvm-objdump`` command to disassemble the indicated in-memory
code object that has now been saved to disk. The name of the kernel is
often found inside the disassembled code object.
.. code-block:: shell
llvm-objdump --disassemble-all path/to/code-object.co
Consider turning off memory caching strategies both within the ROCm
stack and PyTorch where possible. This will give the debug agent the
best chance at finding the memory fault where it originates. Otherwise,
it could be masked by writing past the end of a cached block within a
larger allocation.
.. code-block:: shell
PYTORCH_NO_HIP_MEMORY_CACHING=1
HSA_DISABLE_FRAGMENT_ALLOCATOR=1


@@ -0,0 +1,510 @@
.. meta::
:description: Model fine-tuning and inference on a single-GPU system
:keywords: ROCm, LLM, fine-tuning, usage, tutorial, single-GPU, LoRA, PEFT, inference
****************************************************
Fine-tuning and inference using a single accelerator
****************************************************
This section explains model fine-tuning and inference techniques on a single-accelerator system. See
:doc:`Multi-accelerator fine-tuning <multi-gpu-fine-tuning-and-inference>` for a setup with multiple accelerators or
GPUs.
.. _fine-tuning-llms-single-gpu-env:
Environment setup
=================
This section was tested using the following hardware and software environment.
.. list-table::
:stub-columns: 1
* - Hardware
- AMD Instinct MI300X accelerator
* - Software
- ROCm 6.1, Ubuntu 22.04, PyTorch 2.1.2, Python 3.10
* - Libraries
- ``transformers`` ``datasets`` ``huggingface-hub`` ``peft`` ``trl`` ``scipy``
* - Base model
- ``meta-llama/Llama-2-7b-chat-hf``
.. _fine-tuning-llms-single-gpu-env-setup:
Setting up the base implementation environment
----------------------------------------------
#. Install PyTorch for ROCm. Refer to the
:doc:`PyTorch installation guide <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`. For a consistent
installation, it's recommended to use official ROCm prebuilt Docker images with the framework pre-installed.
#. In the Docker container, check the availability of ROCm-capable accelerators using the following command.
.. code-block:: shell
rocm-smi --showproductname
Your output should look like this:
.. code-block:: shell
============================ ROCm System Management Interface ============================
====================================== Product Info ======================================
GPU[0] : Card series: AMD Instinct MI300X OAM
GPU[0] : Card model: 0x74a1
GPU[0] : Card vendor: Advanced Micro Devices, Inc. [AMD/ATI]
GPU[0] : Card SKU: MI3SRIOV
==========================================================================================
================================== End of ROCm SMI Log ===================================
#. Check that your accelerators are available to PyTorch.
.. code-block:: python
import torch
print("Is a ROCm-GPU detected? ", torch.cuda.is_available())
print("How many ROCm-GPUs are detected? ", torch.cuda.device_count())
If successful, your output should look like this:
.. code-block:: shell
>>> print("Is a ROCm-GPU detected? ", torch.cuda.is_available())
Is a ROCm-GPU detected? True
>>> print("How many ROCm-GPUs are detected? ", torch.cuda.device_count())
How many ROCm-GPUs are detected? 4
#. Install the required dependencies.
bitsandbytes is a library that facilitates quantization to improve the efficiency of deep learning models. Learn more
about its use in :doc:`model-quantization`.
See the :ref:`Optimizations for model fine-tuning <fine-tuning-llms-concept-optimizations>` for a brief discussion on
PEFT and TRL.
.. code-block:: shell
# Install `bitsandbytes` for ROCm 6.0+.
# Use -DBNB_ROCM_ARCH to target a specific GPU architecture.
git clone --recurse https://github.com/ROCm/bitsandbytes.git
cd bitsandbytes
git checkout rocm_enabled
pip install -r requirements-dev.txt
cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
python setup.py install
# To leverage the SFTTrainer in TRL for model fine-tuning.
pip install trl
# To leverage PEFT for efficiently adapting pre-trained language models.
pip install peft
# Install the other dependencies.
pip install transformers datasets huggingface-hub scipy
#. Check that the required packages can be imported.
.. code-block:: python
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments
)
from peft import LoraConfig
from trl import SFTTrainer
.. _fine-tuning-llms-single-gpu-download-model-dataset:
Download the base model and fine-tuning dataset
-----------------------------------------------
#. Request access to download `Meta's official Llama model <https://huggingface.co/meta-llama>`_ from Hugging
Face. After permission is granted, log in with the following command using your personal access token:
.. code-block:: shell
huggingface-cli login
.. note::
You can also use the `NousResearch Llama-2-7b-chat-hf <https://huggingface.co/NousResearch/Llama-2-7b-chat-hf>`_
as a substitute. It has the same model weights as the original.
#. Run the following code to load the base model and tokenizer.
.. code-block:: python
# Base model and tokenizer names.
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load base model to GPU memory.
device = "cuda:0"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code = True).to(device)
# Load tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
base_model_name,
trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
#. Now, let's fine-tune the base model for a question-and-answer task using a small dataset called
`mlabonne/guanaco-llama2-1k <https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k>`_, which is a 1000 sample
subset of the `timdettmers/openassistant-guanaco <https://huggingface.co/datasets/OpenAssistant/oasst1>`_ dataset.
.. code-block:: python
# Dataset for fine-tuning.
training_dataset_name = "mlabonne/guanaco-llama2-1k"
training_dataset = load_dataset(training_dataset_name, split = "train")
# Check the data.
print(training_dataset)
# Dataset 11 is a QA sample in English.
print(training_dataset[11])
#. With the base model and the dataset, let's start fine-tuning!
.. _fine-tuning-llms-single-gpu-configure-params:
Configure fine-tuning parameters
--------------------------------
To set up ``SFTTrainer`` parameters, you can use the following code as reference.
.. code-block:: python
# Training parameters for SFTTrainer.
training_arguments = TrainingArguments(
output_dir = "./results",
num_train_epochs = 1,
per_device_train_batch_size = 4,
gradient_accumulation_steps = 1,
optim = "paged_adamw_32bit",
save_steps = 50,
logging_steps = 50,
learning_rate = 4e-5,
weight_decay = 0.001,
fp16=False,
bf16=False,
max_grad_norm = 0.3,
max_steps = -1,
warmup_ratio = 0.03,
group_by_length = True,
lr_scheduler_type = "constant",
report_to = "tensorboard"
)
.. _fine-tuning-llms-single-gpu-start:
Fine-tuning
===========
In this section, you'll see two ways of training: with the LoRA technique and without. See :ref:`Optimizations for model
fine-tuning <fine-tuning-llms-concept-optimizations>` for an introduction to LoRA. Training with LoRA uses the
``SFTTrainer`` API with its PEFT integration. Training without LoRA forgoes these benefits.
Compare the number of trainable parameters and training time under the two different methodologies.
.. tab-set::
.. tab-item:: Fine-tuning with LoRA and PEFT
:sync: with
1. Configure LoRA using the following code snippet.
.. code-block:: python
peft_config = LoraConfig(
lora_alpha = 16,
lora_dropout = 0.1,
r = 64,
bias = "none",
task_type = "CAUSAL_LM"
)
# View the number of trainable parameters.
from peft import get_peft_model
peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()
The output should look like this. Compare the number of trainable parameters to that when fine-tuning without
LoRA and PEFT.
.. code-block:: shell
trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.49548996469513035
2. Initialize ``SFTTrainer`` with a PEFT LoRA configuration and run the trainer.
.. code-block:: python
# SFTTrainer is provided by the TRL library.
from trl import SFTTrainer

# Initialize an SFT trainer.
sft_trainer = SFTTrainer(
model = base_model,
train_dataset = training_dataset,
peft_config = peft_config,
dataset_text_field = "text",
tokenizer = tokenizer,
args = training_arguments
)
# Run the trainer.
sft_trainer.train()
The output should look like this:
.. code-block:: shell
{'loss': 1.5973, 'grad_norm': 0.25271978974342346, 'learning_rate': 4e-05, 'epoch': 0.16}
{'loss': 2.0519, 'grad_norm': 0.21817368268966675, 'learning_rate': 4e-05, 'epoch': 0.32}
{'loss': 1.6147, 'grad_norm': 0.3046981394290924, 'learning_rate': 4e-05, 'epoch': 0.48}
{'loss': 1.4124, 'grad_norm': 0.11534837633371353, 'learning_rate': 4e-05, 'epoch': 0.64}
{'loss': 1.5627, 'grad_norm': 0.09108350425958633, 'learning_rate': 4e-05, 'epoch': 0.8}
{'loss': 1.417, 'grad_norm': 0.2536439299583435, 'learning_rate': 4e-05, 'epoch': 0.96}
{'train_runtime': 197.4947, 'train_samples_per_second': 5.063, 'train_steps_per_second': 0.633, 'train_loss': 1.6194254455566406, 'epoch': 1.0}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [03:17<00:00, 1.58s/it]
.. tab-item:: Fine-tuning without LoRA and PEFT
:sync: without
1. Use the following code to get started.
.. code-block:: python
def print_trainable_parameters(model):
# Prints the number of trainable parameters in the model.
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")
sft_trainer.peft_config = None
print_trainable_parameters(sft_trainer.model)
The output should look like this. Compare the number of trainable parameters to that when fine-tuning with LoRA
and PEFT.
.. code-block:: shell
trainable params: 6,738,415,616 || all params: 6,738,415,616 || trainable%: 100.00
2. Run the trainer.
.. code-block:: python
# Trainer without LoRA config.
trainer_full = SFTTrainer(
model = base_model,
train_dataset = training_dataset,
dataset_text_field = "text",
tokenizer = tokenizer,
args = training_arguments
)
# Training.
trainer_full.train()
The output should look like this:
.. code-block:: shell
{'loss': 1.5975, 'grad_norm': 0.25113457441329956, 'learning_rate': 4e-05, 'epoch': 0.16}
{'loss': 2.0524, 'grad_norm': 0.2180655151605606, 'learning_rate': 4e-05, 'epoch': 0.32}
{'loss': 1.6145, 'grad_norm': 0.2949850261211395, 'learning_rate': 4e-05, 'epoch': 0.48}
{'loss': 1.4118, 'grad_norm': 0.11036080121994019, 'learning_rate': 4e-05, 'epoch': 0.64}
{'loss': 1.5595, 'grad_norm': 0.08962831646203995, 'learning_rate': 4e-05, 'epoch': 0.8}
{'loss': 1.4119, 'grad_norm': 0.25422757863998413, 'learning_rate': 4e-05, 'epoch': 0.96}
{'train_runtime': 419.5154, 'train_samples_per_second': 2.384, 'train_steps_per_second': 0.298, 'train_loss': 1.6171623611450194, 'epoch': 1.0}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [06:59<00:00, 3.36s/it]
.. _fine-tuning-llms-single-gpu-saving:
Saving adapters or fully fine-tuned models
------------------------------------------
PEFT methods freeze the pre-trained model parameters during fine-tuning and add a small number of trainable
parameters, namely the adapters, on top of the base model. The adapters are trained to learn task-specific
information. Adapters trained with PEFT are usually an order of magnitude smaller than the full base model, making
them convenient to share, store, and load.
.. tab-set::
.. tab-item:: Saving a PEFT adapter
:sync: with
If you're using LoRA and PEFT, use the following code to save a PEFT adapter to your system once the fine-tuning
is completed.
.. code-block:: python
# PEFT adapter name.
adapter_name = "llama-2-7b-enhanced-adapter"
# Save PEFT adapter.
sft_trainer.model.save_pretrained(adapter_name)
The saved PEFT adapter should look like this on your system:
.. code-block:: shell
# Access adapter directory.
cd llama-2-7b-enhanced-adapter
# List all adapter files.
ls
README.md  adapter_config.json  adapter_model.safetensors
.. tab-item:: Saving a fully fine-tuned model
:sync: without
If you're not using LoRA and PEFT, so no PEFT LoRA configuration was used during training, use the following code
to save your fully fine-tuned model to your system.
.. code-block:: python
# Fully fine-tuned model name.
new_model_name = "llama-2-7b-enhanced"
# Save the fully fine-tuned model.
trainer_full.model.save_pretrained(new_model_name)
The saved new full model should look like this on your system:
.. code-block:: shell
# Access new model directory.
cd llama-2-7b-enhanced
# List all model files.
ls
config.json                       model-00002-of-00006.safetensors  model-00005-of-00006.safetensors
generation_config.json            model-00003-of-00006.safetensors  model-00006-of-00006.safetensors
model-00001-of-00006.safetensors  model-00004-of-00006.safetensors  model.safetensors.index.json
.. note::
PEFT adapters can't be loaded by ``AutoModelForCausalLM`` from the Transformers library because they don't contain
the full model parameters and model configuration, for example, ``config.json``. To use them as a normal transformer
model, you need to merge them into the base model.
Basic model inference
=====================
A trained model can be classified into one of three types:
* A PEFT adapter
* A pre-trained language model in Hugging Face
* A fully fine-tuned model not using PEFT
Let's look at achieving model inference using these types of models.
.. tab-set::
.. tab-item:: Inference using PEFT adapters
To use a PEFT adapter like a normal transformer model, load the base model together with the PEFT adapter and run
generation as follows.
.. code-block:: python
from peft import PeftModel
from transformers import AutoModelForCausalLM
# Set the path of the model or its name on the Hugging Face Hub.
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
# Set the path of the adapter.
adapter_name = "llama-2-7b-enhanced-adapter"
# Load base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
# Adapt the base model with the adapter
new_model = PeftModel.from_pretrained(base_model, adapter_name)
# Then, run generation the same way as with a normal model, as shown earlier.
The PEFT library provides a ``merge_and_unload`` method, which merges the adapter layers into the base model. This is
useful if you want to save the adapted model to local storage and use it as a normal standalone model.
.. code-block:: python
# Load the base model.
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
# Adapt the base model with the adapter.
new_model = PeftModel.from_pretrained(base_model, adapter_name)
# Merge the adapter layers into the base model.
merged_model = new_model.merge_and_unload()
# Save the merged model to local storage.
merged_model.save_pretrained("merged_adapters")
.. tab-item:: Inference using pre-trained or fully fine-tuned models
If you have a fully fine-tuned model not using PEFT, you can load it like any other pre-trained language model in
`Hugging Face Hub <https://huggingface.co/docs/hub/en/index>`_ using the `Transformers
<https://huggingface.co/docs/transformers/en/index>`_ library.
.. code-block:: python
# Import relevant class for loading model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
# Set the pre-trained model name on the Hugging Face Hub.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Set device type
device = "cuda:0"
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Input prompt encoding
query = "What is a large language model?"
inputs = tokenizer.encode(query, return_tensors="pt").to(device)
# Token generation
outputs = model.generate(inputs)
# Outputs decoding
print(tokenizer.decode(outputs[0]))
In addition, pipelines from Transformers offer simple APIs for using pre-trained models on different tasks, including
sentiment analysis, feature extraction, question answering, and so on. You can use the pipeline abstraction to run
model inference easily.
.. code-block:: python
# Import the pipeline class.
from transformers import pipeline
# Set the path of your model or its name on the Hugging Face Hub.
model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
# Set up the pipeline.
# A non-negative device value runs the model on the associated accelerator device ID.
pipe = pipeline("text-generation", model=model_name_or_path, device=0)
# Token generation
print(pipe("What is a large language model?")[0]["generated_text"])
If using multiple accelerators, see
:ref:`Multi-accelerator fine-tuning and inference <fine-tuning-llms-multi-gpu-hugging-face-accelerate>` to explore
popular libraries that simplify fine-tuning and inference in a multi-accelerator system.
Read more about inference frameworks like vLLM and Hugging Face TGI in
:doc:`LLM inference frameworks <llm-inference-frameworks>`.


@@ -0,0 +1,113 @@
.. meta::
:description: How to use ROCm for AI
:keywords: ROCm, AI, LLM, train, fine-tune, deploy, FSDP, DeepSpeed, LLaMA, tutorial
********************
Deploying your model
********************
ROCm enables inference and deployment for various classes of models including CNNs, RNNs, LSTMs, MLPs, and
transformers. This section focuses on deploying transformer-based LLMs.
ROCm supports vLLM and Hugging Face TGI as major LLM-serving frameworks.
.. _rocm-for-ai-serve-vllm:
Serving using vLLM
==================
vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM officially supports ROCm versions 5.7 and
6.0. AMD is actively working with the vLLM team to improve performance and support later ROCm versions.
See the `GitHub repository <https://github.com/vllm-project/vllm>`_ and `official vLLM documentation
<https://docs.vllm.ai/>`_ for more information.
For guidance on using vLLM with ROCm, refer to `Installation with ROCm
<https://docs.vllm.ai/en/latest/getting_started/amd-installation.html>`_.
vLLM installation
-----------------
vLLM supports two ROCm-capable installation methods. Refer to the official documentation using the following links.
- `Build from source with Docker
<https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-docker-rocm>`_ (recommended)
- `Build from source <https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm>`_
vLLM walkthrough
----------------
For guidance on serving with vLLM, refer to the developer blog `Inferencing and serving with vLLM on AMD GPUs — ROCm
Blogs <https://rocm.blogs.amd.com/artificial-intelligence/vllm/README.html>`_.
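For a quick local test, the following is a minimal sketch of vLLM's offline inference API; the model name is only an
example, and any Hugging Face model ID supported by vLLM can be used instead.

.. code-block:: python

   from vllm import LLM, SamplingParams

   # Load a small model for a smoke test; replace with your own model ID.
   llm = LLM(model="facebook/opt-125m")
   sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

   # Generate completions for a batch of prompts.
   outputs = llm.generate(["What is a large language model?"], sampling_params)
   for output in outputs:
       print(output.outputs[0].text)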
.. _rocm-for-ai-serve-hugging-face-tgi:
Serving using Hugging Face TGI
==============================
The `Hugging Face Text Generation Inference <https://huggingface.co/docs/text-generation-inference/index>`_
(TGI) library is optimized for serving LLMs with low latency. Refer to the `Quick tour of TGI
<https://huggingface.co/docs/text-generation-inference/quicktour>`_ for more details.
TGI installation
----------------
The easiest way to use Hugging Face TGI with ROCm on AMD Instinct accelerators is to use the official Docker image at
`<https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference>`__.
TGI walkthrough
---------------
#. Set up the LLM server.
Deploy the Llama2 7B model with TGI using the official Docker image.
.. code-block:: shell
model=TheBloke/Llama-2-7B-fp16
volume=$PWD
docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 1g -p 8080:80 -v $volume:/data --name tgi_amd ghcr.io/huggingface/text-generation-inference:1.2-rocm --model-id $model
#. Set up the client.
a. Open another shell session and run the following command to access the server with the client URL.
.. code-block:: shell
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
b. Alternatively, access the server's endpoints programmatically using the Python ``requests`` library.
.. code-block:: shell
pip install requests
PYTHONPATH=/usr/lib/python3/dist-packages python requests_model.py
``requests_model.py`` should look like:
.. code-block:: python
import requests
headers = {
"Content-Type": "application/json",
}
data = {
'inputs': 'What is Deep Learning?',
'parameters': { 'max_new_tokens': 20 },
}
response = requests.post('http://127.0.0.1:8080/generate', headers=headers, json=data)
print(response.json())
vLLM and Hugging Face TGI are robust solutions for anyone looking to deploy LLMs for applications that demand high
performance, low latency, and scalability.
Visit the topics in :doc:`Using ROCm for AI <index>` to learn about other ROCm-aware solutions for AI development.


@@ -0,0 +1,210 @@
.. meta::
:description: How to use ROCm for AI
:keywords: ROCm, AI, LLM, Hugging Face, Optimum, Flash Attention, GPTQ, ONNX, tutorial
********************************
Running models from Hugging Face
********************************
`Hugging Face <https://huggingface.co>`_ hosts the world's largest AI model repository for developers to obtain
transformer models. Hugging Face models and tools significantly enhance productivity, performance, and accessibility in
developing and deploying AI solutions.
This section describes how to run popular community transformer models from Hugging Face on AMD accelerators and GPUs.
.. _rocm-for-ai-hugging-face-transformers:
Using Hugging Face Transformers
-------------------------------
First, `install the Hugging Face Transformers library <https://huggingface.co/docs/transformers/en/installation>`_,
which lets you easily import any of the transformer models into your Python application.
.. code-block:: shell
pip install transformers
Here is an example of running `GPT2 <https://huggingface.co/openai-community/gpt2>`_:
.. code-block:: python
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me with any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
Mainstream transformer models are regularly tested on supported hardware platforms. Models derived from those core
models should also function correctly.
Here are some mainstream models to get you started:
- `BERT <https://huggingface.co/bert-base-uncased>`_
- `BLOOM <https://huggingface.co/bigscience/bloom>`_
- `Llama <https://huggingface.co/huggyllama/llama-7b>`_
- `OPT <https://huggingface.co/facebook/opt-66b>`_
- `T5 <https://huggingface.co/t5-base>`_
.. _rocm-for-ai-hugging-face-optimum:
Using Hugging Face with Optimum-AMD
-----------------------------------
Optimum-AMD is the interface between Hugging Face libraries and the ROCm software stack.
For a deeper dive into using Hugging Face libraries on AMD accelerators and GPUs, refer to the
`Optimum-AMD <https://huggingface.co/docs/optimum/main/en/amd/amdgpu/overview>`_ page on Hugging Face for guidance on
using Flash Attention 2, GPTQ quantization and the ONNX Runtime integration.
Hugging Face libraries natively support AMD Instinct accelerators. For other
:doc:`ROCm-capable hardware <rocm-install-on-linux:reference/system-requirements>`, support is currently not
validated, but most features are expected to work without issues.
.. _rocm-for-ai-install-optimum-amd:
Installation
~~~~~~~~~~~~
Install Optimum-AMD using pip.
.. code-block:: shell
pip install --upgrade --upgrade-strategy eager optimum[amd]
Or, install from source.
.. code-block:: shell
git clone https://github.com/huggingface/optimum-amd.git
cd optimum-amd
pip install -e .
.. _rocm-for-ai-flash-attention:
Flash Attention
---------------
#. Use `the Hugging Face team's example Dockerfile
<https://github.com/huggingface/optimum-amd/blob/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile>`_ to run
Flash Attention with ROCm.
.. code-block:: shell
docker build -f Dockerfile -t transformers_pytorch_amd_gpu_flash .
volume=$PWD
docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $volume:/workspace --name transformer_amd transformers_pytorch_amd_gpu_flash:latest
#. Use Flash Attention 2 with `Transformers
<https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2>`_ by adding the
``use_flash_attention_2`` parameter to ``from_pretrained()``:
.. code-block:: python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
with torch.device("cuda"):
model = AutoModelForCausalLM.from_pretrained(
"tiiuae/falcon-7b",
torch_dtype=torch.float16,
use_flash_attention_2=True,
)
.. _rocm-for-ai-gptq:
GPTQ
----
Hosted AutoGPTQ wheels are available for ROCm to enable `GPTQ <https://arxiv.org/abs/2210.17323>`_ support.
#. First, :ref:`install Optimum-AMD <rocm-for-ai-install-optimum-amd>`.
#. Install AutoGPTQ using pip. Refer to `AutoGPTQ Installation <https://github.com/AutoGPTQ/AutoGPTQ#Installation>`_ for
in-depth guidance.
.. code-block:: shell
pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm573/
Or, to install from source for AMD accelerators supporting ROCm, specify the ``ROCM_VERSION`` environment variable.
.. code-block:: shell
ROCM_VERSION=6.1 pip install -vvv --no-build-isolation -e .
#. Load GPTQ-quantized models in Transformers using the backend `AutoGPTQ library
<https://github.com/PanQiWei/AutoGPTQ>`_:
.. code-block:: python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-Chat-GPTQ")
with torch.device("cuda"):
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-Chat-GPTQ",
torch_dtype=torch.float16,
)
.. _rocm-for-ai-onnx:
ONNX
----
Hugging Face Optimum also supports the `ONNX Runtime <https://onnxruntime.ai>`_ integration. For ONNX models, usage is
straightforward.
#. Specify the provider argument in the ``ORTModel.from_pretrained()`` method:
.. code-block:: python
from optimum.onnxruntime import ORTModelForSequenceClassification
..
ort_model = ORTModelForSequenceClassification.from_pretrained(
..
provider="ROCMExecutionProvider"
)
#. Try running a `BERT text classification
<https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english>`_ ONNX model with ROCm:
.. code-block:: python
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.pipelines import pipeline
from transformers import AutoTokenizer
import onnxruntime as ort
session_options = ort.SessionOptions()
session_options.log_severity_level = 0
ort_model = ORTModelForSequenceClassification.from_pretrained(
"distilbert-base-uncased-finetuned-sst-2-english",
export=True,
provider="ROCMExecutionProvider",
session_options=session_options
)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
pipe = pipeline(task="text-classification", model=ort_model, tokenizer=tokenizer, device="cuda:0")
result = pipe("Both the music and visuals were astounding, not to mention the actors' performance.")


@@ -0,0 +1,23 @@
.. meta::
:description: How to use ROCm for AI
:keywords: ROCm, AI, machine learning, LLM, usage, tutorial
*****************
Using ROCm for AI
*****************
ROCm offers a suite of optimizations for AI workloads from large language models (LLMs) to image and video detection and
recognition, life sciences and drug discovery, autonomous driving, robotics, and more. ROCm proudly supports the broader
AI software ecosystem, including open frameworks, models, and tools.
For more information, see `What is ROCm? <https://rocm.docs.amd.com/en/latest/what-is-rocm.html>`_
In this guide, you'll learn about:
- :doc:`Installing ROCm and machine learning frameworks <install>`
- :doc:`Training a model <train-a-model>`
- :doc:`Running models from Hugging Face <hugging-face-models>`
- :doc:`Deploying your model <deploy-your-model>`


@@ -0,0 +1,60 @@
.. meta::
:description: How to use ROCm for AI
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
.. _rocm-for-ai-install:
***********************************************
Installing ROCm and machine learning frameworks
***********************************************
Before getting started, install ROCm and supported machine learning frameworks.
.. grid:: 1
.. grid-item-card:: Pre-install
Each release of ROCm supports specific hardware and software configurations. Before installing, consult the
:doc:`System requirements <rocm-install-on-linux:reference/system-requirements>` and
:doc:`Installation prerequisites <rocm-install-on-linux:how-to/prerequisites>` guides.
If you're new to ROCm, refer to the :doc:`ROCm quick start install guide for Linux
<rocm-install-on-linux:tutorial/quick-start>`.
If you're using a Radeon GPU for graphics-accelerated applications, refer to the
:doc:`Radeon installation instructions <radeon:docs/install/install-radeon>`.
ROCm supports two methods for installation. There is no difference in the final ROCm installation between these two
methods. You can also opt for :ref:`single-version or multi-version installation
<rocm-install-on-linux:installation-types>`.
* :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:how-to/native-install/index>`
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:how-to/amdgpu-install>`
.. grid:: 1
.. grid-item-card:: Post-install
Follow the :doc:`post-installation instructions <rocm-install-on-linux:how-to/native-install/post-install>` to
configure your system linker, PATH, and verify the installation.
If you encounter any issues during installation, refer to the
:doc:`Installation troubleshooting <rocm-install-on-linux:how-to/native-install/install-faq>` guide.
Machine learning frameworks
===========================
ROCm supports popular machine learning frameworks and libraries including `PyTorch
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and `DeepSpeed
<https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/>`_.
Review the framework installation documentation. For ease of use, it's recommended to use the official ROCm prebuilt
Docker images with the framework pre-installed.
* :doc:`PyTorch for ROCm <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
* :doc:`JAX for ROCm <rocm-install-on-linux:how-to/3rd-party/jax-install>`
The sections that follow in :doc:`Training a model <train-a-model>` are geared toward a ROCm installation with PyTorch.
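As a quick sanity check after installing PyTorch for ROCm, you can verify that the framework sees your accelerator.
This is a minimal sketch; on ROCm builds of PyTorch, the CUDA APIs map to HIP devices.

.. code-block:: python

   import torch

   print(torch.__version__)
   print(torch.version.hip)              # ROCm/HIP version string on ROCm builds of PyTorch
   print(torch.cuda.is_available())      # True if an AMD accelerator or GPU is visible
   print(torch.cuda.get_device_name(0))  # Name of the first visible device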


@@ -0,0 +1,140 @@
.. meta::
:description: How to use ROCm for AI
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
****************
Training a model
****************
The following is a brief overview of popular component paths per AI development use case, such as training, LLMs,
and inferencing.
Accelerating model training
===========================
A single accelerator or GPU cannot store all the model parameters required to train a large model like GPT-2 or
Llama 2 70B. What if you could convert your single-GPU training code to run on multiple accelerators or GPUs?
PyTorch offers distributed training solutions to facilitate this.
.. _rocm-for-ai-pytorch-distributed:
PyTorch distributed
-------------------
As of PyTorch 1.6.0, features in ``torch.distributed`` are categorized into three main components:
- `Distributed data-parallel training
<https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html>`_ (DDP)
- `RPC-Based distributed training <https://pytorch.org/docs/stable/rpc.html>`_ (RPC)
- `Collective communication <https://pytorch.org/docs/stable/distributed.html>`_
In this guide, the focus is on the distributed data-parallelism strategy, as it's the most popular. To get started with DDP,
let's first understand how to coordinate the model and its training data across multiple accelerators or GPUs.
The DDP workflow on multiple accelerators or GPUs is as follows:
#. Split the current global training batch into small local batches on each GPU. For instance, if you have 8 GPUs and
the global batch is set at 32 samples, each of the 8 GPUs will have a local batch size of 4 samples.
#. Copy the model to every device so each device can process its local batches independently.
#. Run a forward pass, then a backward pass, and output the gradient of the weights with respect to the loss of the
model for that local batch. This happens in parallel on multiple devices.
#. Synchronize the local gradients computed by each device and combine them to update the model weights. The updated
weights are then redistributed to each device.
In DDP training, each process or worker owns a replica of the model and processes a batch of data; the reducer then
uses ``allreduce`` to sum the gradients over the different workers.
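The following is a minimal, illustrative DDP sketch (not taken from the blogs referenced below). Each process drives
one accelerator, wraps the model in ``DistributedDataParallel``, and the gradient ``allreduce`` happens automatically
during ``backward()``. Launch it with ``torchrun``, for example ``torchrun --nproc_per_node=8 ddp_example.py``.

.. code-block:: python

   import os

   import torch
   import torch.distributed as dist
   from torch.nn.parallel import DistributedDataParallel as DDP

   def main():
       # torchrun sets LOCAL_RANK, RANK, WORLD_SIZE, and the rendezvous variables.
       dist.init_process_group(backend="nccl")  # maps to RCCL on ROCm
       local_rank = int(os.environ["LOCAL_RANK"])
       torch.cuda.set_device(local_rank)

       model = torch.nn.Linear(1024, 1024).cuda(local_rank)
       ddp_model = DDP(model, device_ids=[local_rank])
       optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=1e-4)

       # Each rank processes its own local batch; gradients are synchronized in backward().
       inputs = torch.randn(4, 1024, device=local_rank)
       loss = ddp_model(inputs).sum()
       loss.backward()
       optimizer.step()

       dist.destroy_process_group()

   if __name__ == "__main__":
       main()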
See the following developer blogs for more in-depth explanations and examples.
* `Multi GPU training with DDP — PyTorch Tutorials <https://pytorch.org/tutorials/beginner/ddp_series_multigpu.html>`_
* `Building a decoder transformer model on AMD GPUs — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/decoder-transformer/README.html#distributed-training-on-multiple-gpus>`_
.. _rocm-for-ai-pytorch-fsdp:
PyTorch FSDP
------------
As noted in :ref:`PyTorch distributed <rocm-for-ai-pytorch-distributed>`, in DDP, model weights and optimizer states
are evenly replicated across all workers. Fully Sharded Data Parallel (FSDP) is a type of data parallelism that shards
model parameters, optimizer states, and gradients across DDP ranks.
When training with FSDP, the GPU memory footprint is smaller than when training with DDP across all workers. This makes
the training of some very large models feasible by allowing larger models or batch sizes to fit on-device. However, this
comes with the cost of increased communication volume. The communication overhead is reduced by internal optimizations
like overlapping communication and computation.
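As a minimal illustration of the wrapping step (launch with ``torchrun``, as in the DDP sketch above), FSDP replaces
the DDP wrapper so that each rank holds only a shard of the parameters, gradients, and optimizer state.

.. code-block:: python

   import os

   import torch
   import torch.distributed as dist
   from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

   dist.init_process_group(backend="nccl")  # maps to RCCL on ROCm
   local_rank = int(os.environ["LOCAL_RANK"])
   torch.cuda.set_device(local_rank)

   model = torch.nn.Sequential(
       torch.nn.Linear(4096, 4096),
       torch.nn.ReLU(),
       torch.nn.Linear(4096, 4096),
   ).cuda(local_rank)

   # Each rank now keeps only a shard of the parameters between computations.
   fsdp_model = FSDP(model)
   optimizer = torch.optim.AdamW(fsdp_model.parameters(), lr=1e-4)

   loss = fsdp_model(torch.randn(2, 4096, device=local_rank)).sum()
   loss.backward()
   optimizer.step()
   dist.destroy_process_group()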
For a high-level overview of how FSDP works, review `Getting started with Fully Sharded Data Parallel
<https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html#how-fsdp-works>`_.
For detailed training steps, refer to the `PyTorch FSDP examples
<https://github.com/pytorch/examples/tree/main/distributed/FSDP>`_.
.. _rocm-for-ai-deepspeed:
DeepSpeed
---------
`DeepSpeed <https://deepspeed.ai>`_ offers system innovations that make large-scale deep learning training effective,
efficient, and easy to use. Innovations such as ZeRO, 3D-Parallelism, DeepSpeed-MoE, ZeRO-Infinity, and so on fall under
the training pillar.
See `Pre-training a large language model with Megatron-DeepSpeed on multiple AMD GPUs — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/megatron-deepspeed-pretrain/README.html>`_ for a detailed example of
training with DeepSpeed on an AMD accelerator or GPU.
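As a minimal sketch (not taken from the blog above), DeepSpeed is typically driven by a JSON-style configuration.
Here, a Python dict enables ZeRO stage 2 partitioning and fp16 training, and the script would be launched with the
DeepSpeed launcher, for example ``deepspeed ds_example.py``.

.. code-block:: python

   import torch
   import deepspeed

   model = torch.nn.Linear(4096, 4096)

   # Example configuration: ZeRO stage 2 with fp16 training.
   ds_config = {
       "train_micro_batch_size_per_gpu": 4,
       "fp16": {"enabled": True},
       "zero_optimization": {"stage": 2},
       "optimizer": {"type": "AdamW", "params": {"lr": 1e-4}},
   }

   # deepspeed.initialize wraps the model and optimizer in a DeepSpeed engine.
   model_engine, optimizer, _, _ = deepspeed.initialize(
       model=model,
       model_parameters=model.parameters(),
       config=ds_config,
   )

   inputs = torch.randn(4, 4096, device=model_engine.device, dtype=torch.half)
   loss = model_engine(inputs).sum()
   model_engine.backward(loss)  # the engine handles loss scaling and gradient partitioning
   model_engine.step()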
.. _rocm-for-ai-automatic-mixed-precision:
Automatic mixed precision (AMP)
-------------------------------
As models increase in size, so do the time and memory needed to train them; that is, their cost increases. Any measure
that reduces training time and memory usage through `automatic mixed precision
<https://pytorch.org/docs/stable/amp.html>`_ (AMP) is highly beneficial for most use cases.
See `Automatic mixed precision in PyTorch using AMD GPUs — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/automatic-mixed-precision/README.html#automatic-mixed-precision-in-pytorch-using-amd-gpus>`_
for more information about running AMP on an AMD accelerator.
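The following is a minimal AMP sketch: ``autocast`` runs the forward pass in mixed precision, and ``GradScaler``
scales the loss to avoid gradient underflow in float16.

.. code-block:: python

   import torch

   model = torch.nn.Linear(1024, 1024).cuda()
   optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
   scaler = torch.cuda.amp.GradScaler()

   for _ in range(10):
       inputs = torch.randn(8, 1024, device="cuda")
       optimizer.zero_grad()
       with torch.autocast(device_type="cuda", dtype=torch.float16):
           # Selected ops in the forward pass run in float16.
           loss = model(inputs).sum()
       scaler.scale(loss).backward()   # scale the loss, then backpropagate
       scaler.step(optimizer)          # unscale gradients and step the optimizer
       scaler.update()                 # adjust the scale factor for the next iteration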
.. _rocm-for-ai-fine-tune:
Fine-tuning your model
======================
ROCm supports multiple techniques for :ref:`optimizing fine-tuning <fine-tuning-llms-concept-optimizations>`, for
example, LoRA, QLoRA, PEFT, and FSDP.
Learn more about challenges and solutions for model fine-tuning in :doc:`../llm-fine-tuning-optimization/index`.
The following developer blogs showcase examples of how to fine-tune a model on an AMD accelerator or GPU.
* Fine-tuning Llama2 with LoRA
* `Fine-tune Llama 2 with LoRA: Customizing a large language model for question-answering — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/llama2-lora/README.html>`_
* Fine-tuning Llama2 with QLoRA
* `Enhancing LLM accessibility: A deep dive into QLoRA through fine-tuning Llama 2 on a single AMD GPU — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/llama2-Qlora/README.html>`_
* Fine-tuning a BERT-based LLM for a text classification task using JAX
* `LLM distributed supervised fine-tuning with JAX — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
* Fine-tuning StarCoder using PEFT
* `Instruction fine-tuning of StarCoder with PEFT on multiple AMD GPUs — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/starcoder-fine-tune/README.html>`_
* Recipes for fine-tuning Llama2 and 3 with ``llama-recipes``
* `meta-llama/llama-recipes: Scripts for fine-tuning Meta Llama3 with composable FSDP & PEFT methods to cover
single/multi-node GPUs <https://github.com/meta-llama/llama-recipes/tree/main/recipes/quickstart/finetuning>`_


@@ -5,7 +5,7 @@
ROCm">
</head>
# System debugging guide
# System debugging
## ROCm language and system-level debug, flags, and environment variables


@@ -1,107 +0,0 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="Tuning guides">
<meta name="keywords" content="high-performance computing, HPC, Instinct accelerators,
Radeon, tuning, tuning guide, AMD, ROCm">
</head>
# Tuning guides
Use case-specific system setup and tuning guides.
## High-performance computing
High-performance computing (HPC) workloads have unique requirements. The default
hardware and BIOS configurations for OEM platforms may not provide optimal
performance for HPC workloads. To enable optimal HPC settings on a per-platform
and per-workload level, this guide calls out:
* BIOS settings that can impact performance
* Hardware configuration best practices
* Supported versions of operating systems
* Workload-specific recommendations for optimal BIOS and operating system
settings
There is also a discussion on the AMD Instinct™ software development
environment, including information on how to install and run the DGEMM, STREAM,
HPCG, and HPL benchmarks. This guidance provides a good starting point but is
not exhaustively tested across all compilers.
Prerequisites to understanding this document and to performing tuning of HPC
applications include:
* Experience in configuring servers
* Administrative access to the server's Management Interface (BMC)
* Administrative access to the operating system
* Familiarity with the OEM server's BMC (strongly recommended)
* Familiarity with the OS specific tools for configuration, monitoring, and
troubleshooting (strongly recommended)
This document provides guidance on tuning systems with various AMD Instinct™
accelerators for HPC workloads. This document is not an all-inclusive guide, and
some items referred to may have similar, but different, names in various OEM
systems (for example, OEM-specific BIOS settings). This document also provides
suggestions on items that should be the initial focus of additional,
application-specific tuning.
This document is based on the AMD EPYC™ 7003-series processor family (former
codename "Milan").
While this guide is a good starting point, developers are encouraged to perform
their own performance testing for additional tuning.
:::::{grid} 1 1 2 2
:gutter: 1
:::{grid-item-card}
**[AMD Instinct™ MI200](./tuning-guides/mi200)**
This chapter goes through how to configure your AMD Instinct™ MI200 accelerated
compute nodes to get the best performance out of them.
* [Instruction Set Architecture (ISA)](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
* [White paper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf)
:::
:::{grid-item-card}
**[AMD Instinct™ MI100](./tuning-guides/mi100)**
This chapter briefly reviews hardware aspects of the AMD Instinct™ MI100
accelerators and the CDNA™ 1 architecture that is the foundation of these GPUs.
* [ISA](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
* [White paper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf)
:::
:::::
## Workstation
Workstation workloads, much like high-performance computing, have a unique set of
requirements, a blend of both graphics and compute, certification, stability and
the list continues.
The document covers specific software requirements and processes needed to use
these GPUs for Single Root I/O Virtualization (SR-IOV) and machine learning
(ML).
The main purpose of this document is to help users utilize the RDNA 2 GPUs to
their full potential.
:::::{grid} 1 1 2 2
:gutter: 1
:::{grid-item-card}
**[AMD Radeon™ PRO W6000 and V620](./tuning-guides/w6000-v620)**
This chapter describes the AMD GPUs with RDNA™ 2 architecture, namely AMD Radeon
PRO W6800 and AMD Radeon PRO V620
* [AMD RDNA2 ISA](https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf)
* [White paper](https://www.amd.com/system/files/documents/rdna2-explained-radeon-pro-W6000.pdf)
:::
:::::


@@ -0,0 +1,107 @@
.. meta::
:description: AMD hardware optimization for specific workloads
:keywords: high-performance computing, HPC, Instinct accelerators, Radeon,
AMD, ROCm, system, EPYC, CPU, GPU, BIOS, OS
*******************
System optimization
*******************
System administrators can optimize the performance of their AMD hardware
generally and based on specific workloads and use cases. This section outlines
recommended system optimization options for AMD accelerators and GPUs, enabling
administrators to maximize efficiency and performance.
High-performance computing workloads
====================================
High-performance computing (HPC) workloads have unique requirements that may not
be fully met by the default hardware and BIOS configurations of OEM platforms.
To achieve optimal performance for HPC workloads, it is crucial to adjust
settings at both the platform and workload levels.
The :ref:`AMD Instinct™ accelerator optimization guides <mi-optimization-guides>`
in this section describe:
* BIOS settings that can impact performance
* Hardware configuration best practices
* Supported versions of operating systems
* Workload-specific recommendations for optimal BIOS and operating system
settings
The guides might also discuss the AMD Instinct software development
environment, including information on how to install and run the DGEMM, STREAM,
HPCG, and HPL benchmarks. The guides provide a good starting point but are
not tested exhaustively across all compilers.
Knowledge prerequisites to better understand the following
:ref:`Instinct system optimization guides <mi-optimization-guides>` and to
perform tuning for HPC applications include:
* Experience in configuring servers
* Administrative access to the server's Management Interface (BMC)
* Administrative access to the operating system
* Familiarity with the OEM server's BMC (strongly recommended)
* Familiarity with the OS specific tools for configuration, monitoring, and
troubleshooting (strongly recommended)
While the following guides are a good starting point, developers are encouraged
to perform their own performance testing for additional tuning per device and
per workload.
.. _mi-optimization-guides:
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Optimization guide
- Architecture reference
- White papers
* - :doc:`AMD Instinct MI200 <tuning-guides/mi200>`
- `AMD Instinct MI200 instruction set architecture <https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf>`_
- `CDNA 2 architecture <https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf>`_
* - :doc:`AMD Instinct MI100 <tuning-guides/mi100>`
- `AMD Instinct MI100 instruction set architecture <https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf>`_
- `CDNA architecture <https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf>`_
Workstation workloads
=====================
Workstation workloads, much like those for HPC, have a unique set of
requirements: a blend of both graphics and compute, certification, stability, and
more.
The following guide covers the specific software requirements and processes needed to use
these GPUs for Single Root I/O Virtualization (SR-IOV) and machine learning
tasks.
Its main purpose is to help users utilize RDNA™ 2 GPUs to
their full potential.
.. _rdna-optimization-guides:
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Optimization guide
- Architecture reference
- White papers
* - :doc:`AMD Radeon PRO W6000 and V620 <tuning-guides/w6000-v620>`
- `AMD RDNA 2 instruction set architecture <https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf>`_
- `RDNA 2 architecture <https://www.amd.com/system/files/documents/rdna2-explained-radeon-pro-W6000.pdf>`_


@@ -1,11 +1,11 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="MI100 high-performance computing and tuning guide">
<meta name="keywords" content="MI100, high-performance computing, HPC, tuning, BIOS
<meta name="keywords" content="MI100, high-performance computing, HPC, BIOS
settings, NBIO, AMD, ROCm">
</head>
# MI100 high-performance computing and tuning guide
# AMD Instinct MI100 system optimization
## System settings


@@ -1,11 +1,11 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="MI200 high-performance computing and tuning guide">
<meta name="keywords" content="MI200, high-performance computing, HPC, tuning, BIOS
<meta name="keywords" content="MI200, high-performance computing, HPC, BIOS
settings, NBIO, AMD, ROCm">
</head>
# MI200 high-performance computing and tuning guide
# AMD Instinct MI200 system optimization
## System settings


@@ -1,11 +1,11 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="RDNA2 workstation tuning guide">
<meta name="keywords" content="RDNA2, workstation tuning, BIOS settings, installation, AMD,
<meta name="keywords" content="RDNA2, workstation, BIOS settings, installation, AMD,
ROCm">
</head>
# RDNA2 workstation tuning guide
# AMD RDNA2 system optimization
## System settings


@@ -25,81 +25,85 @@ Our documentation is organized into the following categories:
:class-container: rocm-doc-grid
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ./data/banner-installation.jpg
:img-alt: Install documentation
:padding: 2
* Linux
* {doc}`Quick-start (Linux)<rocm-install-on-linux:tutorial/quick-start>`
* {doc}`Quick start guide<rocm-install-on-linux:tutorial/quick-start>`
* {doc}`Linux install guide<rocm-install-on-linux:how-to/native-install/index>`
* {doc}`Package manager integration<rocm-install-on-linux:how-to/native-install/package-manager-integration>`
* {doc}`Install Docker containers<rocm-install-on-linux:how-to/docker>`
* {doc}`ROCm & Spack<rocm-install-on-linux:how-to/spack>`
* Windows
* {doc}`Windows install guide<rocm-install-on-windows:how-to/install>`
* {doc}`Application deployment guidelines<rocm-install-on-windows:conceptual/deployment-guidelines>`
* {doc}`Install Docker containers<rocm-install-on-linux:how-to/docker>`
* {doc}`PyTorch for ROCm<rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
* {doc}`TensorFlow for ROCm<rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
* {doc}`MAGMA for ROCm<rocm-install-on-linux:how-to/3rd-party/magma-install>`
* {doc}`ROCm & Spack<rocm-install-on-linux:how-to/spack>`
* [Deep learning frameworks](./how-to/deep-learning-rocm.rst)
* {doc}`PyTorch for ROCm<rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
* {doc}`TensorFlow for ROCm<rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
* {doc}`JAX for ROCm<rocm-install-on-linux:how-to/3rd-party/jax-install>`
* {doc}`MAGMA for ROCm<rocm-install-on-linux:how-to/3rd-party/magma-install>`
:::
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ./data/banner-compatibility.jpg
:img-alt: Compatibility information
:padding: 2
* [Compatibility matrix](./compatibility/compatibility-matrix.rst)
* {doc}`System requirements (Linux)<rocm-install-on-linux:reference/system-requirements>`
* {doc}`System requirements (Windows)<rocm-install-on-windows:reference/system-requirements>`
* {doc}`Third-party support<rocm-install-on-linux:reference/3rd-party-support-matrix>`
* {doc}`User/kernel space<rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`
* {doc}`Docker<rocm-install-on-linux:reference/docker-image-support-matrix>`
* [OpenMP](./about/compatibility/openmp.md)
* [Precision support](./about/compatibility/data-type-support.rst)
* [Precision support](./compatibility/precision-support.rst)
* {doc}`ROCm on Radeon GPUs<radeon:index>`
:::
<!-- markdownlint-disable MD051 -->
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ./data/banner-reference.jpg
:img-alt: Reference documentation
:padding: 2
* [API libraries](./reference/api-libraries.md)
* Artificial intelligence
* C++ primitives
* Communication
* Fast Fourier transforms
* HIP
* Linear algebra
* Random number generators
* [Artificial intelligence](#artificial-intelligence-apis)
* [C++ primitives](#cpp-primitives)
* [Communication](#communication-libraries)
* [Math](#math-apis)
* [Random number generators](#random-number-apis)
* [HIP runtime](#hip-runtime)
* [Tools](./reference/rocm-tools.md)
* Development
* Performance analysis
* System
* [GPU architectures](./reference/gpu-arch.rst)
* [GPU architecture hardware specification overview](./reference/gpu-arch/gpu-arch-spec-overview.rst)
* [Development](#development-tools)
* [Performance analysis](#performance-analysis)
* [System](#system-tools)
* [Hardware specifications](./reference/gpu-arch-specs.rst)
:::
<!-- markdownlint-enable MD051 -->
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ./data/banner-howto.jpg
:img-alt: How-to documentation
:padding: 2
* [System tuning for various architectures](./how-to/tuning-guides.md)
* [MI100](./how-to/tuning-guides/mi100.md)
* [MI200](./how-to/tuning-guides/mi200.md)
* [RDNA2](./how-to/tuning-guides/w6000-v620.md)
* [Setting up for deep learning with ROCm](./how-to/deep-learning-rocm.md)
* [Using ROCm for AI](./how-to/rocm-for-ai/index.rst)
* [Fine-tuning LLMs and inference optimization](./how-to/llm-fine-tuning-optimization/index.rst)
* [System optimization](./how-to/tuning-guides.rst)
* [AMD Instinct MI200](./how-to/tuning-guides/mi200.md)
* [AMD Instinct MI100](./how-to/tuning-guides/mi100.md)
* [AMD Instinct RDNA2](./how-to/tuning-guides/w6000-v620.md)
* [System level debugging](./how-to/system-debugging.md)
* [GPU-enabled MPI](./how-to/gpu-enabled-mpi.rst)
* [Using compiler features](./conceptual/compiler-topics.md)
* [Using AddressSanitizer](./conceptual/using-gpu-sanitizer.md)
* [Compiler disambiguation](./conceptual/compiler-disambiguation.md)
* [OpenMP support in ROCm](./about/compatibility/openmp.md)
* [System level debugging](./how-to/system-debugging.md)
* [GitHub examples](https://github.com/amd/rocm-examples)
:::
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ./data/banner-conceptual.jpg
:img-alt: Conceptual documentation
:padding: 2
@@ -109,15 +113,12 @@ Our documentation is organized into the following categories:
* [MI250](./conceptual/gpu-arch/mi250.md)
* [MI300](./conceptual/gpu-arch/mi300.md)
* [GPU memory](./conceptual/gpu-memory.md)
* [Compiler disambiguation](./conceptual/compiler-disambiguation.md)
* [File structure (Linux FHS)](./conceptual/file-reorg.md)
* [GPU isolation techniques](./conceptual/gpu-isolation.md)
* [LLVM ASan](./conceptual/using-gpu-sanitizer.md)
* [Using CMake](./conceptual/cmake-packages.rst)
* [ROCm & PCIe atomics](./conceptual/More-about-how-ROCm-uses-PCIe-Atomics.rst)
* [Inception v3 with PyTorch](./conceptual/ai-pytorch-inception.md)
* [Inference optimization with MIGraphX](./conceptual/ai-migraphx-optimization.md)
* [OpenMP support in ROCm](./about/compatibility/openmp.md)
:::
::::


@@ -11,6 +11,8 @@
::::{grid} 1 2 2 2
:class-container: rocm-doc-grid
(artificial-intelligence-apis)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-ai.jpg
@@ -21,9 +23,13 @@
* {doc}`MIGraphX <amdmigraphx:index>`
* {doc}`MIOpen <miopen:index>`
* {doc}`MIVisionX <mivisionx:doxygen/html/index>`
* [ROCm Performance Primitives (RPP)](https://rocm.docs.amd.com/projects/rpp/en/latest/)
* {doc}`rocAL <rocal:index>`
* {doc}`rocDecode <rocdecode:index>`
* {doc}`ROCm Performance Primitives (RPP) <rpp:index>`
:::
(cpp-primitives)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-cpp-primitives.jpg
@@ -36,6 +42,8 @@
* {doc}`rocThrust <rocthrust:index>`
:::
(communication-libraries)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-communication.jpg
@@ -45,6 +53,8 @@
* {doc}`RCCL <rccl:index>`
:::
(hip-runtime)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-hip.jpg
@@ -55,6 +65,8 @@
* {doc}`HIPIFY <hipify:index>`
:::
(math-apis)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-math.jpg
@@ -65,7 +77,7 @@
* {doc}`hipBLAS <hipblas:index>` / {doc}`rocBLAS <rocblas:index>`
* {doc}`hipBLASLt <hipblaslt:index>`
* {doc}`hipFFT <hipfft:index>` / {doc}`rocFFT <rocfft:index>`
* [hipfort](https://rocm.docs.amd.com/projects/hipfort/en/latest/)
* {doc}`hipfort <hipfort:index>`
* {doc}`hipSOLVER <hipsolver:index>` / {doc}`rocSOLVER <rocsolver:index>`
* {doc}`hipSPARSE <hipsparse:index>` / {doc}`rocSPARSE <rocsparse:index>`
* {doc}`hipSPARSELt <hipsparselt:index>`
@@ -74,6 +86,8 @@
* [Tensile](https://github.com/ROCm/Tensile)
:::
(random-number-apis)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-random-number.jpg


@@ -0,0 +1,756 @@
.. meta::
:description: AMD Instinct™ accelerator, AMD Radeon PRO™, and AMD Radeon™ GPU architecture information
:keywords: Instinct, Radeon, accelerator, GCN, CDNA, RDNA, GPU, architecture, VRAM, Compute Units, Cache, Registers, LDS, Register File
Accelerator and GPU hardware specifications
===========================================
The following tables provide an overview of the hardware specifications for AMD Instinct™ accelerators, and AMD Radeon™ PRO and Radeon™ GPUs.
.. tab-set::
.. tab-item:: AMD Instinct accelerators
.. list-table::
:header-rows: 1
:name: instinct-arch-spec-table
*
- Model
- Architecture
- LLVM target name
- VRAM (GiB)
- Compute Units
- Wavefront Size
- LDS (KiB)
- L3 Cache (MiB)
- L2 Cache (MiB)
- L1 Vector Cache (KiB)
- L1 Scalar Cache (KiB)
- L1 Instruction Cache (KiB)
- VGPR File (KiB)
- SGPR File (KiB)
*
- MI300X
- CDNA3
- gfx941 or gfx942
- 192
- 304
- 64
- 64
- 256
- 32
- 32
- 16 per 2 CUs
- 64 per 2 CUs
- 512
- 12.5
*
- MI300A
- CDNA3
- gfx940 or gfx942
- 128
- 228
- 64
- 64
- 256
- 24
- 32
- 16 per 2 CUs
- 64 per 2 CUs
- 512
- 12.5
*
- MI250X
- CDNA2
- gfx90a
- 128
- 220 (110 per GCD)
- 64
- 64
-
- 16 (8 per GCD)
- 16
- 16 per 2 CUs
- 32 per 2 CUs
- 512
- 12.5
*
- MI250
- CDNA2
- gfx90a
- 128
- 208
- 64
- 64
-
- 16 (8 per GCD)
- 16
- 16 per 2 CUs
- 32 per 2 CUs
- 512
- 12.5
*
- MI210
- CDNA2
- gfx90a
- 64
- 104
- 64
- 64
-
- 8
- 16
- 16 per 2 CUs
- 32 per 2 CUs
- 512
- 12.5
*
- MI100
- CDNA
- gfx908
- 32
- 120
- 64
- 64
-
- 8
- 16
- 16 per 3 CUs
- 32 per 3 CUs
- 256 VGPR and 256 AccVGPR
- 12.5
*
- MI60
- GCN5.1
- gfx906
- 32
- 64
- 64
- 64
-
- 4
- 16
- 16 per 3 CUs
- 32 per 3 CUs
- 256
- 12.5
*
- MI50 (32GB)
- GCN5.1
- gfx906
- 32
- 60
- 64
- 64
-
- 4
- 16
- 16 per 3 CUs
- 32 per 3 CUs
- 256
- 12.5
*
- MI50 (16GB)
- GCN5.1
- gfx906
- 16
- 60
- 64
- 64
-
- 4
- 16
- 16 per 3 CUs
- 32 per 3 CUs
- 256
- 12.5
*
- MI25
- GCN5.0
- gfx900
- 16 
- 64
- 64
- 64 
-
- 4 
- 16 
- 16 per 3 CUs
- 32 per 3 CUs
- 256
- 12.5
*
- MI8
- GCN3.0
- gfx803
- 4
- 64
- 64
- 64
-
- 2
- 16
- 16 per 4 CUs
- 32 per 4 CUs
- 256
- 12.5
*
- MI6
- GCN4.0
- gfx803
- 16
- 36
- 64
- 64
-
- 2
- 16
- 16 per 4 CUs
- 32 per 4 CUs
- 256
- 12.5
.. tab-item:: AMD Radeon PRO GPUs
.. list-table::
:header-rows: 1
:name: radeon-pro-arch-spec-table
*
- Model
- Architecture
- LLVM target name
- VRAM (GiB)
- Compute Units
- Wavefront Size
- LDS (KiB)
- Infinity Cache (MiB)
- L2 Cache (MiB)
- Graphics L1 Cache (KiB)
- L0 Vector Cache (KiB)
- L0 Scalar Cache (KiB)
- L0 Instruction Cache (KiB)
- VGPR File (KiB)
- SGPR File (KiB)
*
- Radeon PRO W7900
- RDNA3
- gfx1100
- 48
- 96
- 32
- 128
- 96
- 6
- 256
- 32
- 16
- 32
- 384
- 20
*
- Radeon PRO W7800
- RDNA3
- gfx1100
- 32
- 70
- 32
- 128
- 64
- 6
- 256
- 32
- 16
- 32
- 384
- 20
*
- Radeon PRO W7700
- RDNA3
- gfx1101
- 16
- 48
- 32
- 128
- 64
- 4
- 256
- 32
- 16
- 32
- 384
- 20
*
- Radeon PRO W6800
- RDNA2
- gfx1030
- 32
- 60
- 32
- 128
- 128
- 4
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon PRO W6600
- RDNA2
- gfx1032
- 8
- 28
- 32
- 128
- 32
- 2
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon PRO V620
- RDNA2
- gfx1030
- 32
- 72
- 32
- 128
- 128
- 4
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon Pro W5500
- RDNA
- gfx1012
- 8
- 22
- 32
- 128
-
- 4
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon Pro VII
- GCN5.1
- gfx906
- 16
- 60
- 64
- 64
-
- 4
-
- 16
- 16 per 3 CUs
- 32 per 3 CUs
- 256
- 12.5
.. tab-item:: AMD Radeon GPUs
.. list-table::
:header-rows: 1
:name: radeon-arch-spec-table
*
- Model
- Architecture
- LLVM target name
- VRAM (GiB)
- Compute Units
- Wavefront Size
- LDS (KiB)
- Infinity Cache (MiB)
- L2 Cache (MiB)
- Graphics L1 Cache (KiB)
- L0 Vector Cache (KiB)
- L0 Scalar Cache (KiB)
- L0 Instruction Cache (KiB)
- VGPR File (KiB)
- SGPR File (KiB)
*
- Radeon RX 7900 XTX
- RDNA3
- gfx1100
- 24
- 96
- 32
- 128
- 96
- 6
- 256
- 32
- 16
- 32
- 384
- 20
*
- Radeon RX 7900 XT
- RDNA3
- gfx1100
- 20
- 84
- 32
- 128
- 80
- 6
- 256
- 32
- 16
- 32
- 384
- 20
*
- Radeon RX 7900 GRE
- RDNA3
- gfx1100
- 16
- 80
- 32
- 128
- 64
- 6
- 256
- 32
- 16
- 32
- 384
- 20
*
- Radeon RX 7800 XT
- RDNA3
- gfx1101
- 16
- 60
- 32
- 128
- 64
- 4
- 256
- 32
- 16
- 32
- 384
- 20
*
- Radeon RX 7700 XT
- RDNA3
- gfx1101
- 12
- 54
- 32
- 128
- 48
- 4
- 256
- 32
- 16
- 32
- 384
- 20
*
- Radeon RX 7600
- RDNA3
- gfx1102
- 8
- 32
- 32
- 128
- 32
- 2
- 256
- 32
- 16
- 32
- 256
- 20
*
- Radeon RX 6950 XT
- RDNA2
- gfx1030
- 16
- 80
- 32
- 128
- 128
- 4
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6900 XT
- RDNA2
- gfx1030
- 16
- 80
- 32
- 128
- 128
- 4
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6800 XT
- RDNA2
- gfx1030
- 16
- 72
- 32
- 128
- 128
- 4
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6800
- RDNA2
- gfx1030
- 16
- 60
- 32
- 128
- 128
- 4
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6750 XT
- RDNA2
- gfx1031
- 12
- 40
- 32
- 128
- 96
- 3
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6700 XT
- RDNA2
- gfx1031
- 12
- 40
- 32
- 128
- 96
- 3
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6700
- RDNA2
- gfx1031
- 10
- 36
- 32
- 128
- 80
- 3
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6650 XT
- RDNA2
- gfx1032
- 8
- 32
- 32
- 128
- 32
- 2
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6600 XT
- RDNA2
- gfx1032
- 8
- 32
- 32
- 128
- 32
- 2
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon RX 6600
- RDNA2
- gfx1032
- 8
- 28
- 32
- 128
- 32
- 2
- 128
- 16
- 16
- 32
- 256
- 20
*
- Radeon VII
- GCN5.1
- gfx906
- 16
- 60
- 64
- 64 per CU
-
- 4
-
- 16
- 16 per 3 CUs
- 32 per 3 CUs
- 256
- 12.5
Glossary
========
For more information about the terms used, see the
:ref:`specific documents and guides <gpu-arch-documentation>`, or
:doc:`Understanding the HIP programming model<hip:understand/programming_model>`.
**LLVM target name**
Argument to pass to clang in `--offload-arch` to compile code for the given
architecture.
**VRAM**
Amount of memory available on the GPU.
**Compute Units**
Number of compute units on the GPU.
**Wavefront Size**
Amount of work items that execute in parallel on a single compute unit. This
is equivalent to the warp size in HIP.
**LDS**
The Local Data Share (LDS) is a low-latency, high-bandwidth scratch pad
memory. It is local to the compute units, and can be shared by all work items
in a work group. In HIP, the LDS can be used for shared memory, which is
shared by all threads in a block.
**L3 Cache (CDNA/GCN only)**
Size of the level 3 cache. Shared by all compute units on the same GPU. Caches
data and instructions. Similar to the Infinity Cache on RDNA architectures.
**Infinity Cache (RDNA only)**
Size of the infinity cache. Shared by all compute units on the same GPU. Caches
data and instructions. Similar to the L3 Cache on CDNA/GCN architectures.
**L2 Cache**
Size of the level 2 cache. Shared by all compute units on the same GCD. Caches
data and instructions.
**Graphics L1 Cache (RDNA only)**
An additional cache level that only exists in RDNA architectures. Local to a
work group processor.
**L1 Vector Cache (CDNA/GCN only)**
Size of the level 1 vector data cache. Local to a compute unit. This is the L0
vector cache in RDNA architectures.
**L1 Scalar Cache (CDNA/GCN only)**
Size of the level 1 scalar data cache. Usually shared by several compute
units. This is the L0 scalar cache in RDNA architectures.
**L1 Instruction Cache (CDNA/GCN only)**
Size of the level 1 instruction cache. Usually shared by several compute
units. This is the L0 instruction cache in RDNA architectures.
**L0 Vector Cache (RDNA only)**
Size of the level 0 vector data cache. Local to a compute unit. This is the L1
vector cache in CDNA/GCN architectures.
**L0 Scalar Cache (RDNA only)**
Size of the level 0 scalar data cache. Usually shared by several compute
units. This is the L1 scalar cache in CDNA/GCN architectures.
**L0 Instruction Cache (RDNA only)**
Size of the level 0 instruction cache. Usually shared by several compute
units. This is the L1 instruction cache in CDNA/GCN architectures.
**VGPR File**
Size of the Vector General Purpose Register (VGPR) file. It holds data used in
vector instructions.
GPUs with matrix cores also have AccVGPRs, which are Accumulation General
Purpose Vector Registers, used specifically in matrix instructions.
**SGPR File**
Size of the Scalar General Purpose Register (SGPR) file. Holds data used in
scalar instructions.
**GCD**
Graphics Compute Die.


@@ -1,13 +0,0 @@
.. meta::
:description: GPU Architecture reference
:keywords: AMD, GPU, architecture, hardware, CDNA, Instinct, reference
.. _gpu-arch-reference:
GPU architecture reference
##########################
General overview
""""""""""""""""
* :doc:`GPU architecture hardware specifications overview<gpu-arch/gpu-arch-spec-overview>`


@@ -1,241 +0,0 @@
.. meta::
:description: AMD Instinct™ GPU architecture information
:keywords: Instinct, CDNA, GPU, architecture, VRAM, Compute Units, Cache, Registers, LDS, Register File
GPU architecture hardware specifications
########################################
The following table provides an overview over the hardware specifications for the AMD Instinct accelerators.
.. list-table:: AMD Instinct architecture specification table
:header-rows: 1
:name: instinct-arch-spec-table
*
- Model
- Architecture
- LLVM target name
- VRAM
- Compute Units
- Wavefront Size
- LDS
- L3 Cache
- L2 Cache
- L1 Vector Cache
- L1 Scalar Cache
- L1 Instruction Cache
- VGPR File
- SGPR File
*
- MI300X
- CDNA3
- gfx941 or gfx942
- 192 GiB
- 304
- 64
- 64 KiB
- 256 MiB
- 32 MiB
- 32 KiB
- 16 KiB per 2 CUs
- 64 KiB per 2 CUs
- 512 KiB
- 12.5 KiB
*
- MI300A
- CDNA3
- gfx940 or gfx942
- 128 GiB
- 228
- 64
- 64 KiB
- 256 MiB
- 24 MiB
- 32 KiB
- 16 KiB per 2 CUs
- 64 KiB per 2 CUs
- 512 KiB
- 12.5 KiB
*
- MI250X
- CDNA2
- gfx90a
- 128 GiB
- 220 (110 per GCD)
- 64
- 64 KiB
-
- 16 MiB (8 MiB per GCD)
- 16 KiB
- 16 KiB per 2 CUs
- 32 KiB per 2 CUs
- 512 KiB
- 12.5 KiB
*
- MI250
- CDNA2
- gfx90a
- 128 GiB
- 208
- 64
- 64 KiB
-
- 16 MiB (8 MiB per GCD)
- 16 KiB
- 16 KiB per 2 CUs
- 32 KiB per 2 CUs
- 512 KiB
- 12.5 KiB
*
- MI210
- CDNA2
- gfx90a
- 64 GiB
- 104
- 64
- 64 KiB
-
- 8 MiB
- 16 KiB
- 16 KiB per 2 CUs
- 32 KiB per 2 CUs
- 512 KiB
- 12.5 KiB
*
- MI100
- CDNA
- gfx908
- 32 GiB
- 120
- 64
- 64 KiB
-
- 8 MiB
- 16 KiB
- 16 KiB per 3 CUs
- 32 KiB per 3 CUs
- 256 KiB VGPR and 256 KiB AccVGPR
- 12.5 KiB
*
- MI60
- GCN 5.1
- gfx906
- 32 GiB
- 64
- 64
- 64 KiB
-
- 4 MiB
- 16 KiB
- 16 KiB per 3 CUs
- 32 KiB per 3 CUs
- 256 KiB
- 12.5 KiB
*
- MI50 (32GB)
- GCN 5.1
- gfx906
- 32 GiB
- 60
- 64
- 64 KiB
-
- 4 MiB
- 16 KiB
- 16 KiB per 3 CUs
- 32 KiB per 3 CUs
- 256 KiB
- 12.5 KiB
*
- MI50 (16GB)
- GCN 5.1
- gfx906
- 16 GiB
- 60
- 64
- 64 KiB
-
- 4 MiB
- 16 KiB
- 16 KiB per 3 CUs
- 32 KiB per 3 CUs
- 256 KiB
- 12.5 KiB
*
- MI25
- GCN 5.0
- gfx900
- 16 GiB
- 64
- 64
- 64 KiB
-
- 4 MiB
- 16 KiB
- 16 KiB per 3 CUs
- 32 KiB per 3 CUs
- 256 KiB
- 12.5 KiB
*
- MI8
- GCN 3.0
- gfx803
- 4 GiB
- 64
- 64
- 64 KiB
-
- 2 MiB
- 16 KiB
- 16 KiB per 4 CUs
- 32 KiB per 4 CUs
- 256 KiB
- 12.5 KiB
*
- MI6
- GCN 4.0
- gfx803
- 16 GiB
- 36
- 64
- 64 KiB
-
- 2 MiB
- 16 KiB
- 16 KiB per 4 CUs
- 32 KiB per 4 CUs
- 256 KiB
- 12.5 KiB
Glossary
########
For a more detailed explanation refer to the :ref:`specific documents and guides <gpu-arch-documentation>`.
LLVM target name
Argument to pass to clang in `--offload-arch` to compile code for the given architecture.
VRAM
Amount of memory available on the GPU.
Compute Units
Number of compute units on the GPU.
Wavefront Size
Number of work-items that execute in parallel on a single compute unit. This is equivalent to the warp size in HIP.
LDS
The Local Data Share (LDS) is a low-latency, high-bandwidth scratch pad memory. It is local to the compute units, shared by all work-items in a work group. In HIP this is the shared memory, which is shared by all threads in a block.
L3 Cache
Size of the level 3 cache. Shared by all compute units on the same GPU. Caches vector and scalar data and instructions.
L2 Cache
Size of the level 2 cache. Shared by all compute units on the same GCD. Caches vector and scalar data and instructions.
L1 Vector Cache
Size of the level 1 vector data cache. Local to a compute unit. Caches vector data.
L1 Scalar Cache
Size of the level 1 scalar data cache. Usually shared by several compute units. Caches scalar data.
L1 Instruction Cache
Size of the level 1 instruction cache. Usually shared by several compute units.
VGPR File
Size of the Vector General Purpose Register (VGPR) file. Holds data used in vector instructions.
GPUs with matrix cores also have AccVGPRs, which are Accumulation General Purpose Vector Registers, specifically used in matrix instructions.
SGPR File
Size of the Scalar General Purpose Register (SGPR) file. Holds data used in scalar instructions.
GCD
Graphics Compute Die.

View File

@@ -11,29 +11,37 @@
::::{grid} 1 2 2 2
:class-container: rocm-doc-grid
(development-tools)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-development.jpg
:img-alt: Development tools
:padding: 2
* {doc}`HIPIFY <hipify:index>`
* {doc}`ROCdbgapi <rocdbgapi:index>`
* [ROCmCC](./rocmcc.md)
* [ROCm Debug Agent](https://github.com/ROCm/rocr_debug_agent)
* {doc}`ROCm debugger (ROCgdb) <rocgdb:index>`
* {doc}`ROCm Debugger (ROCgdb) <rocgdb:index>`
* {doc}`ROCr Debug Agent <rocr_debug_agent:index>`
:::
(performance-tools)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-performance.jpg
:img-alt: Performance tools
:padding: 2
* [RocBandwidthTest](https://github.com/ROCm/rocm_bandwidth_test)
* {doc}`ROCm Bandwidth Test <rocm_bandwidth_test:index>`
* {doc}`ROCProfiler <rocprofiler:profiler_home_page>`
* [rocprofiler-register](https://github.com/ROCm/rocprofiler-register)
* {doc}`ROCTracer <roctracer:index>`
:::
(system-tools)=
:::{grid-item-card}
:class-card: sd-text-black
:img-top: ../data/reference/banner-system.jpg
@@ -41,11 +49,10 @@
:padding: 2
* {doc}`AMD SMI <amdsmi:index>`
* {doc}`rocminfo <rocminfo:index>`
* {doc}`ROCm Data Center Tool <rdc:index>`
* [ROCm Info](https://github.com/ROCm/rocminfo)
* {doc}`ROCm SMI <rocm_smi_lib:index>`
* {doc}`ROCm Validation Suite <rocmvalidationsuite:index>`
* {doc}`TransferBench <transferbench:index>`
:::
::::

View File

@@ -8,6 +8,7 @@
| Version | Release date |
| ------- | ------------ |
| [6.1.0](https://rocm.docs.amd.com/en/docs-6.1.0/) | Apr 16, 2024 |
| [6.0.2](https://rocm.docs.amd.com/en/docs-6.0.2/) | Jan 31, 2024 |
| [6.0.0](https://rocm.docs.amd.com/en/docs-6.0.0/) | Dec 15, 2023 |
| [5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/) | Oct 13, 2023 |

View File

@@ -8,7 +8,7 @@ subtrees:
- entries:
- file: what-is-rocm.rst
- file: about/release-notes.md
title: Release notes
title: Release highlights
subtrees:
- entries:
- file: about/CHANGELOG.md
@@ -22,14 +22,18 @@ subtrees:
title: ROCm on Linux
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/
title: HIP SDK on Windows
- file: how-to/deep-learning-rocm.md
title: Deep learning frameworks
- caption: Compatibility
entries:
- file: compatibility/compatibility-matrix.rst
title: Compatibility matrix
- url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/reference/system-requirements.html
title: Linux
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
title: Windows
- file: about/compatibility/data-type-support.rst
- file: compatibility/precision-support.rst
title: Precision support
- url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/reference/3rd-party-support-matrix.html
title: Third-party
@@ -40,38 +44,71 @@ subtrees:
title: API libraries
- file: reference/rocm-tools.md
title: Tools
- file: reference/gpu-arch.rst
title: GPU architectures
subtrees:
- entries:
- file: reference/gpu-arch/gpu-arch-spec-overview.rst
title: Hardware specifications overview
- file: reference/gpu-arch-specs.rst
title: Hardware specifications
- caption: How-to
- caption: How to
entries:
- file: how-to/deep-learning-rocm.md
title: Deep learning
- file: how-to/gpu-enabled-mpi.rst
title: Using MPI
- file: how-to/system-debugging.md
title: Debugging
- file: how-to/tuning-guides.md
title: Tuning guides
- file: how-to/rocm-for-ai/index.rst
title: Using ROCm for AI
subtrees:
- entries:
- file: how-to/rocm-for-ai/install.rst
title: Installation
- file: how-to/rocm-for-ai/train-a-model.rst
- file: how-to/rocm-for-ai/hugging-face-models.rst
- file: how-to/rocm-for-ai/deploy-your-model.rst
- file: how-to/llm-fine-tuning-optimization/index.rst
title: Fine-tuning LLMs and inference optimization
subtrees:
- entries:
- file: how-to/llm-fine-tuning-optimization/overview.rst
title: Conceptual overview
- file: how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.rst
subtrees:
- entries:
- file: how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.rst
title: Using a single accelerator
- file: how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference.rst
title: Using multiple accelerators
- file: how-to/llm-fine-tuning-optimization/model-quantization.rst
- file: how-to/llm-fine-tuning-optimization/model-acceleration-libraries.rst
- file: how-to/llm-fine-tuning-optimization/llm-inference-frameworks.rst
- file: how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel.md
title: Optimizing with Composable Kernel
- file: how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.rst
title: Optimizing Triton kernels
- file: how-to/llm-fine-tuning-optimization/profiling-and-debugging.rst
- file: how-to/tuning-guides.rst
title: System optimization
subtrees:
- entries:
- file: how-to/tuning-guides/mi100.md
title: MI100
- file: how-to/tuning-guides/mi200.md
title: MI200
title: AMD Instinct MI200
- file: how-to/tuning-guides/mi100.md
title: AMD Instinct MI100
- file: how-to/tuning-guides/w6000-v620.md
title: RDNA2
title: AMD RDNA 2
- file: how-to/system-debugging.md
- file: how-to/gpu-enabled-mpi.rst
title: Using MPI
- file: conceptual/compiler-topics.md
title: Using compiler features
subtrees:
- entries:
- file: conceptual/using-gpu-sanitizer.md
title: Using AddressSanitizer
- file: conceptual/compiler-disambiguation.md
title: Compiler disambiguation
- file: about/compatibility/openmp.md
title: OpenMP support
- url: https://github.com/amd/rocm-examples
title: GitHub examples
- caption: Conceptual
entries:
- file: conceptual/gpu-arch.md
title: GPU architectures
title: GPU architecture overview
subtrees:
- entries:
- file: conceptual/gpu-arch/mi300.md
@@ -102,16 +139,10 @@ subtrees:
title: White paper
- file: conceptual/gpu-memory.md
title: GPU memory
- file: conceptual/compiler-disambiguation.md
title: Compiler disambiguation
- file: about/compatibility/openmp.md
title: OpenMP
- file: conceptual/file-reorg.md
title: File structure (Linux FHS)
- file: conceptual/gpu-isolation.md
title: GPU isolation techniques
- file: conceptual/using-gpu-sanitizer.md
title: LLVM ASan
- file: conceptual/cmake-packages.rst
title: Using CMake
- file: conceptual/More-about-how-ROCm-uses-PCIe-Atomics.rst
@@ -137,4 +168,3 @@ subtrees:
title: Provide feedback
- file: about/license.md
title: ROCm license

View File

@@ -1 +1,2 @@
rocm-docs-core==0.35.1
rocm-docs-core==1.8.0
sphinx-reredirects

View File

@@ -1,114 +1,106 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements.in
#
accessible-pygments==0.0.3
accessible-pygments==0.0.5
# via pydata-sphinx-theme
alabaster==0.7.13
alabaster==1.0.0
# via sphinx
babel==2.11.0
babel==2.16.0
# via
# pydata-sphinx-theme
# sphinx
beautifulsoup4==4.11.2
beautifulsoup4==4.12.3
# via pydata-sphinx-theme
breathe==4.34.0
breathe==4.35.0
# via rocm-docs-core
certifi==2023.7.22
certifi==2024.8.30
# via requests
cffi==1.15.1
cffi==1.17.1
# via
# cryptography
# pynacl
charset-normalizer==2.1.1
charset-normalizer==3.3.2
# via requests
click==8.1.3
click==8.1.7
# via sphinx-external-toc
cryptography==42.0.4
cryptography==43.0.1
# via pyjwt
deprecated==1.2.13
deprecated==1.2.14
# via pygithub
docutils==0.19
docutils==0.21.2
# via
# breathe
# myst-parser
# pydata-sphinx-theme
# sphinx
fastjsonschema==2.16.3
fastjsonschema==2.20.0
# via rocm-docs-core
gitdb==4.0.10
gitdb==4.0.11
# via gitpython
gitpython==3.1.41
gitpython==3.1.43
# via rocm-docs-core
idna==3.4
idna==3.10
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==7.0.0
# via sphinx
importlib-resources==6.1.1
# via rocm-docs-core
jinja2==3.1.3
jinja2==3.1.4
# via
# myst-parser
# sphinx
markdown-it-py==2.2.0
markdown-it-py==3.0.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.2
markupsafe==2.1.5
# via jinja2
mdit-py-plugins==0.3.4
mdit-py-plugins==0.4.2
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-parser==1.0.0
myst-parser==4.0.0
# via rocm-docs-core
packaging==23.0
packaging==24.1
# via
# pydata-sphinx-theme
# sphinx
pycparser==2.21
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.13.3
pydata-sphinx-theme==0.15.4
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==1.58.1
pygithub==2.4.0
# via rocm-docs-core
pygments==2.15.0
pygments==2.18.0
# via
# accessible-pygments
# pydata-sphinx-theme
# sphinx
pyjwt[crypto]==2.6.0
# via
# pygithub
# pyjwt
pyjwt[crypto]==2.9.0
# via pygithub
pynacl==1.5.0
# via pygithub
pytz==2022.7.1
# via babel
pyyaml==6.0
pyyaml==6.0.2
# via
# myst-parser
# rocm-docs-core
# sphinx-external-toc
requests==2.31.0
requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core==0.35.1
rocm-docs-core==1.8.0
# via -r requirements.in
smmap==5.0.0
smmap==5.0.1
# via gitdb
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.4
soupsieve==2.6
# via beautifulsoup4
sphinx==5.3.0
sphinx==8.0.2
# via
# breathe
# myst-parser
@@ -119,35 +111,40 @@ sphinx==5.3.0
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
sphinx-book-theme==1.0.1
# sphinx-reredirects
sphinx-book-theme==1.1.3
# via rocm-docs-core
sphinx-copybutton==0.5.1
sphinx-copybutton==0.5.2
# via rocm-docs-core
sphinx-design==0.4.1
sphinx-design==0.6.1
# via rocm-docs-core
sphinx-external-toc==0.3.1
sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-notfound-page==0.8.3
sphinx-notfound-page==1.0.4
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.4
sphinx-reredirects==0.1.5
# via -r requirements.in
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-devhelp==1.0.2
sphinxcontrib-devhelp==2.0.0
# via sphinx
sphinxcontrib-htmlhelp==2.0.1
sphinxcontrib-htmlhelp==2.1.0
# via sphinx
sphinxcontrib-jsmath==1.0.1
# via sphinx
sphinxcontrib-qthelp==1.0.3
sphinxcontrib-qthelp==2.0.0
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
typing-extensions==4.5.0
# via pydata-sphinx-theme
urllib3==1.26.13
# via requests
wrapt==1.14.1
# via deprecated
zipp==3.17.0
tomli==2.0.1
# via sphinx
typing-extensions==4.12.2
# via
# importlib-metadata
# importlib-resources
# pydata-sphinx-theme
# pygithub
urllib3==2.2.3
# via
# pygithub
# requests
wrapt==1.16.0
# via deprecated

View File

@@ -0,0 +1,6 @@
/* Adds container for big tables, used for Compatibility Matrix */
.format-big-table {
white-space: nowrap;
}

View File

@@ -1,6 +1,6 @@
.. meta::
:description: What is ROCm
:keywords: ROCm projects, introduction, ROCm, AMD, runtimes, compilers, tools, libraries, API
:keywords: ROCm components, ROCm projects, introduction, ROCm, AMD, runtimes, compilers, tools, libraries, API
***********************************************************
What is ROCm?
@@ -10,8 +10,13 @@ ROCm is an open-source stack, composed primarily of open-source software, design
graphics processing unit (GPU) computation. ROCm consists of a collection of drivers, development
tools, and APIs that enable GPU programming from low-level kernel to end-user applications.
.. image:: data/rocm-software-stack-6_1_0.jpg
:width: 800
:alt: AMD's ROCm software stack and neighboring technologies.
:align: center
ROCm is powered by
`Heterogeneous-computing Interface for Portability (HIP) <https://rocm.docs.amd.com/projects/HIP/en/latest/index.html>`_;
:doc:`Heterogeneous-computing Interface for Portability (HIP) <hip:index>`;
it supports programming models, such as OpenMP and OpenCL, and includes all necessary open
source software compilers, debuggers, and libraries. It's fully integrated into machine learning (ML)
frameworks, such as PyTorch and TensorFlow.
@@ -20,63 +25,109 @@ frameworks, such as PyTorch and TensorFlow.
If you're using Radeon GPUs, refer to the
:doc:`Radeon-specific ROCm documentation <radeon:index>`.
ROCm project list
ROCm components
===============================================
ROCm consists of the following projects. For information on the license associated with each project,
ROCm consists of the following components. For information on the license associated with each component,
see :doc:`ROCm licensing <./about/license>`.
.. csv-table::
:header: "Project", "Type", "Description"
Libraries
-----------------------------------------------
"`AMD Compute Language Runtimes (CLR) <https://github.com/ROCm/clr>`_", "Runtime", "Contains source code for AMD's compute languages runtimes: :doc:`HIP <hip:index>` and OpenCL"
":doc:`AMD SMI <amdsmi:index>`", "Tool", "C library for Linux that provides a user space interface for applications to monitor and control AMD devices"
"`AOMP <https://github.com/ROCm/aomp/>`_", "Compiler", "Scripted build of `LLVM <https://github.com/ROCm/llvm-project>`_ and supporting software"
":doc:`Composable Kernel <composable_kernel:index>`", "Library (AI/ML)", "Provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures"
"`FLANG <https://github.com/ROCm/flang/>`_", "Compiler", "An out-of-tree Fortran compiler targeting LLVM"
"`half <https://github.com/ROCm/half/>`_", "Library (math)", "C++ header-only library that provides an IEEE 754 conformant, 16-bit half-precision floating-point type, along with corresponding arithmetic operators, type conversions, and common mathematical functions"
":doc:`HIP <hip:index>`", "Runtime", AMD's GPU programming language extension and the GPU runtime"
":doc:`hipBLAS <hipblas:index>`", "Library (math)", "BLAS-marshaling library that supports `rocBLAS <https://rocm.docs.amd.com/projects/rocBLAS/en/latest/>`_ and cuBLAS backends"
":doc:`hipBLASLt <hipblaslt:index>`", "Library (math)", "Provides general matrix-matrix operations with a flexible API and extends functionalities beyond traditional BLAS library"
"`hipCC <https://github.com/ROCm/HIPCC>`_ ", "Compiler", "Compiler driver utility that calls Clang or NVCC and passes the appropriate include and library options for the target compiler and HIP infrastructure"
":doc:`hipCUB <hipcub:index>`", "Library (C++ primitive)", "Thin header-only wrapper library on top of `rocPRIM <https://rocm.docs.amd.com/projects/rocPRIM/en/latest/>`_ or CUB that allows project porting using the CUB library to the HIP layer"
":doc:`hipFFT <hipfft:index>`", "Library (math)", "Fast Fourier transforms (FFT)-marshalling library that supports rocFFT or cuFFT backends"
":doc:`hipfort <hipfort:index>`", "Library (math)", "Fortran interface library for accessing GPU Kernels"
":doc:`HIPIFY <hipify:index>`", "Compiler", "Translates CUDA source code into portable HIP C++"
":doc:`hipRAND <hiprand:index>`", "Library (math)", "Ports CUDA applications that use the cuRAND library into the HIP layer"
":doc:`hipSOLVER <hipsolver:index>`", "Library (math)", "An LAPACK-marshalling library that supports `rocSOLVER <https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/>`_ and cuSOLVER backends"
":doc:`hipSPARSE <hipsparse:index>`", "Library (math)", "SPARSE-marshalling library that supports `rocSPARSE <https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/>`_ and cuSPARSE backends"
":doc:`hipSPARSELt <hipsparselt:index>`", "Library (math)", "SPARSE-marshalling library with multiple supported backends"
":doc:`hipTensor <hiptensor:index>`", "Library (C++ primitive)", "AMD's C++ library for accelerating tensor primitives based on the composable kernel library"
"`LLVM (amdclang) <https://github.com/ROCm/llvm-project>`_ ", "Compiler", "Toolkit for the construction of highly optimized compilers, optimizers, and run-time environments"
":doc:`MIGraphX <amdmigraphx:index>`", "Library (AI/ML)", "Graph inference engine that accelerates machine learning model inference"
":doc:`MIOpen <miopen:index>`", "Library (AI/ML)", "An open source deep-learning library"
":doc:`MIVisionX <mivisionx:doxygen/html/index>`", "Library (AI/ML)", "Set of comprehensive computer vision and machine learning libraries, utilities, and applications"
"`Radeon Compute Profiler (RCP) <https://github.com/GPUOpen-Tools/radeon_compute_profiler/>`_ ", "Tool", "Performance analysis tool that gathers data from the API run-time and GPU for OpenCL and ROCm/HSA applications"
":doc:`RCCL <rccl:index>`", "Library (communication)", "Standalone library that provides multi-GPU and multi-node collective communication primitives"
":doc:`rocAL <rocal:index>`", "Library (AI/ML)", "An augmentation library designed to decode and process images and videos"
":doc:`rocALUTION <rocalution:index>`", "Library (math)", "Sparse linear algebra library for exploring fine-grained parallelism on ROCm runtime and toolchains"
"`RocBandwidthTest <https://github.com/ROCm/rocm_bandwidth_test/>`_ ", "Tool", "Captures the performance characteristics of buffer copying and kernel read/write operations"
":doc:`rocBLAS <rocblas:index>`", "Library (math)", "BLAS implementation (in the HIP programming language) on the ROCm runtime and toolchains"
":doc:`rocFFT <rocfft:index>`", "Library (math)", "Software library for computing fast Fourier transforms (FFTs) written in HIP"
":doc:`ROCmCC <./reference/rocmcc>`", "Tool", "Clang/LLVM-based compiler"
"`ROCm CMake <https://github.com/ROCm/rocm-cmake>`_ ", "Tool", "Collection of CMake modules for common build and development tasks"
":doc:`ROCm Data Center Tool <rdc:index>`", "Tool", "Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments"
"`ROCm Debug Agent (ROCdebug-agent) <https://github.com/ROCm/rocr_debug_agent/>`_ ", "Tool", "Prints the state of all AMD GPU wavefronts that caused a queue error by sending a SIGQUIT signal to the process while the program is running"
":doc:`ROCm debugger (ROCgdb) <rocgdb:index>`", "Tool", "Source-level debugger for Linux, based on the GNU Debugger (GDB)"
":doc:`ROCdbgapi <rocdbgapi:index>`", "Tool", "ROCm debugger API library"
"`rocminfo <https://github.com/ROCm/rocminfo/>`_ ", "Tool", "Reports system information"
":doc:`ROCm Performance Primitives (RPP) <rpp:index>`", "Library (AI/ML)", "Comprehensive high-performance computer vision library for AMD processors with HIP/OpenCL/CPU back-ends"
":doc:`ROCm SMI <rocm_smi_lib:index>`", "Tool", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"
":doc:`ROCm Validation Suite <rocmvalidationsuite:index>`", "Tool", "Detects and troubleshoots common problems affecting AMD GPUs running in a high-performance computing environment"
":doc:`rocPRIM <rocprim:index>`", "Library (C++ primitive)", "Header-only library for HIP parallel primitives"
":doc:`ROCProfiler <rocprofiler:profiler_home_page>`", "Tool", "Profiling tool for HIP applications"
":doc:`rocRAND <rocrand:index>`", "Library (math)", "Provides functions that generate pseudorandom and quasirandom numbers"
"`ROCR-Runtime <https://github.com/ROCm/ROCR-Runtime/>`_ ", "Runtime", "User-mode API interfaces and libraries necessary for host applications to launch compute kernels on available HSA ROCm kernel agents"
":doc:`rocSOLVER <rocsolver:index>`", "Library (math)", "An implementation of LAPACK routines on ROCm software, implemented in the HIP programming language and optimized for AMD's latest discrete GPUs"
":doc:`rocSPARSE <rocsparse:index>`", "Library (math)", "Exposes a common interface that provides BLAS for sparse computation implemented on ROCm runtime and toolchains (in the HIP programming language)"
":doc:`rocThrust <rocthrust:index>`", "Library (C++ primitive)", "Parallel algorithm library"
":doc:`ROCTracer <roctracer:index>`", "Tool", "Intercepts runtime API calls and traces asynchronous activity"
":doc:`rocWMMA <rocwmma:index>`", "Library (math)", "C++ library for accelerating mixed-precision matrix multiply-accumulate (MMA) operations"
"`Tensile <https://github.com/ROCm/Tensile>`_ ", "Library (math)", "Creates benchmark-driven backend libraries for GEMMs, GEMM-like problems, and general N-dimensional tensor contractions"
":doc:`TransferBench <transferbench:index>`", "Tool", "Utility to benchmark simultaneous transfers between user-specified devices (CPUs/GPUs)"
Machine Learning & Computer Vision
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. csv-table::
:header: "Component", "Description"
":doc:`Composable Kernel <composable_kernel:index>`", "Provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures"
":doc:`MIGraphX <amdmigraphx:index>`", "Graph inference engine that accelerates machine learning model inference"
":doc:`MIOpen <miopen:index>`", "An open source deep-learning library"
":doc:`MIVisionX <mivisionx:doxygen/html/index>`", "Set of comprehensive computer vision and machine learning libraries, utilities, and applications"
":doc:`rocAL <rocal:index>`", "An augmentation library designed to decode and process images and videos"
":doc:`rocDecode <rocdecode:index>`", "High-performance SDK for access to video decoding features on AMD GPUs"
":doc:`ROCm Performance Primitives (RPP) <rpp:index>`", "Comprehensive high-performance computer vision library for AMD processors with HIP/OpenCL/CPU back-ends"
Communication
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. csv-table::
:header: "Component", "Description"
":doc:`RCCL <rccl:index>`", "Standalone library that provides multi-GPU and multi-node collective communication primitives"
Math
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. csv-table::
:header: "Component", "Description"
"`half <https://github.com/ROCm/half/>`_", "C++ header-only library that provides an IEEE 754 conformant, 16-bit half-precision floating-point type, along with corresponding arithmetic operators, type conversions, and common mathematical functions"
":doc:`hipBLAS <hipblas:index>`", "BLAS-marshaling library that supports :doc:`rocBLAS <rocblas:index>` and cuBLAS backends"
":doc:`hipBLASLt <hipblaslt:index>`", "Provides general matrix-matrix operations with a flexible API and extends functionalities beyond traditional BLAS library"
":doc:`hipFFT <hipfft:index>`", "Fast Fourier transforms (FFT)-marshalling library that supports rocFFT or cuFFT backends"
":doc:`hipfort <hipfort:index>`", "Fortran interface library for accessing GPU Kernels"
":doc:`hipRAND <hiprand:index>`", "Ports CUDA applications that use the cuRAND library into the HIP layer"
":doc:`hipSOLVER <hipsolver:index>`", "An LAPACK-marshalling library that supports :doc:`rocSOLVER <rocsolver:index>` and cuSOLVER backends"
":doc:`hipSPARSE <hipsparse:index>`", "SPARSE-marshalling library that supports :doc:`rocSPARSE <rocsparse:index>` and cuSPARSE backends"
":doc:`hipSPARSELt <hipsparselt:index>`", "SPARSE-marshalling library with multiple supported backends"
":doc:`rocALUTION <rocalution:index>`", "Sparse linear algebra library for exploring fine-grained parallelism on ROCm runtime and toolchains"
":doc:`rocBLAS <rocblas:index>`", "BLAS implementation (in the HIP programming language) on the ROCm runtime and toolchains"
":doc:`rocFFT <rocfft:index>`", "Software library for computing fast Fourier transforms (FFTs) written in HIP"
":doc:`rocRAND <rocrand:index>`", "Provides functions that generate pseudorandom and quasirandom numbers"
":doc:`rocSOLVER <rocsolver:index>`", "An implementation of LAPACK routines on ROCm software, implemented in the HIP programming language and optimized for AMD's latest discrete GPUs"
":doc:`rocSPARSE <rocsparse:index>`", "Exposes a common interface that provides BLAS for sparse computation implemented on ROCm runtime and toolchains (in the HIP programming language)"
":doc:`rocWMMA <rocwmma:index>`", "C++ library for accelerating mixed-precision matrix multiply-accumulate (MMA) operations"
"`Tensile <https://github.com/ROCm/Tensile>`_ ", "Creates benchmark-driven backend libraries for GEMMs, GEMM-like problems, and general N-dimensional tensor contractions"
Primitives
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. csv-table::
:header: "Component", "Description"
":doc:`hipCUB <hipcub:index>`", "Thin header-only wrapper library on top of :doc:`rocPRIM <rocprim:index>` or CUB that allows project porting using the CUB library to the HIP layer"
":doc:`hipTensor <hiptensor:index>`", "AMD's C++ library for accelerating tensor primitives based on the composable kernel library"
":doc:`rocPRIM <rocprim:index>`", "Header-only library for HIP parallel primitives"
":doc:`rocThrust <rocthrust:index>`", "Parallel algorithm library"
Tools
-----------------------------------------------
.. csv-table::
:header: "Component", "Description"
":doc:`AMD SMI <amdsmi:index>`", "C library for Linux that provides a user space interface for applications to monitor and control AMD devices"
":doc:`HIPIFY <hipify:index>`", "Translates CUDA source code into portable HIP C++"
":doc:`ROCdbgapi <rocdbgapi:index>`", "ROCm debugger API library"
":doc:`ROCm compilers <./reference/rocmcc>`", "Clang/LLVM-based compiler"
":doc:`rocminfo <rocminfo:index>`", "Reports system information"
":doc:`ROCProfiler <rocprofiler:index>`", "Profiling tool for HIP applications"
":doc:`ROCTracer <roctracer:index>`", "Intercepts runtime API calls and traces asynchronous activity"
":doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`", "Captures the performance characteristics of buffer copying and kernel read/write operations"
":doc:`ROCm CMake <rocmcmakebuildtools:index>`", "Collection of CMake modules for common build and development tasks"
":doc:`ROCm Data Center Tool <rdc:index>`", "Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments"
":doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`", "Source-level debugger for Linux, based on the GNU Debugger (GDB)"
":doc:`ROCm SMI <rocm_smi_lib:index>`", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"
":doc:`ROCm Validation Suite <rocmvalidationsuite:index>`", "Detects and troubleshoots common problems affecting AMD GPUs running in a high-performance computing environment"
":doc:`ROCr Debug Agent <rocr_debug_agent:index>`", "Prints the state of all AMD GPU wavefronts that caused a queue error by sending a SIGQUIT signal to the process while the program is running"
Compilers
-----------------------------------------------
.. csv-table::
:header: "Component", "Description"
"`FLANG <https://github.com/ROCm/flang/>`_", "An out-of-tree Fortran compiler targeting LLVM"
"`hipCC <https://github.com/ROCm/HIPCC>`_ ", "Compiler driver utility that calls Clang or NVCC and passes the appropriate include and library options for the target compiler and HIP infrastructure"
"`LLVM (amdclang) <https://github.com/ROCm/llvm-project>`_ ", "Toolkit for the construction of highly optimized compilers, optimizers, and runtime environments"
Runtimes
-----------------------------------------------
.. csv-table::
:header: "Component", "Description"
":doc:`AMD Common Language Runtime (CLR) <hip:understand/amd_clr>`", "Contains source code for AMD's common language runtimes: HIP and OpenCL"
":doc:`HIP <hip:index>`", "AMD's GPU programming language extension and the GPU runtime"
":doc:`ROCR-Runtime <rocr-runtime:index>`", "User-mode API interfaces and libraries necessary for host applications to launch compute kernels on available HSA ROCm kernel agents"

View File

@@ -13,21 +13,23 @@
## Updating the changelog
> IMPORTANT: It is key to update the template Markdown files in `tools/autotag/templates/rocm_changes` (eg: `5.6.0.md`) and not the `CHANGELOG.md` itself to ensure that updates are not overwritten by the autotag script. The template should only have content from changelogs that are not included by the script to avoid duplicating data.
* Add or update the release specific notes in `tools/autotag/templates/rocm_changes`
* Ensure that all the repositories have their release-specific branch with the updated changelogs.
* Ensure that all the repositories have their release-specific branch with the updated changelogs
* Run this for 5.6.0 (change for whatever version you require)
* `GITHUB_ACCESS_TOKEN=my_token_here`
To generate the changelog from 5.0.0 up to and including 6.0.1:
To generate the changelog from 5.0.0 up to and including 6.1.0:
```sh
python3 tag_script.py -t $GITHUB_ACCESS_TOKEN --no-release --no-pulls --do-previous --compile_file ../../CHANGELOG.md --branch release/rocm-rel-6.0 6.0.1
python3 tag_script.py -t $GITHUB_ACCESS_TOKEN --no-release --no-pulls --do-previous --compile_file ../../CHANGELOG.md --branch release/rocm-rel-6.1 6.1.0
```
To generate the changelog only for 6.0.1:
To generate the changelog only for 6.1.0:
```sh
python3 tag_script.py -t $GITHUB_ACCESS_TOKEN --no-release --no-pulls --compile_file ../../CHANGELOG.md --branch release/rocm-rel-6.0 6.0.1
python3 tag_script.py -t $GITHUB_ACCESS_TOKEN --no-release --no-pulls --compile_file ../../CHANGELOG.md --branch release/rocm-rel-6.1 6.1.0
```
### Notes

View File

@@ -84,11 +84,9 @@ class TaggingArgs(argparse.Namespace):
"MIOpenGEMM",
"MIOpenKernels",
"MIOpenTensile",
"ROCmValidationSuite",
"half",
"hipFORT",
"rccl-rdma-sharp-plugins",
"MLSEQA_TestRepo",
"half",
"rccl-rdma-sharp-plugins",
]
return defaults + (self._exclude if self._exclude is not None else [])
@@ -236,10 +234,16 @@ def run_tagging():
# Find all the math libraries and their remotes.
included_names = [
"rocm-cmake",
"MIOpen",
"AMDMIGraphX",
"rocprofiler"
"HIPIFY", #
"MIOpen",
"MIVisionX",
"ROCmValidationSuite", #
"composable_kernel",
"hipfort",
"rocDecode",
"rocm-cmake",
"rpp",
]
included_groups = [
"mathlibs"

View File

@@ -27,12 +27,12 @@ This page contains the release notes for AMD ROCm Software.
{%- set rocm_changes = "./rocm_changes/" ~ version ~ ".md" %}
{% include rocm_changes ignore missing %}
### Library changes in ROCM {{version}}
### Library changes in ROCm {{version}}
| Library | Version |
|---------|---------|
{%- for lib_name, lib in release.libraries | dictsort %}
{%- if rocm_ver_by_lib_ver[lib_name][lib.lib_version] == version and lib.lib_version %}
{%- if rocm_ver_by_lib_ver[lib_name][lib.lib_version] == version and (prev_lib_ver[lib_name][lib.lib_version] | default([]) | length > 0) and lib.lib_version %}
| {{ lib_name }} | {{prev_lib_ver[lib_name][lib.lib_version]}} ⇒ [{{ lib.lib_version }}]({{ lib.release_url }}) |
{%- elif lib.lib_version %}
| {{ lib_name }} | [{{ lib.lib_version }}]({{ lib.release_url }}) |

View File

@@ -41,7 +41,7 @@ kernels found by setting the environment variable ROCBLAS_TENSILE_GEMM_OVERRIDE_
points to the stored file.
For more details, refer to the
[rocBLAS Programmer's Guide](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/Programmers_Guide.html#rocblas-gemm-tune).
[rocBLAS Programmer's Guide](https://rocm.docs.amd.com/projects/rocBLAS/en/docs-5.7.1/Programmers_Guide.html).
#### HIP 5.7.1 (for ROCm 5.7.1)

View File

@@ -281,7 +281,7 @@ Note: These complex operations are equivalent to corresponding types/functions o
* `HIP_ROCclr`
* NVIDIA platform
* `HIP_PLATFORM_NVCC`
* The [hcc_detail](https://github.com/ROCm/clr/tree/1949b1621a802ffb1492616adbae6154bfbe64ef/hipamd/include/hip/hcc_detail) and [nvcc_detail](https://github.com/ROCm/clr/tree/1949b1621a802ffb1492616adbae6154bfbe64ef/hipamd/include/hips/nvcc_detail) directories in the clr repository are removed.
* The `hcc_detail` and `nvcc_detail` directories in the clr repository are removed.
* Deprecated gcnArch is removed from hip device struct `hipDeviceProp_t`.
* Deprecated `enum hipMemoryType memoryType;` is removed from HIP struct `hipPointerAttribute_t` union.

View File

@@ -0,0 +1,319 @@
The ROCm™ 6.1 release consists of new features and fixes to improve the stability and
performance of AMD Instinct™ MI300 GPU applications. Notably, we've added:
* Full support for Ubuntu 22.04.4.
* **rocDecode**, a new ROCm component that provides high-performance video decode support for
AMD GPUs. With rocDecode, you can decode compressed video streams while keeping the resulting
YUV frames in video memory. With decoded frames in video memory, you can run video
post-processing using ROCm HIP, avoiding unnecessary data copies via the PCIe bus.
To learn more, refer to the rocDecode
[documentation](https://rocm.docs.amd.com/projects/rocDecode/en/latest/).
### OS and GPU support changes
ROCm 6.1 adds the following operating system support:
* MI300A: Ubuntu 22.04.4 and RHEL 9.3
* MI300X: Ubuntu 22.04.4
Future releases will add additional operating systems to match our general offering. For older
generations of supported AMD Instinct products, we've added Ubuntu 22.04.4 support.
```{tip}
To view the complete list of supported GPUs and operating systems, refer to the system requirements
page for
[Linux](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html)
and
[Windows](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html).
```
### Installation packages
This release includes a new set of packages for every module (all libraries and binaries default to
`DT_RPATH`). Package names have the suffix `rpath`; for example, the `rpath` variant of `rocminfo` is
`rocminfo-rpath`.
```{warning}
The new `rpath` packages will conflict with the default packages; they are meant to be used only in
environments where legacy `DT_RPATH` is the preferred form of linking (instead of `DT_RUNPATH`). We
do **not** recommend trying to install both sets of packages.
```
#### AMD SMI
AMD SMI for ROCm 6.1.0
##### Additions
* **Added Monitor command**. This lets users customize which GPU metrics to capture,
collect, and observe. Output is provided in a table view. This aligns more closely with ROCm SMI's `rocm-smi`
(no-argument) output, and lets you tailor the view to the data that is most helpful for your use case.
* **Integrated ESMI Tool**. You can get CPU metrics and telemetry through our API and CLI tools.
You can get this information using the `amd-smi static` and `amd-smi metric` commands. This is only
available for limited target processors. As of ROCm 6.0.2, this is listed as:
* AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh
* AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh
* **Added support for new metrics: VCN, JPEG engines, and PCIe errors**. Using the AMD SMI
tool, you can retrieve VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or
`amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs
(with 4 separate VCN engine activities); older ASICs report `MM_ACTIVITY` with UVD/VCN engine activity
(average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where a device can support up
to 32 JPEG engine activities. See our documentation for a more in-depth understanding of these new
fields.
* **Added AMDSMI Tool version**. AMD SMI will report *three versions*: AMDSMI Tool, AMDSMI
Library version, and ROCm version.
The AMDSMI Tool version is the CLI/tool version number with commit ID appended after the `+` sign.
The AMDSMI Library version is the library package version number. The ROCm version is the system's
installed ROCm version; if ROCm is not installed, it reports N/A.
* **Added XGMI table**. Displays XGMI information for AMD GPU devices in a table format. This is
only available on supported ASICs (e.g., MI300). Here, users can view XGMI or PCIe read/write
accumulated data transfer size (in kilobytes).
* **Added units of measure to JSON output**. We added units of measure to the JSON/CSV output of the
`amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands.
##### Changes
* **Topology is now left-aligned, with the BDF of each device listed in every table's rows/columns**.
We provide each device's BDF in every table's rows/columns, and left-align the data. We want AMD
SMI Tool output to be easy to understand and digest. Having to scroll up to find this information
made the output difficult to follow, especially for systems that have many devices associated with one ASIC.
##### Fixes
* **Fix for RDNA3/RDNA2/MI100 'amdsmi_get_gpu_pci_bandwidth()' in 'frequencies_read' tests**.
For devices that do not report (e.g., RDNA3/RDNA2/MI100), we have added checks to confirm that
these devices return `AMDSMI_STATUS_NOT_SUPPORTED`. Otherwise, tests now display a return
string.
* **Fix for devices that have an older PyYAML installed**. For platforms that are identified as having
an older PyYAML version or pip, we now manually update both pip and PyYAML as needed. This
fix impacts the following CLI commands:
* `amd-smi list`
* `amd-smi static`
* `amd-smi firmware`
* `amd-smi metric`
* `amd-smi topology`
* **Fix for crash when user is not a member of video/render groups**. AMD SMI now uses the
same mutex handler for devices as ROCm SMI. This helps avoid crashes when DRM/device data are
inaccessible to the logged-in user.
##### Known issues
* There is an `AttributeError` while running `amd-smi process --csv`
* GPU reset results in an "*Unable to reset non-amd GPU*" error
* Querying bad pages results in a "*ValueError: NULL pointer access*" error
* Some RDNA3 cards may enumerate to `Slot type = UNKNOWN`
#### HIP
HIP 6.1 for ROCm 6.1
##### Additions
* New environment variable, `HIP_LAUNCH_BLOCKING`, which is used to serialize kernel
execution.
* The default value is 0 (disable): the kernel runs normally, as defined in the queue
* When set to 1 (enable): the HIP runtime serializes the kernel enqueue and behaves the same as
`AMD_SERIALIZE_KERNEL`
* Added HIPRTC support for hip headers `driver_types`, `math_functions`, `library_types`,
`math_functions`, `hip_math_constants`, `channel_descriptor`, `device_functions`, `hip_complex`,
`surface_types`, `texture_types`
##### Changes
* HIPRTC now assumes WGP mode for gfx10+. You can enable CU mode by passing `-mcumode` to the
compile options from `hiprtcCompileProgram`.
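As a minimal sketch of how CU mode can be requested through HIPRTC (the kernel source, program name, and error handling here are illustrative, not taken from the release notes):

```cpp
#include <hip/hiprtc.h>

// Trivial kernel source used only to demonstrate passing compile options.
static const char* kSrc = "extern \"C\" __global__ void noop() {}";

int main() {
    hiprtcProgram prog;
    hiprtcCreateProgram(&prog, kSrc, "noop.cu", 0, nullptr, nullptr);

    // Request CU mode on gfx10+; WGP mode is the default as of ROCm 6.1.
    const char* options[] = {"-mcumode"};
    hiprtcResult rc = hiprtcCompileProgram(prog, 1, options);

    size_t logSize = 0;
    hiprtcGetProgramLogSize(prog, &logSize);
    // ... inspect the log and retrieve the code object with hiprtcGetCode() ...

    hiprtcDestroyProgram(&prog);
    return rc == HIPRTC_SUCCESS ? 0 : 1;
}
```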
##### Fixes
* HIP complex vector type multiplication and division operations.
On an AMD platform, some duplicated complex operators are removed to avoid compilation failures.
In HIP, `hipFloatComplex` and `hipDoubleComplex` are defined as complex datatypes:
* `typedef float2 hipFloatComplex`
* `typedef double2 hipDoubleComplex`
Any application that uses complex multiplication and division operations must replace `*` and `/`
operators with the following:
* `hipCmulf() and hipCdivf() for hipFloatComplex`
* `hipCmul() and hipCdiv() for hipDoubleComplex`
Note that these complex operations are equivalent to corresponding types/functions on an NVIDIA
platform.
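For example, a small host-side sketch (buildable with `hipcc`; the file name and values are illustrative) of the replacement, using the helpers from `hip/hip_complex.h`:

```cpp
#include <hip/hip_complex.h>
#include <cstdio>

int main() {
    hipFloatComplex a = make_hipFloatComplex(1.0f, 2.0f);
    hipFloatComplex b = make_hipFloatComplex(3.0f, -4.0f);

    // Use hipCmulf()/hipCdivf() instead of the removed * and / operators.
    hipFloatComplex prod = hipCmulf(a, b);
    hipFloatComplex quot = hipCdivf(a, b);

    printf("prod = %f + %fi\n", hipCrealf(prod), hipCimagf(prod));
    printf("quot = %f + %fi\n", hipCrealf(quot), hipCimagf(quot));
    return 0;
}
```

The same helpers work in device code, so the replacement applies to kernels as well as host code.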
#### HIPIFY
HIPIFY for ROCm 6.1.0
##### Additions
* CUDA 12.3.2 support
* cuDNN 8.9.7 support
* LLVM 17.0.6 support
* Full `hipSOLVER` support
* Full `rocSPARSE` support
* New option: `--amap`, which will hipify as much as possible, ignoring `--default-preprocessor`
behavior
##### Fixes
* Code blocks skipped by the preprocessor are no longer hipified under the `--default-preprocessor`
option
#### ROCm Compiler
ROCm Compiler for ROCm 6.1.0
##### Additions
* Compiler now generates `.uniform_work_group_size` and records it in the metadata. It indicates if the
kernel requires that each dimension of global size is a multiple of the corresponding dimension of
work-group size. A value of 1 is true, and 0 is false. This metadata is only provided when the value is
1.
* Added the `rocm-llvm-docs` package.
* Added ROCm Device-Libs, ROCm Compiler Support, and hipCC within the `llvm-project/amd`
subdirectory to AMD's fork of the LLVM project.
* Added support for C++ Parallel Algorithm Offload via HIP (HIPSTDPAR), which allows parallel
algorithms to run on the GPU.
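A minimal HIPSTDPAR sketch: a standard C++17 parallel algorithm that can be offloaded to the GPU when built with the support described above. The compile command in the comment is an assumption and depends on your toolchain and target architecture.

```cpp
// Build sketch (flags are illustrative):
//   amdclang++ -std=c++17 --hipstdpar --offload-arch=gfx90a saxpy.cpp
#include <algorithm>
#include <execution>
#include <vector>
#include <cstdio>

int main() {
    std::vector<float> x(1 << 20, 1.0f), y(1 << 20, 2.0f);
    const float a = 3.0f;

    // Standard parallel algorithm; with HIPSTDPAR enabled this loop is a
    // candidate for GPU offload.
    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
                   y.begin(), [=](float xi, float yi) { return a * xi + yi; });

    printf("y[0] = %f\n", y[0]);  // expect 5.0
    return 0;
}
```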
##### Changes
* `rocm-clang-ocl` is now an optional package and will require manual installation.
##### Deprecations
* hipCC adds `-mllvm -amdgpu-early-inline-all=true` and `-mllvm -amdgpu-function-calls=false` by
default to compiler invocations. These flags will be removed from hipCC in a future ROCm release.
##### Fixes
AddressSanitizer (ASan):
* Added the `sanitized_padded_global` LLVM IR attribute to identify sanitizer-instrumented globals.
* For ASan-instrumented globals, two symbols are now emitted: one with the actual size and the other with the
instrumented size.
[On GitHub](https://github.com/ROCm/ROCm/issues/2551)
##### Known issues
* Due to an issue within the `amd-llvm` compiler shipping with ROCm 6.1, HIPSTDPAR's interposition mode, which is enabled by `--hipstdpar-interpose-alloc`, is currently broken.
The temporary workaround is to use the upstream LLVM 18 (or newer) compiler. This issue will be addressed in a future ROCm release.
#### ROCm Data Center (RDC)
RDC for ROCm 6.1.0
##### Changes
* Added `--address` flag to rdcd
* Upgraded from C++11 to C++17
* Upgraded gRPC
#### ROCDebugger (ROCgdb)
ROCgdb for ROCm 6.1.0
##### Fixes
Previously, ROCDebugger encountered hangs and crashes when stepping over the `s_endpgm`
instruction at the end of a HIP kernel entry function, which caused the stepped wave to exit. This issue
is fixed in the ROCm 6.1 release. You can now step over the last instruction of any HIP kernel without
debugger hangs or crashes.
#### ROCm SMI
ROCm SMI for ROCm 6.1.0
##### Additions
* **Added support to set max/min clock level for sclk ('RSMI_CLK_TYPE_SYS') or mclk ('RSMI_CLK_TYPE_MEM')**.
You can now set a maximum or minimum `sclk` or `mclk` value through the
`rsmi_dev_clk_extremum_set()` API, provided the ASIC supports it. Alternatively, you can use our Python CLI
tool (`rocm-smi --setextremum max sclk 1500`).
* **Added `rsmi_dev_target_graphics_version_get()`**. You can now query through ROCm SMI API
(`rsmi_dev_target_graphics_version_get()`) to retrieve the target graphics version for a GPU device.
Currently, this output is not supplied through our ROCm SMI CLI.
##### Changes
* **Removed non-unified API headers: Individual GPU metric APIs are no longer supported**.
The individual metric APIs (`rsmi_dev_metrics_*`) were removed to make it easier to add support for
new GPU metrics. Device metrics are now reported through a single API,
`rsmi_dev_gpu_metrics_info_get()`; note that there is a risk of ABI breakage when using
`rsmi_dev_gpu_metrics_info_get()`. It is vital to understand that ABI breaks are necessary (in some
cases) in order to support newer ASICs and metrics for our customers. We will continue to support
`rsmi_dev_gpu_metrics_info_get()` with these considerations and limitations in mind.
* **Deprecated 'rsmi_dev_power_ave_get()'; use the newer API, 'rsmi_dev_power_get()'**. As
outlined in the change for 6.0.0 (*Added a generic power API: rsmi_dev_power_get*), `rsmi_dev_power_ave_get()` is now
deprecated. You must update your ROCm SMI API calls accordingly.
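A minimal sketch of moving to the generic power API. The `rsmi_dev_power_get(uint32_t, uint64_t *, RSMI_POWER_TYPE *)` signature and the microwatt units are assumptions based on the 6.x `rocm_smi.h` header and the older average-power API; check the header shipped with your ROCm version before relying on them.

```cpp
// Build sketch (paths are illustrative):
//   g++ power.cpp -I/opt/rocm/include -L/opt/rocm/lib -lrocm_smi64
#include <rocm_smi/rocm_smi.h>
#include <cstdint>
#include <cstdio>

int main() {
    if (rsmi_init(0) != RSMI_STATUS_SUCCESS) return 1;

    uint32_t count = 0;
    rsmi_num_monitor_devices(&count);

    for (uint32_t i = 0; i < count; ++i) {
        uint64_t power = 0;     // assumed to be reported in microwatts
        RSMI_POWER_TYPE type;   // indicates average vs. current socket power
        if (rsmi_dev_power_get(i, &power, &type) == RSMI_STATUS_SUCCESS) {
            printf("device %u: %.1f W\n", i, power / 1.0e6);
        }
    }

    rsmi_shut_down();
    return 0;
}
```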
##### Fixes
* Fixed `--showpids` reporting `[PID] [PROCESS NAME] 1 UNKNOWN UNKNOWN UNKNOWN`.
Output was failing because the `cu_occupancy` debugfs method is not provided on some graphics cards
by design. `get_compute_process_info_by_pid` was updated to reflect this and now returns the output
needed by the CLI.
* Fixed `rocm-smi --showpower` output, which was inconsistent on some RDNA3 devices.
We updated this to use `rsmi_dev_power_get()` within the CLI to provide a consistent device power
output. This was caused by using the now-deprecated `rsmi_dev_average_power_get()` API.
* Fixed `rocm-smi --setcomputepartition` and `rocm-smi --resetcomputepartition` to indicate when a device is busy
(`EBUSY`)
* Fixed `rocm-smi --setmemorypartition` and `rocm-smi --resetmemorypartition` so that a read-only SYSFS causes them to
return `RSMI_STATUS_NOT_SUPPORTED`
The `rsmi_dev_memory_partition_set` API is updated to handle the read-only SYSFS check.
Corresponding tests and CLI (`rocm-smi --setmemorypartition` and
`rocm-smi --resetmemorypartition`) calls were updated accordingly.
* Fixed `rocm-smi --showclkvolt` and `rocm-smi --showvc`, which were displaying 0 for overdrive and
reporting that the voltage curve is not supported.
#### ROCProfiler
ROCProfiler for ROCm 6.1.0
##### Fixes
* Fixed ROCprofiler to match versioning changes in HIP Runtime
* Fixed a race condition in plugins
* Updated metrics for MI300
#### ROCm Validation Suite
##### Known issue
* In a future release, the ROCm Validation Suite P2P Benchmark and Qualification Tool (PBQT) tests will be optimized to meet the target bandwidth requirements for MI300X.
[On GitHub](https://github.com/ROCm/ROCm/issues/3027)
#### MI200 SR-IOV
##### Known issue
* Multimedia applications may encounter compilation errors in the MI200 Single Root Input/Output Virtualization (SR-IOV) environment. This is because MI200 SR-IOV does not currently support multimedia applications.
[On GitHub](https://github.com/ROCm/ROCm/issues/3028)
### AMD MI300A RAS
#### Fixed defect
##### GFX correctable and uncorrectable error inject failures
* Previously, the AMD CPU Reliability, Availability, and Serviceability (RAS) installation encountered correctable and uncorrectable failures while injecting an error.
This issue is resolved in the ROCm 6.1 release, and users will no longer encounter the GFX correctable error (CE) and uncorrectable error (UE) failures.

View File

@@ -1,2 +1,2 @@
from .defaults import TEMPLATES, PROCESSORS
from . import mivisionx
from .custom_templates import hipfort, mivisionx, rpp, rvs

View File

@@ -0,0 +1,41 @@
import re
from util.release_data import ReleaseLib
from util.defaults import TEMPLATES, PROCESSORS
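# Matches changelog headings such as "## CK 1.1.0 for ROCm 6.1.0" (the example
# heading is illustrative), optionally prefixed with "(Unreleased)", and
# captures lib_version, the optional ROCm version, and the section body up to
# the next "## " heading.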
TEMPLATES['composable_kernel'] = (
(
r"## (\(Unreleased\))? CK (?P<lib_version>\d+\.\d+(?:\.\d+))?"
r"(?P<for_rocm> for ROCm )?"
r"(?P<rocm_version>(?(for_rocm)\d+\.\d+(?:\.\d+)?|.*))?"
r"\n"
r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
)
)
def composable_kernel_processor(data: ReleaseLib, template: str, _, __) -> bool:
"""Processor for releases."""
changelog = data.repo.get_contents("CHANGELOG.md", data.commit)
changelog = changelog.decoded_content.decode()
pattern = re.compile(template)
match = pattern.search(changelog)
lib_version = match["lib_version"]
data.message = (
f"composable_kernel for ROCm"
f" {data.full_version}"
)
data.lib_version = lib_version
data.notes = f"""{match["body"]}"""
change_pattern = re.compile(
r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
re.RegexFlag.MULTILINE
)
for match in change_pattern.finditer(data.notes):
data.data.changes[match["type"]] = match["change"]
return True
PROCESSORS['composable_kernel'] = composable_kernel_processor

View File

@@ -0,0 +1,42 @@
import re
from util.release_data import ReleaseLib
from util.defaults import TEMPLATES, PROCESSORS
TEMPLATES['hipfort'] = (
(
r"## hipfort (?P<lib_version>\d+\.\d+(?:\.\d+))?"
r"(?P<for_rocm> for ROCm )?"
r"(?P<rocm_version>(?(for_rocm)\d+\.\d+(?:\.\d+)?|.*))?"
r"( \(Unreleased\))?"
r"\n"
r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
)
)
def hipfort_processor(data: ReleaseLib, template: str, _, __) -> bool:
"""Processor for releases."""
changelog = data.repo.get_contents("CHANGELOG.md", data.commit)
changelog = changelog.decoded_content.decode()
pattern = re.compile(template)
match = pattern.search(changelog)
lib_version = match["lib_version"]
data.message = (
f"hipfort for ROCm"
f" {data.full_version}"
)
data.lib_version = lib_version
data.notes = f"""{match["body"]}"""
change_pattern = re.compile(
r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
re.RegexFlag.MULTILINE
)
for match in change_pattern.finditer(data.notes):
data.data.changes[match["type"]] = match["change"]
return True
PROCESSORS['hipfort'] = hipfort_processor

View File

@@ -13,12 +13,13 @@ TEMPLATES['MIVisionX'] = (
)
def mivisionx_processor(data: ReleaseLib, template: str, _) -> bool:
def mivisionx_processor(data: ReleaseLib, template: str, _, __) -> bool:
"""Processor for MIVisionX releases."""
changelog = data.repo.get_contents("CHANGELOG.md", data.commit)
changelog = changelog.decoded_content.decode()
pattern = re.compile(template)
match = pattern.search(changelog)
lib_version = match["lib_version"]
data.message = (
f"MIVisionX for ROCm"
f" {data.full_version}"
@@ -27,19 +28,18 @@ def mivisionx_processor(data: ReleaseLib, template: str, _) -> bool:
readme = data.repo.get_contents("README.md", data.commit)
readme = readme.decoded_content.decode()
dependency_map = readme[readme.find("## MIVisionX Dependency Map"):]
data.notes = f"""
<p align="center">
<img width="70%"
src="https://github.com/ROCm/MIVisionX/raw/master/docs/images/MIVisionX.png" />
</p>
## Online Documentation
[MIVisionX Documentation](https://rocm.docs.amd.com/projects/MIVisionX/en/latest/doxygen/html/index.html)
## MIVisionX {match['lib_version']}
{match["body"]}
data.lib_version = lib_version
data.notes = f"""{match["body"]}
{dependency_map}
"""
change_pattern = re.compile(
r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
re.RegexFlag.MULTILINE
)
for match in change_pattern.finditer(data.notes):
data.data.changes[match["type"]] = match["change"]
return True
PROCESSORS['MIVisionX'] = mivisionx_processor

View File

@@ -0,0 +1,42 @@
import re
from util.release_data import ReleaseLib
from util.defaults import TEMPLATES, PROCESSORS
TEMPLATES['rpp'] = (
(
r"## RPP (?P<lib_version>\d+\.\d+(?:\.\d+))?"
r"(?P<for_rocm> for ROCm )?"
r"(?P<rocm_version>(?(for_rocm)\d+\.\d+(?:\.\d+)?|.*))?"
r"( \(Unreleased\))?"
r"\n"
r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
)
)
def rpp_processor(data: ReleaseLib, template: str, _, __) -> bool:
"""Processor for releases."""
changelog = data.repo.get_contents("CHANGELOG.md", data.commit)
changelog = changelog.decoded_content.decode()
pattern = re.compile(template)
match = pattern.search(changelog)
lib_version = match["lib_version"]
data.message = (
f"rpp for ROCm"
f" {data.full_version}"
)
data.lib_version = lib_version
data.notes = f"""{match["body"]}"""
change_pattern = re.compile(
r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
re.RegexFlag.MULTILINE
)
for match in change_pattern.finditer(data.notes):
data.data.changes[match["type"]] = match["change"]
return True
PROCESSORS['rpp'] = rpp_processor