Update documentation requirements

Merge pull request #3568 from amd-jnovotny/peak-tflops-typo-docs610
Fix typo for TFLOPs metric in MI250 architecture page: cherry pick to docs/6.1.0
2026-01-09 22:58:17 -05:00 · 2024-09-16 10:13:11 -08:00 · 2024-08-12 13:17:44 -04:00 · 2024-08-12 10:24:24 -04:00 · 2024-08-07 12:42:50 -04:00 · 2024-08-06 15:56:06 -04:00
197 changed files with 15213 additions and 6102 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1 +1,5 @@
-* @saadrahim @Rmalavally @amd-aakash @zhang2amd @jlgreathouse @samjwu @MathiasMagnus @LisaDelaney
+* @amd-aakash @jlgreathouse @samjwu @ROCm/rocm-documentation
+# Documentation files
+docs/* @ROCm/rocm-documentation
+*.md @ROCm/rocm-documentation
+*.rst @ROCm/rocm-documentation
--- a/.github/ISSUE_TEMPLATE/0_issue_report.yml
+++ b/.github/ISSUE_TEMPLATE/0_issue_report.yml
@@ -1,76 +0,0 @@
-name: Issue Report
-description: File a report for something not working correctly.
-title: "[Issue]: "
-
-body:
- type: markdown
-  attributes:
-    value: |
-      Thank you for taking the time to fill out this report!
-
-      On a Linux system, you can acquire your OS, CPU, GPU, and ROCm version (for filling out this report) with the following commands:
-      echo "OS:" && cat /etc/os-release | grep -E "^(NAME=|VERSION=)";
-      echo "CPU: " && cat /proc/cpuinfo | grep "model name" | sort --unique;
-      echo "GPU:" && /opt/rocm/bin/rocminfo | grep -E "^\s*(Name|Marketing Name)";
-      echo "ROCm in /opt:" && ls -1 /opt | grep -E "rocm-";
- type: textarea
-  attributes:
-    label: Problem Description
-    description: Describe the issue you encountered.
-    placeholder: "The steps to reproduce can be included here, or in the dedicated section further below."
-  validations:
-    required: true
- type: input
-  attributes:
-    label: Operating System
-    description: What is the name and version number of the OS?
-    placeholder: "e.g. Ubuntu 22.04.3 LTS (Jammy Jellyfish)"
-  validations:
-    required: true
- type: input
-  attributes:
-    label: CPU
-    description: What CPU did you encounter the issue on?
-    placeholder: "e.g. AMD Ryzen 9 5900HX with Radeon Graphics"
-  validations:
-    required: true
- type: input
-  attributes:
-    label: GPU
-    description: What GPU(s) did you encounter the issue on?
-    placeholder: "e.g. MI200"
-  validations:
-    required: true
- type: input
-  attributes:
-    label: ROCm Version
-    description: What version(s) of ROCm did you encounter the issue on?
-    placeholder: "e.g. 5.7.0"
-  validations:
-    required: true
- type: input
-  attributes:
-    label: ROCm Component
-    description: (Optional) If this issue relates to a specific ROCm component, it can be mentioned here.
-    placeholder: "e.g. rocBLAS"
-
- type: textarea
-  attributes:
-    label: Steps to Reproduce
-    description: (Optional) Detailed steps to reproduce the issue.
-    placeholder: Please also include what you expected to happen, and what actually did, at the failing step(s).
-  validations:
-    required: false
-
- type: textarea
-  attributes:
-    label: Output of /opt/rocm/bin/rocminfo --support
-    description: The output of rocminfo --support will help to better address the problem.
-    placeholder: |
-      ROCk module is loaded
-      =====================
-      HSA System Attributes
-      =====================
-      [...]
-  validations:
-    required: true
--- a/.github/ISSUE_TEMPLATE/1_feature_request.yml
+++ b/.github/ISSUE_TEMPLATE/1_feature_request.yml
@@ -1,32 +0,0 @@
-name: Feature Suggestion
-description: Suggest an additional functionality, or new way of handling an existing functionality.
-title: "[Feature]: "
-
-body:
- type: markdown
-  attributes:
-    value: |
-      Thank you for taking the time to make a suggestion!
-
- type: textarea
-  attributes:
-    label: Suggestion Description
-    description: Describe your suggestion.
-  validations:
-    required: true
- type: input
-  attributes:
-    label: Operating System
-    description: (Optional) If this is for a specific OS, you can mention it here.
-    placeholder: "e.g. Ubuntu"
- type: input
-  attributes:
-    label: GPU
-    description: (Optional) If this is for a specific GPU or GPU family, you can mention it here.
-    placeholder: "e.g. MI200"
- type: input
-  attributes:
-    label: ROCm Component
-    description: (Optional) If this issue relates to a specific ROCm component, it can be mentioned here.
-    placeholder: "e.g. rocBLAS"
-
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,5 +0,0 @@
-blank_issues_enabled: false
-contact_links:
-  - name: ROCm Community Discussions
-    url: https://github.com/RadeonOpenCompute/ROCm/discussions
-    about: Please ask and answer questions here for anything ROCm.
--- a/.github/workflows/issue_retrieval.yml
+++ b/.github/workflows/issue_retrieval.yml
@@ -0,0 +1,22 @@
+name: Issue retrieval
+
+on:
+  issues:
+    types: [opened]
+
+jobs:
+  auto-retrieve:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Generate a token
+        id: generate_token
+        uses: actions/create-github-app-token@v1
+        with:
+          app_id: ${{ secrets.ACTION_APP_ID }}
+          private_key: ${{ secrets.ACTION_PEM }}
+      - name: 'Retrieve Issue'
+        uses: abhimeda/rocm_issue_management@main
+        with:
+          authentication-token: ${{ steps.generate_token.outputs.token }}
+          github-organization: 'ROCm'
+          project-num: '6'
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -17,4 +17,4 @@ on:
 jobs:
  call-workflow-passing-data:
    name: Documentation
-    uses: RadeonOpenCompute/rocm-docs-core/.github/workflows/linting.yml@develop
+    uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@develop
--- a/.markdownlint-cli2.yaml
+++ b/.markdownlint-cli2.yaml
@@ -13,6 +13,5 @@ config:
  MD051: false
 ignores:
  - CHANGELOG.md
-  - docs/CHANGELOG.md
  - "{,docs/}{RELEASE,release}.md"
  - tools/autotag/templates/**/*.md
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -3,16 +3,19 @@

 version: 2

-sphinx:
-   configuration: docs/conf.py
-
-formats: [htmlzip, pdf]
+build:
+   os: ubuntu-22.04
+   tools:
+      python: "3.10"
+   apt_packages:
+     - "doxygen"
+     - "graphviz" # For dot graphs in doxygen

 python:
   install:
   - requirements: docs/sphinx/requirements.txt

-build:
-   os: ubuntu-20.04
-   tools:
-      python: "3.8"
+sphinx:
+   configuration: docs/conf.py
+
+formats: []
--- a/.wordlist.txt
+++ b/.wordlist.txt
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,40 @@
+# MIT License
+#
+# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+cmake_minimum_required(VERSION 3.18.0)
+
+project(ROCm VERSION 5.7.1 LANGUAGES NONE)
+
+option(BUILD_DOCS "Build ROCm documentation" ON)
+
+include(GNUInstallDirs)
+
+# Adding default path cmake modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
+
+# Handle dependencies
+include(Dependencies)
+
+# Build docs
+if(BUILD_DOCS)
+  add_subdirectory(docs)
+endif()
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,229 +1,94 @@
-# Contributing to ROCm documentation
-
-AMD values and encourages contributions to our code and documentation. If you choose to
-contribute, we encourage you to be polite and respectful. Improving documentation is a long-term
-process, to which we are dedicated.
-
-If you have issues when trying to contribute, refer to the
-[discussions](https://github.com/RadeonOpenCompute/ROCm/discussions) page in our GitHub
-repository.
-
-## Folder structure and naming convention
-
-Our documentation follows the Pitchfork folder structure. Most documentation files are stored in the
-`/docs` folder. Some special files (such as release, contributing, and changelog) are stored in the root
-(`/`) folder.
-
-All images are stored in the `/docs/data` folder. An image's file path mirrors that of the documentation
-file where it is used.
-
-Our naming structure uses kebab case; for example, `my-file-name.rst`.
-
-## Supported formats and syntax
-
-Our documentation includes both Markdown and RST files. We are gradually transitioning existing
-Markdown to RST in order to more effectively meet our documentation needs. When contributing,
-RST is preferred; if you must use Markdown, use GitHub-flavored Markdown.
-
-We use [Sphinx Design](https://sphinx-design.readthedocs.io/en/latest/index.html) syntax and compile
-our API references using [Doxygen](https://www.doxygen.nl/).
-
-The following table shows some common documentation components and the syntax convention we
-use for each:
-
-<table>
-<tr>
-<th>Component</th>
-<th>RST syntax</th>
-</tr>
-<tr>
-<td>Code blocks</td>
-<td>
-
-```rst
-
-.. code-block:: language-name
-
-  My code block.
-
-
-```
-
-</td>
-</tr>
-<tr>
-<td>Cross-referencing internal files</td>
-<td>
-
-```rst
-
-:doc:`Title <../path/to/file/filename>`
-
-```
-
-</td>
-</tr>
-<tr>
-<td>External links</td>
-<td>
-
-```rst
-
-`link name  <URL>`_
-
-```
-
-</td>
-</tr>
-<tr>
-<tr>
-<td>Headings</td>
-<td>
-
-```rst
-
-******************
-Chapter title (H1)
-******************
-
-Section title (H2)
-===============
-
-Subsection title (H3)
---------------------
-
-Sub-subsection title (H4)
-^^^^^^^^^^^^^^^^^^^^
-
-
-```
-
-</td>
-</tr>
-<tr>
-<td>Images</td>
-<td>
-
-```rst
-
-.. image:: image1.png
-
-```
-
-</td>
-</tr>
-<tr>
-<td>Internal links</td>
-<td>
-
-```rst
-
-1. Add a tag to the section you want to reference:
-
-.. _my-section-tag: section-1
-
-Section 1
-==========
-
-2. Link to your tag:
-
-As shown in :ref:`section-1`.
-
-```
-
-</td>
-</tr>
-<tr>
-<tr>
-<td>Lists</td>
-<td>
-
-```rst
-
-# Ordered (numbered) list item
-
-* Unordered (bulleted) list item
-
-```
-
-</td>
-</tr>
-<tr>
-<tr>
-<td>Math (block)</td>
-<td>
-
-```rst
-
-.. math::
-
-  A = \begin{pmatrix}
-          0.0 & 1.0 & 1.0 & 3.0 \\
-          4.0 & 5.0 & 6.0 & 7.0 \\
-        \end{pmatrix}
-
-```
-
-</td>
-</tr>
-<tr>
-<td>Math (inline)</td>
-<td>
-
-```rst
-
-:math:`2 \times 2 `
-
-```
-
-</td>
-</tr>
-<tr>
-<td>Notes</td>
-<td>
-
-```rst
-
-.. note::
-
-  My note here.
-
-```
-
-</td>
-</tr>
-<tr>
-<td>Tables</td>
-<td>
-
-```rst
-
-.. csv-table::  Optional title here
-  :widths: 30, 70  #optional column widths
-  :header: "entry1 header", "entry2 header"
-
-   "entry1", "entry2"
-
-```
-
-</td>
-</tr>
-</table>
-
-## Language and style
-
-We use the
-[Google developer documentation style guide](https://developers.google.com/style/highlights) to
-guide our content.
-
-Font size and type, page layout, white space control, and other formatting
-details are controlled via
-[rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core). If you want to notify us
-of any formatting issues, create a pull request in our
-[rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) GitHub repository.
-
-## Building our documentation
-
-<!--  % TODO: Fix the link to be able to work at every files  -->
-To learn how to build our documentation, refer to
-[Building documentation](./building.md).
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="Contributing to ROCm">
+  <meta name="keywords" content="ROCm, contributing, contribute, maintainer, contributor">
+</head>
+
+# Contribute to ROCm
+
+AMD values and encourages contributions to our code and documentation. If you want to contribute
+to our ROCm repositories, first review the following guidance. For documentation-specific information,
+see [Contributing to ROCm docs](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
+
+ROCm is a software stack made up of a collection of drivers, development tools, and APIs that enable
+GPU programming from low-level kernel to end-user applications. Because some of our components
+are inherited from external projects (such as
+[LLVM](https://github.com/ROCm/llvm-project) and
+[Kernel driver](https://github.com/ROCm/ROCK-Kernel-Driver)), these use
+project-specific contribution guidelines and workflow. Refer to their repositories for more information.
+All other ROCm components follow the workflow described in the following sections.
+
+## Development workflow
+
+ROCm uses GitHub to host code, collaborate, and manage version control. We use pull requests (PRs)
+for all changes within our repositories. We use
+[GitHub issues](https://github.com/ROCm/ROCm/issues) to track known issues, such as
+bugs.
+
+### Issue tracking
+
+Before filing a new issue, search the
+[existing issues](https://github.com/ROCm/ROCm/issues) to make sure your issue isn't
+already listed.
+
+General issue guidelines:
+
+* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and
+  comment or post to provide additional details, such as how you reproduced this issue.
+* If you're not sure if your issue is the same, err on the side of caution and file your issue.
+  You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
+  your issue as being the same as the existing issue, we'll close the duplicate.
+* If your issue doesn't exist, use the issue template to file a new issue.
+  * When filing an issue, be sure to provide as much information as possible, including script output so
+    we can collect information about your configuration. This helps reduce the time required to
+    reproduce your issue.
+  * Check your issue regularly, as we may require additional information to successfully reproduce the
+    issue.
+
+### Pull requests
+
+When you create a pull request, you should target the default branch. Our repositories typically use the **develop** branch as the default integration branch.
+
+When creating a PR, use the following process. Note that each repository may include additional,
+project-specific steps. Refer to each repository's PR process for any additional steps.
+
+* Identify the issue you want to fix
+* Target the default branch (usually the **develop** branch) for integration
+* Ensure your code builds successfully
+* Each component has a suite of test cases to run; include the log of the successful test run in your PR
+* Do not break existing test cases
+* New functionality is only merged with new unit tests
+  * If your PR includes a new feature, you must provide an application or test so we can ensure that the
+    feature works and continues to be valid in the future
+* Tests must have good code coverage
+* Submit your PR and work with the reviewer or maintainer to get your PR approved
+* Once approved, the PR is brought onto internal CI systems and may be merged into the component
+  during our release cycle, as coordinated by the maintainer
+* We'll inform you once your change is committed
+
+:::{important}
+By creating a PR, you agree to allow your contribution to be licensed under the
+terms of the LICENSE.txt file in the corresponding repository. Different repositories may use different
+licenses.
+:::
+
+You can look up each license on the [ROCm licensing](https://rocm.docs.amd.com/en/latest/about/license.html) page.
+
+### New feature development
+
+Use the [GitHub Discussion forum](https://github.com/ROCm/ROCm/discussions)
+(Ideas category) to propose new features. Our maintainers are happy to provide direction and
+feedback on feature development.
+
+### Documentation
+
+Submit ROCm documentation changes to our
+[documentation repository](https://github.com/ROCm/ROCm). You must update
+documentation related to any new feature or API contribution.
+
+Note that each ROCm project uses its own repository for documentation.
+
+## Future development workflow
+
+The current ROCm development workflow is GitHub-based. If, in the future, we change this platform,
+the tools and links may change. In that case, we will update the contribution guidelines accordingly.
--- a/GOVERNANCE.md
+++ b/GOVERNANCE.md
@@ -0,0 +1,60 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="ROCm governance model">
+  <meta name="keywords" content="ROCm, governance">
+</head>
+
+# Governance model
+
+ROCm is a software stack made up of a collection of drivers, development tools, and APIs that enable
+GPU programming from the low-level kernel to end-user applications.
+
+Components of ROCm that are inherited from external projects (such as
+[LLVM](https://github.com/ROCm/llvm-project) and
+[Kernel driver](https://github.com/ROCm/ROCK-Kernel-Driver)) follow their own
+governance model and code of conduct. All other components of ROCm are governed by this
+document.
+
+## Governance
+
+ROCm is led and managed by AMD.
+
+We welcome contributions from the community. Our maintainers review all proposed changes to
+ROCm.
+
+## Roles
+
+* **Maintainers** are responsible for their designated component and repositories.
+* **Contributors** provide input and suggest changes to existing components.
+
+### Maintainers
+
+Maintainers are appointed by AMD. They are able to approve changes and can commit to our
+repositories. They must use pull requests (PRs) for all changes.
+
+You can find the list of maintainers in the CODEOWNERS file of each repository. Code owners differ
+between repositories.
+
+### Contributors
+
+If you're not a maintainer, you're a contributor. We encourage the ROCm community to contribute in
+several ways:
+
+* Help other community members by posting questions or solutions on our
+  [GitHub discussion forums](https://github.com/ROCm/ROCm/discussions)
+* Notify us of bugs by filing an issue report on
+  [GitHub Issues](https://github.com/ROCm/ROCm/issues)
+* Improve our documentation by submitting a PR to our
+  [repository](https://github.com/ROCm/ROCm/)
+* Improve the code base (for smaller or contained changes) by submitting a PR to the component
+* Suggest larger features by adding to the *Ideas* category in the
+  [GitHub discussion forum](https://github.com/ROCm/ROCm/discussions)
+
+For more information, refer to our [contribution guidelines](CONTRIBUTING.md).
+
+## Code of conduct
+
+To engage with any AMD ROCm component that is hosted on GitHub, you must abide by the
+[GitHub community guidelines](https://docs.github.com/en/site-policy/github-terms/github-community-guidelines)
+and the
+[GitHub community code of conduct](https://docs.github.com/en/site-policy/github-terms/github-community-code-of-conduct).
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# AMD ROCm™ platform
+# AMD ROCm Software

 ROCm is an open-source stack, composed primarily of open-source software, designed for graphics
 processing unit (GPU) computation. ROCm consists of a collection of drivers, development tools, and
@@ -10,7 +10,7 @@ ecosystem. ROCm is particularly well-suited to GPU-accelerated high-performance
 artificial intelligence (AI), scientific computing, and computer aided design (CAD).

 ROCm is powered by AMD’s
-[Heterogeneous-computing Interface for Portability (HIP)](https://github.com/ROCm-Developer-Tools/HIP),
+[Heterogeneous-computing Interface for Portability (HIP)](https://github.com/ROCm/HIP),
 an open-source software C++ GPU programming environment and its corresponding runtime. HIP
 allows ROCm developers to create portable applications on different platforms by deploying code on a
 range of platforms, from dedicated gaming GPUs to exascale HPC clusters.
@@ -19,32 +19,81 @@ ROCm supports programming models, such as OpenMP and OpenCL, and includes all ne
 source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
 (ML) frameworks, such as PyTorch and TensorFlow.

+## Getting the ROCm Source Code
+
+AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.
+
+### Installing the repo tool
+
+The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:
+
+```bash
+mkdir -p ~/bin/
+curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
+chmod a+x ~/bin/repo
+```
+
+**Note:** The `~/bin/` folder is used as an example. You can specify a different folder to install the repo tool into if you desire.
+
+### Installing git-lfs
+
+Some ROCm projects use the Git Large File Storage (LFS) format that may require you to install git-lfs. Refer to [Git Large File Storage](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md) for more information. For example, to install git-lfs for Ubuntu, use the following command:
+
+```bash
+sudo apt-get install git-lfs
+```
+
+### Downloading the ROCm source code
+
+The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:
+
+```bash
+mkdir -p ~/ROCm/
+cd ~/ROCm/
+~/bin/repo init -u https://github.com/ROCm/ROCm.git -b roc-6.0.x
+~/bin/repo sync
+```
+
+**Note:** Using this sample code will cause the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have SSH keys configured on your machine for your GitHub ID prior to the download as explained at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).
+
+### Building the ROCm source code
+
+Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.
+
+Each release of the ROCm software supports specific hardware and software configurations. Refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for the current supported hardware and OS.
+
 ## ROCm documentation

-This repository contains the manifest file for ROCm releases, changelogs, and release information.
+This repository contains the [manifest file](https://gerrit.googlesource.com/git-repo/+/HEAD/docs/manifest-format.md)
+for ROCm releases, changelogs, and release information.

 The `default.xml` file contains information for all repositories and the associated commit used to build
-the current ROCm release; `default.xml` uses the Manifest Format repository.
+the current ROCm release; `default.xml` uses the [Manifest Format repository](https://gerrit.googlesource.com/git-repo/).

 Source code for our documentation is located in the `/docs` folder of most ROCm repositories. The
 `develop` branch of our repositories contains content for the next ROCm release.

 The ROCm documentation homepage is [rocm.docs.amd.com](https://rocm.docs.amd.com).

-### Building our documentation
+### Building the documentation

 For a quick-start build, use the following code. For more options and detail, refer to
-[Building documentation](./contribute/building.md).
+[Building documentation](./docs/contribute/building.md).

 ```bash
 cd docs
-
 pip3 install -r sphinx/requirements.txt
-
 python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
 ```

+Alternatively, you can build the documentation with CMake.
+
+```bash
+cmake -B build
+cmake --build build --target=doc
+```
+
 ## Older ROCm releases

-For release information for older ROCm releases, refer to
-[`CHANGELOG`](./CHANGELOG.md).
+For release information for older ROCm releases, refer to the
+[CHANGELOG](./CHANGELOG.md).
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,7 +1,4 @@
-# Release Notes
-<!-- Do not edit this file! This file is autogenerated with -->
-<!--   tools/autotag/tag_script.py                          -->
-
+# ROCm 6.1 release highlights
 <!-- Disable lints since this is an auto-generated file.    -->
 <!-- markdownlint-disable blanks-around-headers             -->
 <!-- markdownlint-disable no-duplicate-header               -->
@@ -11,65 +8,245 @@

 <!-- spellcheck-disable -->

-Welcome to the release notes for the ROCm platform.
+The ROCm™ 6.1 release consists of new features and fixes to improve the stability and
+performance of AMD Instinct™ MI300 GPU applications. Notably, we've added:

-------------------
+* Full support for Ubuntu 22.04.4.

-## ROCm 5.7.1
-<!-- markdownlint-disable first-line-h1 -->
-<!-- markdownlint-disable no-duplicate-header -->
+* **rocDecode**, a new ROCm component that provides high-performance video decode support for
+  AMD GPUs. With rocDecode, you can decode compressed video streams while keeping the resulting
+  YUV frames in video memory. With decoded frames in video memory, you can run video
+  post-processing using ROCm HIP, avoiding unnecessary data copies via the PCIe bus.

-### What's New in This Release
+  To learn more, refer to the rocDecode 
+  [documentation](https://rocm.docs.amd.com/projects/rocDecode/en/latest/).

-### ROCm Libraries
+## OS and GPU support changes

-#### rocBLAS
-A new functionality rocblas-gemm-tune and an environment variable ROCBLAS_TENSILE_GEMM_OVERRIDE_PATH are added to rocBLAS in the ROCm 5.7.1 release.
+ROCm 6.1 adds the following operating system support:

-*rocblas-gemm-tune* is used to find the best-performing GEMM kernel for each GEMM problem set. It has a command line interface, which mimics the --yaml input used by rocblas-bench. To generate the expected --yaml input, profile logging can be used, by setting the environment variable ROCBLAS_LAYER4.
+* MI300A: Ubuntu 22.04.4 and RHEL 9.3
+* MI300X: Ubuntu 22.04.4

-For more information on rocBLAS logging, see Logging in rocBLAS, in the [API Reference Guide](https://rocm.docs.amd.com/projects/rocBLAS/en/docs-5.7.1/API_Reference_Guide.html#logging-in-rocblas).
+Future releases will add additional operating systems to match the general offering. For older
+generations of supported AMD Instinct products, we’ve added Ubuntu 22.04.4 support.

-An example input file: Expected output (note selected GEMM idx may differ): Where the far right values (solution_index) are the indices of the best-performing kernels for those GEMMs in the rocBLAS kernel library. These indices can be directly used in future GEMM calls. See rocBLAS/samples/example_user_driven_tuning.cpp for sample code of directly using kernels via their indices.
+```{tip}
+To view the complete list of supported GPUs and operating systems, refer to the system requirements
+page for
+[Linux](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html)
+and
+[Windows](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html).
+```

-If the output is stored in a file, the results can be used to override default kernel selection with the kernels found, by setting the environment variable ROCBLAS_TENSILE_GEMM_OVERRIDE_PATH, where points to the stored file.
+## Installation packages

-For more details, refer to the [rocBLAS Programmer's Guide.](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/Programmers_Guide.html#rocblas-gemm-tune)
+This release includes a new set of packages for every module (all libraries and binaries default to
+`DT_RPATH`). Package names have the suffix `rpath`; for example, the `rpath` variant of `rocminfo` is
+`rocminfo-rpath`.

-#### HIP 5.7.1 (for ROCm 5.7.1)
+```{warning}
+The new `rpath` packages will conflict with the default packages; they are meant to be used only in
+environments where legacy `DT_RPATH` is the preferred form of linking (instead of `DT_RUNPATH`). We
+do **not** recommend installing both sets of packages.
+```

-ROCm 5.7.1 is a point release with several bug fixes in the HIP runtime.
+## ROCm components

-### Fixed defects
-The *hipPointerGetAttributes* API returns the correct HIP memory type as *hipMemoryTypeManaged* for managed memory.
+The following sections highlight select component-specific changes. For additional details, refer to the
+[Changelog](https://rocm.docs.amd.com/en/develop/about/CHANGELOG.html).

-### Library Changes in ROCM 5.7.1
+### AMD System Management Interface (SMI) Tool

-| Library | Version |
-|---------|---------|
-| hipBLAS | [1.1.0](https://github.com/ROCmSoftwarePlatform/hipBLAS/releases/tag/rocm-5.7.1) |
-| hipCUB | [2.13.1](https://github.com/ROCmSoftwarePlatform/hipCUB/releases/tag/rocm-5.7.1) |
-| hipFFT | [1.0.12](https://github.com/ROCmSoftwarePlatform/hipFFT/releases/tag/rocm-5.7.1) |
-| hipSOLVER | 1.8.1 ⇒ [1.8.2](https://github.com/ROCmSoftwarePlatform/hipSOLVER/releases/tag/rocm-5.7.1) |
-| hipSPARSE | [2.3.8](https://github.com/ROCmSoftwarePlatform/hipSPARSE/releases/tag/rocm-5.7.1) |
-| MIOpen | [2.19.0](https://github.com/ROCmSoftwarePlatform/MIOpen/releases/tag/rocm-5.7.1) |
-| rocALUTION | [2.1.11](https://github.com/ROCmSoftwarePlatform/rocALUTION/releases/tag/rocm-5.7.1) |
-| rocBLAS | [3.1.0](https://github.com/ROCmSoftwarePlatform/rocBLAS/releases/tag/rocm-5.7.1) |
-| rocFFT | [1.0.24](https://github.com/ROCmSoftwarePlatform/rocFFT/releases/tag/rocm-5.7.1) |
-| rocm-cmake | [0.10.0](https://github.com/RadeonOpenCompute/rocm-cmake/releases/tag/rocm-5.7.1) |
-| rocPRIM | [2.13.1](https://github.com/ROCmSoftwarePlatform/rocPRIM/releases/tag/rocm-5.7.1) |
-| rocRAND | [2.10.17](https://github.com/ROCmSoftwarePlatform/rocRAND/releases/tag/rocm-5.7.1) |
-| rocSOLVER | [3.23.0](https://github.com/ROCmSoftwarePlatform/rocSOLVER/releases/tag/rocm-5.7.1) |
-| rocSPARSE | [2.5.4](https://github.com/ROCmSoftwarePlatform/rocSPARSE/releases/tag/rocm-5.7.1) |
-| rocThrust | [2.18.0](https://github.com/ROCmSoftwarePlatform/rocThrust/releases/tag/rocm-5.7.1) |
-| rocWMMA | [1.2.0](https://github.com/ROCmSoftwarePlatform/rocWMMA/releases/tag/rocm-5.7.1) |
-| Tensile | [4.38.0](https://github.com/ROCmSoftwarePlatform/Tensile/releases/tag/rocm-5.7.1) |
+* **New monitor command for GPU metrics**.
+  Use the monitor command to customize, capture, collect, and observe GPU metrics on
+  target devices.

-#### hipSOLVER 1.8.2
+* **Integration with E-SMI**.
+  The EPYC™ System Management Interface In-band Library is a Linux C-library that provides in-band
+  user space software APIs to monitor and control your CPU’s power, energy, performance, and other
+  system management functionality. This integration enables access to CPU metrics and telemetry
+  through the AMD SMI API and CLI tools.

-hipSOLVER 1.8.2 for ROCm 5.7.1
+### Composable Kernel (CK)

-##### Fixed
+* **New architecture support**.
+  CK now supports the following architectures to enable efficient image denoising on
+  AMD GPUs: gfx1030, gfx1100, gfx1031, gfx1101, gfx1032, gfx1102, gfx1034, gfx1103, gfx1035,
+  and gfx1036.

- Fixed conflicts between the hipsolver-dev and -asan packages by excluding
-  hipsolver_module.f90 from the latter
+* **FP8 rounding logic is replaced with stochastic rounding**.
+  Stochastic rounding mimics a more realistic data behavior and improves model convergence.
+
+### HIP
+
+* **New environment variable to enable kernel run serialization**.
+  The default `HIP_LAUNCH_BLOCKING` value is `0` (disable), which causes kernels to run as defined in
+  the queue. When set to `1` (enable), the HIP runtime serializes the kernel queue, which behaves the
+  same as `AMD_SERIALIZE_KERNEL`.
+
+### hipBLASLt
+
+* **New GemmTuning extension parameter**. GemmTuning allows you to set a split-k value for each solution, which is more feasible for
+  performance tuning.
+
+### hipFFT
+
+* **New multi-GPU support for single-process transforms**. Multiple GPUs can be used to perform a transform in a single process. Note that this initial
+  implementation is a functional preview.
+
+### HIPIFY
+
+* **Skipped code blocks**: Code blocks that are skipped by the preprocessor are no longer hipified under the
+  `--default-preprocessor` option. To hipify everything, despite conditional preprocessor directives
+  (`#if`, `#ifdef`, `#ifndef`, `#elif`, or `#else`), don't use the `--default-preprocessor` or `--amap` options.
+
+### hipSPARSELt
+
+* **Structured sparsity matrix support extensions**.
+  Structured sparsity matrices help speed up deep-learning workloads. We now support `B` as the
+  sparse matrix and `A` as the dense matrix in Sparse Matrix-Matrix Multiplication (SPMM).
+  Prior to this release, we only supported sparse (matrix A) x dense (matrix B) matrix
+  multiplication.
+
+### hipTensor
+
+* **4D tensor permutation and contraction support**.
+  You can now perform tensor permutation on 4D tensors and 4D contractions for F16, BF16, and
+  Complex F32/F64 datatypes.
+
+### MIGraphX
+
+* **Improved performance for transformer-based models**.
+  We added support for FlashAttention, which benefits models like BERT, GPT, and Stable Diffusion.
+
+* **New Torch-MIGraphX driver**.
+  This driver calls MIGraphX directly from PyTorch. It provides an `mgx_module` object that you can
+  invoke like any other Torch module, but which utilizes the MIGraphX inference engine internally.
+  Torch-MIGraphX supports FP32, FP16, and INT8 datatypes.
+
+* **FP8 support**. We now offer functional support for inference in the FP8E4M3FNUZ datatype. You
+  can load an ONNX model in FP8E4M3FNUZ using C++ or Python APIs, or `migraphx-driver`.
+  You can quantize a floating point model to FP8 format by using the `--fp8` flag with `migraphx-driver`.
+  To accelerate inference, MIGraphX uses hardware acceleration on MI300 for FP8 by leveraging FP8
+  support in various backend kernel libraries.
+
+### MIOpen
+
+* **Improved performance for inference and convolutions**.
+  Inference support is now provided for Find 2.0 fusion plans. Additionally, we've enhanced the Number of
+  samples, Height, Width, and Channels (NHWC) convolution kernels for heuristics. NHWC stores data
+  in a format where the height and width dimensions come first, followed by channels.
+
+### OpenMP
+
+* **Implicit Zero-copy is triggered automatically in XNACK-enabled MI300A systems**.
+  Implicit Zero-copy behavior in `non unified_shared_memory` programs is triggered automatically in
+  XNACK-enabled MI300A systems (for example, when using the `HSA_XNACK=1` environment
+  variable). OpenMP supports the 'requires `unified_shared_memory`' directive to support programs
+  that don’t want to copy data explicitly between the CPU and GPU. However, this requires that you add
+  these directives to every translation unit of the program.
+
+* **New MI300 FP atomics**. Application performance can now improve by leveraging fast floating-point atomics on MI300 (gfx942).
+  
+
+### RCCL
+
+* **NCCL 2.18.6 compatibility**.
+  RCCL is now compatible with NCCL 2.18.6, which includes increasing the maximum IB network interfaces to 32 and fixing network device ordering when creating communicators with only one GPU
+  per node.
+
+* **Doubled simultaneous communication channels**.
+  We improved MI300X performance by increasing the maximum number of simultaneous
+  communication channels from 32 to 64.
+
+### rocALUTION
+
+* **New multiple node and GPU support**.
+  Unsmoothed and smoothed aggregations and Ruge-Stueben AMG now work with multiple nodes
+  and GPUs. For more information, refer to the 
+  [API documentation](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/usermanual/solvers.html#unsmoothed-aggregation-amg).
+
+### rocDecode
+
+* **New ROCm component**.
+  rocDecode is ROCm's newest component, providing high-performance video decode support for AMD
+  GPUs. To learn more, refer to the 
+  [documentation](https://rocm.docs.amd.com/projects/rocDecode/en/latest/).
+
+### ROCm Compiler
+
+* **Combined projects**. ROCm Device-Libs, ROCm Compiler Support, and hipCC are now located in
+  the `llvm-project/amd` subdirectory of AMD's fork of the LLVM project. Previously, these projects
+  were maintained in separate repositories. Note that the projects themselves will continue to be
+  packaged separately.
+
+* **Split the 'rocm-llvm' package**. This package has been split into a required and an optional package: 
+
+  * **rocm-llvm (required)**: A package containing the essential binaries needed for compilation.
+  
+  * **rocm-llvm-dev (optional)**: A package containing binaries for compiler and application developers.
+    
+
+### ROCm Data Center Tool (RDC)
+
+* **C++ upgrades**.
+  RDC was upgraded from C++11 to C++17 to enable a more modern C++ standard when writing RDC plugins.
+
+### ROCm Performance Primitives (RPP)
+
+* **New backend support**.
+  Audio processing support was added for the `HOST` backend, and 3D Voxel kernel support was
+  added for the `HOST` and `HIP` backends.
+
+### ROCm Validation Suite
+
+* **New datatype support**.
+  Added BF16 and FP8 datatypes based on General Matrix Multiply (GEMM) operations in the GPU Stress Test (GST) module. This provides additional performance benchmarking and stress testing based on the newly supported datatypes.
+
+### rocSOLVER
+
+* **New EigenSolver routine**.
+Based on the Jacobi algorithm, a new EigenSolver routine was added to the library. This routine computes the eigenvalues and eigenvectors of a matrix with improved performance.
+
+### ROCTracer
+
+* **New versioning and callback enhancements**.
+  ROCTracer was improved to match versioning changes in the HIP runtime, and it now supports runtime API callbacks and activity record logging. The APIs of different runtimes at different levels are considered different API domains with assigned domain IDs.
+
+## Upcoming changes
+
+* ROCm SMI will be deprecated in a future release. We advise **migrating to AMD SMI** now to
+  prevent future workflow disruptions.
+
+* hipCC supports, by default, the following compiler invocation flags:
+
+  * `-mllvm -amdgpu-early-inline-all=true`
+  * `-mllvm -amdgpu-function-calls=false`
+
+  In a future ROCm release, hipCC will no longer support these flags. It will, instead, use the Clang
+  defaults:
+
+  * `-mllvm -amdgpu-early-inline-all=false`
+  * `-mllvm -amdgpu-function-calls=true`
+
+  To evaluate the impact of this change, include `--hipcc-func-supp` in your hipCC invocation.
+
+  For information on these flags, and the differences between hipCC and Clang, refer to
+  [ROCm Compiler Interfaces](https://rocm.docs.amd.com/en/latest/reference/rocmcc.html#rocm-compiler-interfaces).
+
+* Future ROCm releases will not provide `clang-ocl`. For more information, refer to the
+  [`clang-ocl` README](https://github.com/ROCm/clang-ocl).
+
+* The following operating systems will be supported in a future ROCm release. They are currently
+  only available in beta.
+
+  * RHEL 9.4
+  * RHEL 8.10
+  * SLES 15 SP6
+
+* As of ROCm 6.2, we’ve planned for **end-of-support** for:
+
+  * Ubuntu 20.04.5
+  * SLES 15 SP4
+  * RHEL/CentOS 7.9
--- a/cmake/Modules/Dependencies.cmake
+++ b/cmake/Modules/Dependencies.cmake
@@ -0,0 +1,47 @@
+# MIT License
+#
+# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# ###########################
+# ROCm dependencies
+# ###########################
+
+include(FetchContent)
+
+if(BUILD_DOCS)
+  find_package(ROCM 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}") # First version with Sphinx doc gen improvement
+  if(NOT ROCM_FOUND)
+    message(STATUS "ROCm CMake not found. Fetching...")
+    set(rocm_cmake_tag
+      "c044bb52ba85058d28afe2313be98d9fed02e293" # develop@2023.09.12. (move to 6.0 tag when released)
+      CACHE STRING "rocm-cmake tag to download")
+    FetchContent_Declare(
+      rocm-cmake
+      GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
+      GIT_TAG        ${rocm_cmake_tag}
+      SOURCE_SUBDIR "DISABLE ADDING TO BUILD" # We don't really want to consume the build and test targets of ROCm CMake.
+    )
+    FetchContent_MakeAvailable(rocm-cmake)
+    find_package(ROCM CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
+  else()
+    find_package(ROCM 0.11.0 CONFIG REQUIRED PATHS "${ROCM_PATH}")
+  endif()
+endif()
--- a/default.xml
+++ b/default.xml
@@ -1,79 +1,71 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
-    <remote name="roc-github"
-fetch="https://github.com/RadeonOpenCompute/" />
-    <remote name="rocm-devtools"
-fetch="https://github.com/ROCm-Developer-Tools/" />
-    <remote name="rocm-swplat"
-fetch="https://github.com/ROCmSoftwarePlatform/" />
-    <remote name="gpuopen-libs"
-fetch="https://github.com/GPUOpen-ProfessionalCompute-Libraries/" />
-    <remote name="gpuopen-tools"
-fetch="https://github.com/GPUOpen-Tools/" />
-    <remote name="KhronosGroup"
-fetch="https://github.com/KhronosGroup/" />
-    <default revision="refs/tags/rocm-5.7.1"
-     remote="roc-github"
+    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
+    <remote name="KhronosGroup" fetch="https://github.com/KhronosGroup/" />
+    <default revision="refs/tags/rocm-6.1.0"
+     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
-<!--list of projects for ROCM-->
+<!--list of projects for ROCm-->
+    <project path="ROCm-OpenCL-Runtime/api/opencl/khronos/icd" name="OpenCL-ICD-Loader" remote="KhronosGroup" />
    <project name="ROCK-Kernel-Driver" />
-    <project name="ROCT-Thunk-Interface" />
    <project name="ROCR-Runtime" />
+    <project name="ROCT-Thunk-Interface" />
    <project name="amdsmi" />
-    <project name="rocm_smi_lib" />
-    <project name="rocm-core" />
-    <project name="rocm-cmake" />
-    <project name="rocminfo" />
-    <project name="rocm_bandwidth_test" />
-    <project name="rocprofiler" remote="rocm-devtools" />
-    <project name="roctracer" remote="rocm-devtools" />
-    <project path="ROCm-OpenCL-Runtime/api/opencl/khronos/icd" name="OpenCL-ICD-Loader" remote="KhronosGroup" revision="6c03f8b58fafd9dd693eaac826749a5cfad515f8" />
    <project name="clang-ocl" />
    <project name="rdc" />
+    <project name="rocm_bandwidth_test" />
+    <project name="rocm_smi_lib" />
+    <project name="rocm-core" />
+    <project name="rocminfo" />
+    <project name="rocprofiler" />
+    <project name="rocprofiler-register" />
+    <project name="roctracer" />
 <!--HIP Projects-->
-    <project name="HIP" remote="rocm-devtools" />
-    <project name="HIP-Examples" remote="rocm-devtools" />
-    <project name="clr" remote="rocm-devtools" />
-    <project name="HIPIFY" remote="rocm-devtools" />
-    <project name="HIPCC" remote="rocm-devtools" />
+    <project name="HIP" />
+    <project name="HIP-Examples" />
+    <project name="HIPIFY" />
+    <project name="clr" />
+    <project name="hipother" />
 <!-- The following projects are all associated with the AMDGPU LLVM compiler -->
+    <project name="half" />
    <project name="llvm-project" />
-    <project name="ROCm-Device-Libs" />
-    <project name="ROCm-CompilerSupport" />
-    <project name="half" remote="rocm-swplat" revision="37742ce15b76b44e4b271c1e66d13d2fa7bd003e" />
 <!-- gdb projects -->
-    <project name="ROCgdb" remote="rocm-devtools" />
-    <project name="ROCdbgapi" remote="rocm-devtools" />
-    <project name="rocr_debug_agent" remote="rocm-devtools" />
+    <project name="ROCdbgapi" />
+    <project name="ROCgdb" />
+    <project name="rocr_debug_agent" />
 <!-- ROCm Libraries -->
-    <project groups="mathlibs" name="rocBLAS" remote="rocm-swplat" />
-    <project groups="mathlibs" name="Tensile" remote="rocm-swplat" />
-    <project groups="mathlibs" name="hipTensor" remote="rocm-swplat" />
-    <project groups="mathlibs" name="hipBLAS" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rocFFT" remote="rocm-swplat" />
-    <project groups="mathlibs" name="hipFFT" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rocRAND" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rocSPARSE" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rocSOLVER" remote="rocm-swplat" />
-    <project groups="mathlibs" name="hipSOLVER" remote="rocm-swplat" />
-    <project groups="mathlibs" name="hipSPARSE" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rocALUTION" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rocThrust" remote="rocm-swplat" />
-    <project groups="mathlibs" name="hipCUB" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rocPRIM" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rocWMMA" remote="rocm-swplat" />
-    <project groups="mathlibs" name="rccl" remote="rocm-swplat" />
-    <project name="rocMLIR" remote="rocm-swplat" />
-    <project name="MIOpen" remote="rocm-swplat" />
-    <project name="composable_kernel" remote="rocm-swplat" />
-    <project name="MIVisionX" remote="gpuopen-libs" />
-    <project name="rpp" remote="gpuopen-libs" />
-    <project name="hipfort" remote="rocm-swplat" />
-    <project name="AMDMIGraphX" remote="rocm-swplat" />
-    <project name="ROCmValidationSuite" remote="rocm-devtools" />
+    <project groups="mathlibs" name="AMDMIGraphX" />
+    <project groups="mathlibs" name="MIOpen" />
+    <project groups="mathlibs" name="MIVisionX" />
+    <project groups="mathlibs" name="ROCmValidationSuite" />
+    <project groups="mathlibs" name="Tensile" />
+    <project groups="mathlibs" name="composable_kernel" />
+    <project groups="mathlibs" name="hipBLAS" />
+    <project groups="mathlibs" name="hipBLASLt" />
+    <project groups="mathlibs" name="hipCUB" />
+    <project groups="mathlibs" name="hipFFT" />
+    <project groups="mathlibs" name="hipRAND" />
+    <project groups="mathlibs" name="hipSOLVER" />
+    <project groups="mathlibs" name="hipSPARSE" />
+    <project groups="mathlibs" name="hipSPARSELt" />
+    <project groups="mathlibs" name="hipTensor" />
+    <project groups="mathlibs" name="hipfort" />
+    <project groups="mathlibs" name="rccl" />
+    <project groups="mathlibs" name="rocALUTION" />
+    <project groups="mathlibs" name="rocBLAS" />
+    <project groups="mathlibs" name="rocDecode" />
+    <project groups="mathlibs" name="rocFFT" />
+    <project groups="mathlibs" name="rocPRIM" />
+    <project groups="mathlibs" name="rocRAND" />
+    <project groups="mathlibs" name="rocSOLVER" />
+    <project groups="mathlibs" name="rocSPARSE" />
+    <project groups="mathlibs" name="rocThrust" />
+    <project groups="mathlibs" name="rocWMMA" />
+    <project groups="mathlibs" name="rocm-cmake" />
+    <project groups="mathlibs" name="rpp" />
 <!-- Projects for OpenMP-Extras -->
-    <project name="aomp" path="openmp-extras/aomp" remote="rocm-devtools" />
-    <project name="aomp-extras" path="openmp-extras/aomp-extras" remote="rocm-devtools" />
-    <project name="flang" path="openmp-extras/flang" remote="rocm-devtools" />
+    <project name="aomp" path="openmp-extras/aomp" />
+    <project name="aomp-extras" path="openmp-extras/aomp-extras" />
+    <project name="flang" path="openmp-extras/flang" />
 </manifest>
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -0,0 +1,33 @@
+# MIT License
+#
+# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+include(ROCMSphinxDoc)
+
+rocm_add_sphinx_doc(
+    "${CMAKE_CURRENT_SOURCE_DIR}"
+  OUTPUT_DIR html
+  BUILDER html
+)
+  
+install(
+  DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html"
+  DESTINATION "${CMAKE_INSTALL_DOCDIR}")
--- a/docs/about/compatibility/3rd-party-support-matrix.md
+++ b/docs/about/compatibility/3rd-party-support-matrix.md
@@ -1,63 +0,0 @@
-# Third party support matrix
-
-ROCm™ supports various 3rd party libraries and frameworks. Supported versions
-are tested and known to work. Non-supported versions of 3rd parties may also
-work, but aren't tested.
-
-## Deep learning
-
-ROCm releases support the most recent and two prior releases of PyTorch and
-TensorFlow.
-
-| ROCm  | [PyTorch](https://github.com/pytorch/pytorch/releases/) | [TensorFlow](https://github.com/tensorflow/tensorflow/releases/) |
-|:------|:--------------------------:|:--------------------:|
-| 5.0.2 | 1.8,  1.9,  1.10           | 2.6, 2.7, 2.8        |
-| 5.1.3 | 1.9,  1.10, 1.11           | 2.7, 2.8, 2.9        |
-| 5.2.x | 1.10, 1.11, 1.12           | 2.8, 2.9, 2.9        |
-| 5.3.x | 1.10.1, 1.11, 1.12.1, 1.13 | 2.8, 2.9, 2.10       |
-| 5.4.x | 1.10.1, 1.11, 1.12.1, 1.13 | 2.8, 2.9, 2.10, 2.11 |
-| 5.5.x | 1.10.1, 1.11, 1.12.1, 1.13 | 2.10, 2.11, 2.13     |
-| 5.6.x | 1.12.1, 1.13, 2.0          | 2.12, 2.13           |
-| 5.7.x | 1.12.1, 1.13, 2.0          | 2.12, 2.13           |
-
-(communication-libraries)=
-
-## Communication libraries
-
-ROCm supports [OpenUCX](https://openucx.org/), an open-source,
-production-grade communication framework for data-centric and high performance
-applications.
-
-UCX version | ROCm 5.4 and older | ROCm 5.5 and newer |
-|:----------|:------------------:|:------------------:|
-| -1.14.0   | COMPATIBLE         | INCOMPATIBLE       |
-|  1.14.1+  | COMPATIBLE         | COMPATIBLE         |
-
-The Unified Collective Communication ([UCC](https://github.com/openucx/ucc)) library also has
-support for ROCm devices.
-
-UCC version | ROCm 5.5 and older | ROCm 5.6 and newer |
-|:----------|:------------------:|:------------------:|
-| -1.1.0    | COMPATIBLE         | INCOMPATIBLE       |
-|  1.2.0+   | COMPATIBLE         | COMPATIBLE         |
-
-## Algorithm libraries
-
-ROCm releases provide algorithm libraries with interfaces compatible with
-contemporary CUDA / NVIDIA HPC SDK alternatives.
-
-* Thrust → rocThrust
-* CUB → hipCUB
-
-| ROCm  | Thrust / CUB | HPC SDK |
-|:------|:------------:|:-------:|
-| 5.0.2 | 1.14         | 21.9       |
-| 5.1.3 | 1.15         | 22.1       |
-| 5.2.x | 1.15         | 22.2, 22.3 |
-| 5.3.x | 1.16         | 22.7       |
-| 5.4.x | 1.16         | 22.9       |
-| 5.5.x | 1.17         | 22.9       |
-| 5.6.x | 1.17.2       | 22.9       |
-| 5.7.x | 1.17.2       | 22.9       |
-
-For the latest documentation of these libraries, refer to [API libraries](../../reference/library-index.md).
--- a/docs/about/compatibility/docker-image-support-matrix.rst
+++ b/docs/about/compatibility/docker-image-support-matrix.rst
@@ -1,130 +0,0 @@
-******************************************************************
-Docker image support matrix
-******************************************************************
-
-AMD validates and publishes `PyTorch <https://hub.docker.com/r/rocm/pytorch>`_ and
-`TensorFlow <https://hub.docker.com/r/rocm/tensorflow>`_ containers on dockerhub. The following
-tags, and associated inventories, are validated with ROCm 5.7.
-
-.. tab-set::
-
-    .. tab-item:: PyTorch
-
-        .. tab-set::
-
-            .. tab-item:: Ubuntu 22.04
-
-                Tag: `rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1 <https://hub.docker.com/layers/rocm/pytorch/rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1/images/sha256-21df283b1712f3d73884b9bc4733919374344ceacb694e8fbc2c50bdd3e767ee>`_
-
-                * Inventory:
-
-                    * `ROCm 5.7 <https://repo.radeon.com/rocm/apt/5.7/>`_
-                    * `Python 3.10 <https://www.python.org/downloads/release/python-31013/>`_
-                    * `Torch 2.0.1 <https://github.com/ROCmSoftwarePlatform/pytorch/tree/release/2.0>`_
-                    * `Apex 0.1 <https://github.com/ROCmSoftwarePlatform/apex/tree/v0.1>`_
-                    * `Torchvision 0.15.0 <https://github.com/pytorch/vision/tree/release/0.15>`_
-                    * `Tensorboard 2.14.0 <https://github.com/tensorflow/tensorboard/tree/2.14>`_
-                    * `MAGMA <https://bitbucket.org/icl/magma/src/master/>`_
-                    * `UCX 1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-                    * `OMPI 4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
-                    * `OFED 5.4.3 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
-
-            .. tab-item:: Ubuntu 20.04
-
-                Tag: `rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_staging <https://hub.docker.com/layers/rocm/pytorch/rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1/images/sha256-4dd86046e5f777f53ae40a75ecfc76a5e819f01f3b2d40eacbb2db95c2f971d4>`_
-
-                * Inventory:
-
-                    * `ROCm 5.7 <https://repo.radeon.com/rocm/apt/5.7/>`_
-                    * `Python 3.9 <https://www.python.org/downloads/release/python-3918/>`_
-                    * `Torch 2.1.0 <https://github.com/ROCmSoftwarePlatform/pytorch/tree/rocm5.7_internal_testing>`_
-                    * `Apex 0.1 <https://github.com/ROCmSoftwarePlatform/apex/tree/v0.1>`_
-                    * `Torchvision 0.16.0 <https://github.com/pytorch/vision/tree/release/0.16>`_
-                    * `Tensorboard 2.14.0 <https://github.com/tensorflow/tensorboard/tree/2.14>`_
-                    * `MAGMA <https://bitbucket.org/icl/magma/src/master/>`_
-                    * `UCX 1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-                    * `OMPI 4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
-                    * `OFED 5.4.3 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
-
-
-                Tag: `Ubuntu rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_1.12.1 <https://hub.docker.com/layers/rocm/pytorch/rocm5.7_ubuntu20.04_py3.9_pytorch_1.12.1/images/sha256-e67db9373c045a7b6defd43cc3d067e7d49fd5d380f3f8582d2fb219c1756e1f>`_
-
-                * Inventory:
-
-                    * `ROCm 5.7 <https://repo.radeon.com/rocm/apt/5.7/>`_
-                    * `Python 3.9 <https://www.python.org/downloads/release/python-3918/>`_
-                    * `Torch 1.12.1 <https://github.com/ROCmSoftwarePlatform/pytorch/tree/release/1.12>`_
-                    * `Apex 0.1 <https://github.com/ROCmSoftwarePlatform/apex/tree/v0.1>`_
-                    * `Torchvision 0.13.1 <https://github.com/pytorch/vision/tree/v0.13.1>`_
-                    * `Tensorboard 2.14.0 <https://github.com/tensorflow/tensorboard/tree/2.14>`_
-                    * `MAGMA <https://bitbucket.org/icl/magma/src/master/>`_
-                    * `UCX 1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-                    * `OMPI 4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
-                    * `OFED 5.4.3 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
-
-                Tag: `Ubuntu rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_1.13.1 <https://hub.docker.com/layers/rocm/pytorch/rocm5.7_ubuntu20.04_py3.9_pytorch_1.13.1/images/sha256-ed99d159026093d2aaf5c48c1e4b0911508773430377051372733f75c340a4c1>`_
-
-                * Inventory:
-
-                    * `ROCm 5.7 <https://repo.radeon.com/rocm/apt/5.7/>`_
-                    * `Python 3.9 <https://www.python.org/downloads/release/python-3918/>`_
-                    * `Torch 1.13.1 <https://github.com/ROCmSoftwarePlatform/pytorch/tree/release/1.13>`_
-                    * `Apex 0.1 <https://github.com/ROCmSoftwarePlatform/apex/tree/v0.1>`_
-                    * `Torchvision 0.14.0 <https://github.com/pytorch/vision/tree/v0.14.0>`_
-                    * `Tensorboard 2.12.0 <https://github.com/tensorflow/tensorboard/tree/2.12.0>`_
-                    * `MAGMA <https://bitbucket.org/icl/magma/src/master/>`_
-                    * `UCX 1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-                    * `OMPI 4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
-                    * `OFED 5.4.3 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
-
-                Tag: `Ubuntu rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1 <https://hub.docker.com/layers/rocm/pytorch/rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1/images/sha256-4dd86046e5f777f53ae40a75ecfc76a5e819f01f3b2d40eacbb2db95c2f971d4>`_
-
-                * Inventory:
-
-                    * `ROCm 5.7 <https://repo.radeon.com/rocm/apt/5.7/>`_
-                    * `Python 3.9 <https://www.python.org/downloads/release/python-3918/>`_
-                    * `Torch 2.0.1 <https://github.com/ROCmSoftwarePlatform/pytorch/tree/release/2.0>`_
-                    * `Apex 0.1 <https://github.com/ROCmSoftwarePlatform/apex/tree/v0.1>`_
-                    * `Torchvision 0.15.2 <https://github.com/pytorch/vision/tree/release/0.15>`_
-                    * `Tensorboard 2.14.0 <https://github.com/tensorflow/tensorboard/tree/2.14>`_
-                    * `MAGMA <https://bitbucket.org/icl/magma/src/master/>`_
-                    * `UCX 1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-                    * `OMPI 4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
-                    * `OFED 5.4.3 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
-
-            .. tab-item:: CentOS 7
-
-                Tag: `rocm/pytorch:rocm5.7_centos7_py3.9_pytorch_staging <https://hub.docker.com/layers/rocm/pytorch/rocm5.7_centos7_py3.9_pytorch_staging/images/sha256-92240cdf0b4aa7afa76fc78be995caa19ee9c54b5c9f1683bdcac28cedb58d2b>`_
-
-                * Inventory:
-
-                * `ROCm 5.7 <https://repo.radeon.com/rocm/yum/5.7/>`_
-                * `Python 3.9 <https://www.python.org/downloads/release/python-3918/>`_
-                * `Torch 2.1.0 <https://github.com/ROCmSoftwarePlatform/pytorch/tree/rocm5.7_internal_testing>`_
-                * `Apex 0.1 <https://github.com/ROCmSoftwarePlatform/apex/tree/v0.1>`_
-                * `Torchvision 0.16.0 <https://github.com/pytorch/vision/tree/release/0.16>`_
-                * `MAGMA <https://bitbucket.org/icl/magma/src/master/>`_
-
-    .. tab-item:: TensorFlow
-
-        .. tab-set::
-
-            .. tab-item:: Ubuntu 20.04
-
-                Tag: `rocm5.7-tf2.12-dev <https://hub.docker.com/layers/rocm/tensorflow/rocm5.7-tf2.12-dev/images/sha256-e0ac4d49122702e5167175acaeb98a79b9500f585d5e74df18facf6b52ce3e59>`_
-
-                * Inventory:
-
-                    * `ROCm 5.7 <https://repo.radeon.com/rocm/apt/5.7/>`_
-                    * `Python 3.9 <https://www.python.org/downloads/release/python-3918/>`_
-                    * `tensorflow-rocm 2.12.1 <https://pypi.org/project/tensorflow-rocm/2.12.1.570/>`_
-                    * `Tensorboard 2.12.3 <https://github.com/tensorflow/tensorboard/tree/2.12>`_
-
-                Tag: `rocm5.7-tf2.13-dev <https://hub.docker.com/layers/rocm/tensorflow/rocm5.7-tf2.13-dev/images/sha256-6f995539eebc062aac2b53db40e2b545192d8b032d0deada8c24c6651a7ac332>`_
-
-                * Inventory:
-
-                    * `ROCm 5.7 <https://repo.radeon.com/rocm/apt/5.7/>`_
-                    * `Python 3.9 <https://www.python.org/downloads/release/python-3918/>`_
-                    * `tensorflow-rocm 2.13.0 <https://pypi.org/project/tensorflow-rocm/2.13.0.570/>`_
-                    * `Tensorboard 2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
--- a/docs/about/compatibility/linux-support.md
+++ b/docs/about/compatibility/linux-support.md
@@ -1,116 +0,0 @@
-# GPU and OS support (Linux)
-
-(linux-support)=
-
-## Supported Linux distributions
-
-AMD ROCm™ Platform supports the following Linux distributions.
-
-::::{tab-set}
-
-:::{tab-item} Supported
-
-| Distribution | Processor Architectures | Validated Kernel | Support |
-| :----------- | :---------------------: | :--------------: | ------: |
-| RHEL 9.2       | x86-64 | 5.14 (5.14.0-284.11.1.el9_2.x86_64)        | ✅ |
-| RHEL 9.1       | x86-64 | 5.14.0-284.11.1.el9_2.x86_64             | ✅ |
-| RHEL 8.8       | x86-64 | 4.18.0-477.el8.x86_64        | ✅ |
-| RHEL 8.7       | x86-64 | 4.18.0-425.10.1.el8_7.x86_64              | ✅ |
-| SLES 15 SP5    | x86-64 |  5.14.21-150500.53-default       | ✅ |
-| SLES 15 SP4    | x86-64 | 5.14.21-150400.24.63-default               | ✅ |
-| Ubuntu 22.04.2 | x86-64 | 5.19.0-45-generic | ✅ |
-| Ubuntu 20.04.5 | x86-64 | 5.15.0-75-generic          | ✅ |
-
-:::{versionadded} 5.6
-
-* RHEL 8.8 and 9.2 support is added.
-* SLES 15 SP5 support is added.
-
-:::
-
-:::{tab-item} Unsupported
-
-| Distribution | Processor Architectures | Validated Kernel | Support |
-| :----------- | :---------------------: | :--------------: | ------: |
-| RHEL 9.0       | x86-64 | 5.14               | ❌ |
-| RHEL 8.6       | x86-64 | 5.14               | ❌ |
-| SLES 15 SP3    | x86-64 | 5.3                | ❌ |
-| Ubuntu 22.04.0 | x86-64 | 5.15 LTS, 5.17 OEM | ❌ |
-| Ubuntu 20.04.4 | x86-64 | 5.13 HWE, 5.13 OEM | ❌ |
-| Ubuntu 22.04.1 | x86-64 | 5.15 LTS           | ❌ |
-
-:::
-
-::::
-
-✅: **Supported** - AMD performs full testing of all ROCm components on distro
-  GA image.
-❌: **Unsupported** - AMD no longer performs builds and testing on these
-  previously supported distro GA images.
-
-## Virtualization support
-
-ROCm supports virtualization for select GPUs only as shown below.
-
-| Hypervisor     | Version  | GPU   | Validated Guest OS (validated kernel)                                            |
-|----------------|----------|-------|----------------------------------------------------------------------------------|
-| VMWare         | ESXi 8   | MI250 | Ubuntu 20.04 (`5.15.0-56-generic`)                                               |
-| VMWare         | ESXi 8   | MI210 | Ubuntu 20.04 (`5.15.0-56-generic`), SLES 15 SP4 (`5.14.21-150400.24.18-default`) |
-| VMWare         | ESXi 7   | MI210 | Ubuntu 20.04 (`5.15.0-56-generic`), SLES 15 SP4 (`5.14.21-150400.24.18-default`) |
-
-## Linux-supported GPUs
-
-The table below shows supported GPUs for Instinct™, Radeon Pro™ and Radeon™
-GPUs. Please click the tabs below to switch between GPU product lines. If a GPU
-is not listed on this table, the GPU is not officially supported by AMD.
-
-:::::{tab-set}
-
-::::{tab-item} AMD Instinct™
-:sync: instinct
-
-| Product Name | Architecture | [LLVM Target](https://www.llvm.org/docs/AMDGPUUsage.html#processors) |Support |
-|:------------:|:------------:|:--------------------------------------------------------------------:|:-------:|
-| AMD Instinct™ MI250X | CDNA2  | gfx90a | ✅ |
-| AMD Instinct™ MI250  | CDNA2  | gfx90a | ✅ |
-| AMD Instinct™ MI210  | CDNA2  | gfx90a | ✅ |
-| AMD Instinct™ MI100  | CDNA   | gfx908 | ✅ |
-| AMD Instinct™ MI50   | GCN5.1 | gfx906 | ✅ |
-| AMD Instinct™ MI25   | GCN5.0 | gfx900 | ❌ |
-
-::::
-
-::::{tab-item} Radeon Pro™
-:sync: radeonpro
-
-| Name | Architecture |[LLVM Target](https://www.llvm.org/docs/AMDGPUUsage.html#processors) | Support|
-|:----:|:------------:|:--------------------------------------------------------------------:|:-------:|
-| AMD Radeon™ Pro W7900   | RDNA3  | gfx1100 | ✅ (Ubuntu 22.04 only)|
-| AMD Radeon™ Pro W6800   | RDNA2  | gfx1030 | ✅ |
-| AMD Radeon™ Pro V620    | RDNA2  | gfx1030 | ✅ |
-| AMD Radeon™ Pro VII     | GCN5.1 | gfx906  | ✅ |
-::::
-
-::::{tab-item} Radeon™
-:sync: radeon
-
-| Name | Architecture    |[LLVM Target](https://www.llvm.org/docs/AMDGPUUsage.html#processors) | Support|
-|:----:|:---------------:|:--------------------------------------------------------------------:|:-------:|
-| AMD Radeon™ RX 7900 XTX | RDNA3 | gfx1100  | ✅ (Ubuntu 22.04 only)|
-| AMD Radeon™ VII        | GCN5.1 | gfx906  | ✅ |
-
-::::
-:::::
-
-### Support status
-
-✅: **Supported** - AMD enables these GPUs in our software distributions for
-  the corresponding ROCm product.
-⚠️: **Deprecated** - Support will be removed in a future release.
-❌: **Unsupported** - This configuration is not enabled in our software
-  distributions.
-
-## CPU support
-
-ROCm requires CPUs that support PCIe™ atomics. Modern CPUs after the release of
-1st generation AMD Zen CPU and Intel™ Haswell support PCIe atomics.
--- a/docs/about/compatibility/openmp.md
+++ b/docs/about/compatibility/openmp.md
@@ -1,3 +1,9 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="OpenMP support in ROCm">
+  <meta name="keywords" content="OpenMP, LLVM, OpenMP toolchain">
+</head>
+
 # OpenMP support in ROCm

 ## Introduction
@@ -9,7 +15,8 @@ Along with host APIs, the OpenMP compilers support offloading code and data onto
 GPU devices. This document briefly describes the installation location of the
 OpenMP toolchain, example usage of device offloading, and usage of `rocprof`
 with OpenMP applications. The GPUs supported are the same as those supported by
-this ROCm release. See the list of supported GPUs for [Linux](../../about/compatibility/linux-support.md) and [Windows](../../about/compatibility/windows-support.md).
+this ROCm release. See the list of supported GPUs for {doc}`Linux<rocm-install-on-linux:reference/system-requirements>` and
+{doc}`Windows<rocm-install-on-windows:reference/system-requirements>`.

 The ROCm OpenMP compiler is implemented using LLVM compiler technology.
 The following image illustrates the internal steps taken to translate a user’s application into an executable that can offload computation to the AMDGPU. The compilation is a two-pass process. Pass 1 compiles the application to generate the CPU code and Pass 2 links the CPU code to the AMDGPU device code.
@@ -41,10 +48,10 @@ cd $ROCM_PATH/share/openmp-extras/examples/openmp/veccopy
 sudo make run
 ```

-```{note}
+:::{note}
 `sudo` is required since we are building inside the `/opt` directory.
 Alternatively, copy the files to your home directory first.
-```
+:::

 The above invocation of Make compiles and runs the program. Note the options
 that are required for target offload from an OpenMP program:
@@ -53,13 +60,15 @@ that are required for target offload from an OpenMP program:
 -fopenmp --offload-arch=<gpu-arch>
 ```

-```{note}
+:::{note}
 The compiler also accepts the alternative offloading notation:

 ```bash
 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=<gpu-arch>
 ```

+:::
+
 Obtain the value of `gpu-arch` by running the following command:

 ```bash
@@ -69,7 +78,7 @@ Obtain the value of `gpu-arch` by running the following command:
 [//]: # (dated link below, needs updating)

 See the complete list of compiler command-line references
-[here](https://github.com/RadeonOpenCompute/llvm-project/blob/amd-stg-open/clang/docs/CommandGuide/clang.rst).
+[here](https://github.com/ROCm/llvm-project/blob/amd-stg-open/clang/docs/CommandGuide/clang.rst).

 ### Using `rocprof` with OpenMP

@@ -321,10 +330,10 @@ double a = 0.0;
 a = a + 1.0;
 ```

-```{note}
+:::{note}
 `AMD_unsafe_fp_atomics` is an alias for `AMD_fast_fp_atomics`, and
 `AMD_safe_fp_atomics` is implemented with a compare-and-swap loop.
-```
+:::

 To disable the generation of fast floating-point atomic instructions at the file
 level, build using the option `-msafe-fp-atomics` or use a hint clause on a
@@ -404,7 +413,7 @@ void  main() {
 ```

 See the complete sample code for heap buffer overflow
-[here](https://github.com/ROCm-Developer-Tools/aomp/blob/aomp-dev/examples/tools/asan/heap_buffer_overflow/openmp/vecadd-HBO.cpp).
+[here](https://github.com/ROCm/aomp/blob/aomp-dev/examples/tools/asan/heap_buffer_overflow/openmp/vecadd-HBO.cpp).

 * Global buffer overflow

@@ -429,7 +438,7 @@ for(int i=0; i<N; i++){
 ```

 See the complete sample code for global buffer overflow
-[here](https://github.com/ROCm-Developer-Tools/aomp/blob/aomp-dev/examples/tools/asan/global_buffer_overflow/openmp/vecadd-GBO.cpp).
+[here](https://github.com/ROCm/aomp/blob/aomp-dev/examples/tools/asan/global_buffer_overflow/openmp/vecadd-GBO.cpp).

 ### Clang compiler option for kernel optimization

--- a/docs/about/compatibility/user-kernel-space-compat-matrix.md
+++ b/docs/about/compatibility/user-kernel-space-compat-matrix.md
@@ -1,24 +0,0 @@
-# User/kernel-space support matrix
-
-ROCm™ provides forward and backward compatibility between the Kernel Fusion
-Driver (KFD) and its user space software for +/- 2 releases. This table shows
-the compatibility combinations that are currently supported.
-
-| KFD   | Tested user space versions |
-|:------|:--------------------------:|
-| 5.0.2 | 5.1.0, 5.2.0               |
-| 5.1.0 | 5.0.2                      |
-| 5.1.3 | 5.2.0, 5.3.0               |
-| 5.2.0 | 5.0.2, 5.1.3               |
-| 5.2.3 | 5.3.0, 5.4.0               |
-| 5.3.0 | 5.1.3, 5.2.3               |
-| 5.3.3 | 5.4.0, 5.5.0               |
-| 5.4.0 | 5.2.3, 5.3.3               |
-| 5.4.3 | 5.5.0, 5.6.0               |
-| 5.4.4 | 5.5.0                      |
-| 5.5.0 | 5.3.3, 5.4.3               |
-| 5.5.1 | 5.6.0, 5.7.0               |
-| 5.6.0 | 5.4.3, 5.5.1               |
-| 5.6.1 | 5.7.0                      |
-| 5.7.0 | 5.5.0, 5.6.1               |
-| 5.7.1 | 5.5.0, 5.6.1               |
--- a/docs/about/compatibility/windows-support.md
+++ b/docs/about/compatibility/windows-support.md
@@ -1,80 +0,0 @@
-# GPU and OS support (Windows)
-
-(windows-support)=
-
-## Supported SKUs
-
-AMD HIP SDK supports the following Windows variants.
-
-| Distribution        |Processor Architectures| Validated update   |
-|---------------------|-----------------------|--------------------|
-| Windows 10          | x86-64                | 22H2 (GA)          |
-| Windows 11          | x86-64                | 22H2 (GA)          |
-| Windows Server 2022 | x86-64                |                    |
-
-## Windows-supported GPUs
-
-The table below shows supported GPUs for Radeon Pro™ and Radeon™ GPUs. Please
-click the tabs below to switch between GPU product lines. If a GPU is not listed
-on this table, the GPU is not officially supported by AMD.
-
-::::{tab-set}
-
-:::{tab-item} Radeon Pro™
-:sync: radeonpro
-
-| Name | Architecture |[LLVM Target](https://www.llvm.org/docs/AMDGPUUsage.html#processors) | Runtime | HIP SDK |
-|:----:|:------------:|:--------------------------------------------------------------------:|:-------:|:----------------:|
-| AMD Radeon Pro™ W7900   | RDNA3  | gfx1100 | ✅ | ✅ |
-| AMD Radeon Pro™ W7800   | RDNA3  | gfx1100 | ✅ | ✅ |
-| AMD Radeon Pro™ W6800   | RDNA2  | gfx1030 | ✅ | ✅ |
-| AMD Radeon Pro™ W6600   | RDNA2  | gfx1032 | ✅ | ❌ |
-| AMD Radeon Pro™ W5500   | RDNA1  | gfx1012 | ❌ | ❌ |
-| AMD Radeon Pro™ VII     | GCN5.1 | gfx906  | ❌ | ❌ |
-
-:::
-
-:::{tab-item} Radeon™
-:sync: radeon
-
-| Name | Architecture | [LLVM Target](https://www.llvm.org/docs/AMDGPUUsage.html#processors) | Runtime | HIP SDK |
-|:----:|:------------:|:--------------------------------------------------------------------:|:-------:|:----------------:|
-| AMD Radeon™ RX 7900 XTX | RDNA3  | gfx1100 | ✅ | ✅ |
-| AMD Radeon™ RX 7900 XT  | RDNA3  | gfx1100 | ✅ | ✅ |
-| AMD Radeon™ RX 7600     | RDNA3  | gfx1102 | ✅ | ✅ |
-| AMD Radeon™ RX 6950 XT  | RDNA2  | gfx1030 | ✅ | ✅ |
-| AMD Radeon™ RX 6900 XT  | RDNA2  | gfx1030 | ✅ | ✅ |
-| AMD Radeon™ RX 6800 XT  | RDNA2  | gfx1030 | ✅ | ✅ |
-| AMD Radeon™ RX 6800     | RDNA2  | gfx1030 | ✅ | ✅ |
-| AMD Radeon™ RX 6750 XT  | RDNA2  | gfx1031 | ✅ | ❌ |
-| AMD Radeon™ RX 6700 XT  | RDNA2  | gfx1031 | ✅ | ❌ |
-| AMD Radeon™ RX 6700     | RDNA2  | gfx1031 | ✅ | ❌ |
-| AMD Radeon™ RX 6650 XT  | RDNA2  | gfx1032 | ✅ | ❌ |
-| AMD Radeon™ RX 6600 XT  | RDNA2  | gfx1032 | ✅ | ❌ |
-| AMD Radeon™ RX 6600     | RDNA2  | gfx1032 | ✅ | ❌ |
-
-:::
-
-::::
-
-### Component support
-
-ROCm components are described in [What is ROCm?](../../what-is-rocm.md) Support
-on Windows is provided with two levels of enablement.
-
-* **Runtime**: Runtime enables the use of the HIP and OpenCL runtimes only.
-* **HIP SDK**: Runtime plus additional components are listed in [Libraries](../../reference/library-index.md).
- Note that some math libraries are Linux exclusive.
-
-### Support status
-
-✅: **Supported** - AMD enables these GPUs in our software distributions for
-  the corresponding ROCm product.
-⚠️: **Deprecated** - Support will be removed in a future release.
-❌: **Unsupported** - This configuration is not enabled in our software
-  distributions.
-
-## CPU support
-
-ROCm requires CPUs that support PCIe™ atomics. Modern CPUs after the release of
-1st generation AMD Zen CPU and Intel™ Haswell support PCIe atomics.
--- a/docs/about/license.md
+++ b/docs/about/license.md
@@ -1,9 +1,146 @@
-# License
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="ROCm licensing terms">
+  <meta name="keywords" content="license, licensing terms">
+</head>

-> Note: This license applies to the [ROCm repository](https://github.com/RadeonOpenCompute/ROCm) that primarily contains documentation. For other licensing information, refer to the [Licensing Terms page](./licensing).
+# ROCm license

 ```{include} ../../LICENSE
 ```

-```{include} ./licensing.md
+:::{note}
+The preceding license applies to the [ROCm repository](https://github.com/ROCm/ROCm), which
+primarily contains documentation. For licenses related to other ROCm components, refer to the
+following section.
+:::
+
+## ROCm component licenses
+
+ROCm is released by Advanced Micro Devices, Inc. (AMD) and is licensed per component separately.
+The following table is a list of ROCm components with links to their respective license
+terms. These components may include third party components subject to
+additional licenses. Please review individual repositories for more information.
+
+<!-- spellcheck-disable -->
+| Component | License |
+|:---------------------|:-------------------------|
+| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
+| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
+| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
+| [AMDMIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
+| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
+| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
+| [AMD Common Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/develop/LICENCE) |
+| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
+| [hipamd](https://github.com/ROCm/clr/tree/develop/hipamd) | [MIT](https://github.com/ROCm/clr/blob/develop/hipamd/LICENSE.txt) |
+| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/develop/opencl) | [MIT](https://github.com/ROCm/clr/blob/develop/opencl/LICENSE.txt) |
+| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
+| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
+| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
+| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
+| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
+| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
+| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
+| [clang-ocl](https://github.com/ROCm/clang-ocl/) | [MIT](https://github.com/ROCm/clang-ocl/blob/master/LICENSE) |
+| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
+| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
+| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/master/LICENSE.txt) |
+| [ROCR Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
+| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
+| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
+| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
+| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
+| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
+| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
+| [hipFORT](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
+| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
+| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
+| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
+| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
+| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
+| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
+| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
+| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
+| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
+| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
+| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
+| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
+| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
+| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
+| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
+| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
+| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
+| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
+| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/develop/LICENSE) |
+| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
+| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
+| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
+| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/develop/License.txt) |
+| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/develop/LICENSE) |
+| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
+| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
+| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
+| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
+| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |
+| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
+| hsa-amd-aqlprofile | [AMD Software EULA](https://www.amd.com/en/legal/eula/amd-software-eula.html) |
+
+Open sourced ROCm components are released via public GitHub
+repositories, packages on [https://repo.radeon.com](https://repo.radeon.com) and other distribution channels.
+Proprietary products are only available on [https://repo.radeon.com](https://repo.radeon.com). Currently, only
+one component of ROCm, `rocm-llvm-alt`, is governed by a proprietary license.
+Proprietary components are organized in a proprietary subdirectory in the package
+repositories to distinguish from open sourced packages.
+
+```{note}
+The following additional terms and conditions apply to your use of ROCm technical documentation.
 ```
+
+©2023 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+The information presented in this document is for informational purposes only
+and may contain technical inaccuracies, omissions, and typographical errors. The
+information contained herein is subject to change and may be rendered inaccurate
+for many reasons, including but not limited to product and roadmap changes,
+component and motherboard version changes, new model and/or product releases,
+product differences between differing manufacturers, software changes, BIOS
+flashes, firmware upgrades, or the like. Any computer system has risks of
+security vulnerabilities that cannot be completely prevented or mitigated. AMD
+assumes no obligation to update or otherwise correct or revise this information.
+However, AMD reserves the right to revise this information and to make changes
+from time to time to the content hereof without obligation of AMD to notify any
+person of such revisions or changes.
+
+THIS INFORMATION IS PROVIDED “AS IS.” AMD MAKES NO REPRESENTATIONS OR WARRANTIES
+WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY
+INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD
+SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT,
+MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE
+LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER
+CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN,
+EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+AMD, the AMD Arrow logo, ROCm, and combinations thereof are trademarks of
+Advanced Micro Devices, Inc. Other product names used in this publication are
+for identification purposes only and may be trademarks of their respective
+companies.
+
+### Package licensing
+
+:::{attention}
+AQL Profiler and AOCC CPU Optimizations are both provided in binary form, each
+subject to the license agreement enclosed in the directory for the binary available
+in `/opt/rocm/share/doc/hsa-amd-aqlprofile/EULA`. By using, installing,
+copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
+the terms and conditions of this license agreement. If you do not agree to the
+terms of this agreement, do not install, copy or use the AQL Profiler and/or the
+AOCC CPU Optimizations.
+:::
+
+For the rest of the ROCm packages, you can find the licensing information at the
+following location: `/opt/rocm/share/doc/<component-name>/` or in the locations
+specified in the preceding table.
+
+For example, you can fetch the licensing information of the `amd_comgr`
+component (Code Object Manager) from the `/opt/rocm/share/doc/amd_comgr/LICENSE.txt` file.
--- a/docs/about/licensing.md
+++ b/docs/about/licensing.md
@@ -1,127 +0,0 @@
-# ROCm licensing terms
-
-ROCm™ is released by Advanced Micro Devices, Inc. and is licensed per component separately.
-The following table is a list of ROCm components with links to their respective license
-terms. These components may include third party components subject to
-additional licenses. Please review individual repositories for more information.
-
-The table shows ROCm components, the name of license, and link to the license terms.
-The table is ordered to follow the ROCm manifest file.
-
-<!-- spellcheck-disable -->
-| Component | License |
-|:---------------------|:-------------------------|
-| [AMDMIGraphX](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/) | [MIT](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/blob/develop/LICENSE) |
-| [HIPCC](https://github.com/ROCm-Developer-Tools/HIPCC/blob/develop/LICENSE.txt) | [MIT](https://github.com/ROCm-Developer-Tools/HIPCC/blob/develop/LICENSE.txt) |
-| [HIPIFY](https://github.com/ROCm-Developer-Tools/HIPIFY/) | [MIT](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/amd-staging/LICENSE.txt) |
-| [HIP](https://github.com/ROCm-Developer-Tools/HIP/) | [MIT](https://github.com/ROCm-Developer-Tools/HIP/blob/develop/LICENSE.txt) |
-| [MIOpenGEMM](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/) | [MIT](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/blob/master/LICENSE.txt) |
-| [MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen/) | [MIT](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/master/LICENSE.txt) |
-| [MIVisionX](https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/) | [MIT](https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/blob/master/LICENSE.txt) |
-| [RCP](https://github.com/GPUOpen-Tools/radeon_compute_profiler/) | [MIT](https://github.com/GPUOpen-Tools/radeon_compute_profiler/blob/master/LICENSE) |
-| [ROCK-Kernel-Driver](https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/master/COPYING) |
-| [ROCR-Runtime](https://github.com/RadeonOpenCompute/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/master/LICENSE.txt) |
-| [ROCT-Thunk-Interface](https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/) | [MIT](https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
-| [ROCclr](https://github.com/ROCm-Developer-Tools/ROCclr/) | [MIT](https://github.com/ROCm-Developer-Tools/ROCclr/blob/develop/LICENSE.txt) |
-| [ROCdbgapi](https://github.com/ROCm-Developer-Tools/ROCdbgapi/) | [MIT](https://github.com/ROCm-Developer-Tools/ROCdbgapi/blob/amd-master/LICENSE.txt) |
-| [ROCgdb](https://github.com/ROCm-Developer-Tools/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm-Developer-Tools/ROCgdb/blob/amd-master/COPYING) |
-| [ROCm-CompilerSupport](https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/blob/amd-stg-open/LICENSE.txt) |
-| [ROCm-Device-Libs](https://github.com/RadeonOpenCompute/ROCm-Device-Libs/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/amd-stg-open/LICENSE.TXT) |
-| [ROCm-OpenCL-Runtime/api/opencl/khronos/icd](https://github.com/KhronosGroup/OpenCL-ICD-Loader/) | [Apache 2.0](https://github.com/KhronosGroup/OpenCL-ICD-Loader/blob/main/LICENSE) |
-| [ROCm-OpenCL-Runtime](https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/) | [MIT](https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/blob/develop/LICENSE.txt) |
-| [ROCmValidationSuite](https://github.com/ROCm-Developer-Tools/ROCmValidationSuite/) | [MIT](https://github.com/ROCm-Developer-Tools/ROCmValidationSuite/blob/master/LICENSE) |
-| [Tensile](https://github.com/ROCmSoftwarePlatform/Tensile/) | [MIT](https://github.com/ROCmSoftwarePlatform/Tensile/blob/develop/LICENSE.md) |
-| [aomp-extras](https://github.com/ROCm-Developer-Tools/aomp-extras/) | [MIT](https://github.com/ROCm-Developer-Tools/aomp-extras/blob/aomp-dev/LICENSE) |
-| [aomp](https://github.com/ROCm-Developer-Tools/aomp/) | [Apache 2.0](https://github.com/ROCm-Developer-Tools/aomp/blob/aomp-dev/LICENSE) |
-| [atmi](https://github.com/RadeonOpenCompute/atmi/) | [MIT](https://github.com/RadeonOpenCompute/atmi/blob/master/LICENSE.txt) |
-| [clang-ocl](https://github.com/RadeonOpenCompute/clang-ocl/) | [MIT](https://github.com/RadeonOpenCompute/clang-ocl/blob/master/LICENSE) |
-| [flang](https://github.com/ROCm-Developer-Tools/flang/) | [Apache 2.0](https://github.com/ROCm-Developer-Tools/flang/blob/master/LICENSE.txt) |
-| [half](https://github.com/ROCmSoftwarePlatform/half/) | [MIT](https://github.com/ROCmSoftwarePlatform/half/blob/master/LICENSE.txt) |
-| [hipBLAS](https://github.com/ROCmSoftwarePlatform/hipBLAS/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipBLAS/blob/develop/LICENSE.md) |
-| [hipCUB](https://github.com/ROCmSoftwarePlatform/hipCUB/) | [Custom](https://github.com/ROCmSoftwarePlatform/hipCUB/blob/develop/LICENSE.txt) |
-| [hipFFT](https://github.com/ROCmSoftwarePlatform/hipFFT/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipFFT/blob/develop/LICENSE.md) |
-| [hipSOLVER](https://github.com/ROCmSoftwarePlatform/hipSOLVER/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipSOLVER/blob/develop/LICENSE.md) |
-| [hipSPARSELt](https://github.com/ROCmSoftwarePlatform/hipSPARSELt/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipSPARSELt/blob/develop/LICENSE.md) |
-| [hipSPARSE](https://github.com/ROCmSoftwarePlatform/hipSPARSE/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/LICENSE.md) |
-| [hipTensor](https://github.com/ROCmSoftwarePlatform/hipTensor) | [MIT](https://github.com/ROCmSoftwarePlatform/hipTensor/blob/develop/LICENSE) |
-| [hipamd](https://github.com/ROCm-Developer-Tools/hipamd/) | [MIT](https://github.com/ROCm-Developer-Tools/hipamd/blob/develop/LICENSE.txt) |
-| [hipfort](https://github.com/ROCmSoftwarePlatform/hipfort/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipfort/blob/master/LICENSE) |
-| [llvm-project](https://github.com/ROCm-Developer-Tools/llvm-project/) | [Apache](https://github.com/ROCm-Developer-Tools/llvm-project/blob/main/LICENSE.TXT) |
-| [rccl](https://github.com/ROCmSoftwarePlatform/rccl/) | [Custom](https://github.com/ROCmSoftwarePlatform/rccl/blob/develop/LICENSE.txt) |
-| [rdc](https://github.com/RadeonOpenCompute/rdc/) | [MIT](https://github.com/RadeonOpenCompute/rdc/blob/master/LICENSE) |
-| [rocALUTION](https://github.com/ROCmSoftwarePlatform/rocALUTION/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocALUTION/blob/develop/LICENSE.md) |
-| [rocBLAS](https://github.com/ROCmSoftwarePlatform/rocBLAS/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/LICENSE.md) |
-| [rocFFT](https://github.com/ROCmSoftwarePlatform/rocFFT/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocFFT/blob/develop/LICENSE.md) |
-| [rocPRIM](https://github.com/ROCmSoftwarePlatform/rocPRIM/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocPRIM/blob/develop/LICENSE.txt) |
-| [rocRAND](https://github.com/ROCmSoftwarePlatform/rocRAND/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocRAND/blob/develop/LICENSE.txt) |
-| [rocSOLVER](https://github.com/ROCmSoftwarePlatform/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCmSoftwarePlatform/rocSOLVER/blob/develop/LICENSE.md) |
-| [rocSPARSE](https://github.com/ROCmSoftwarePlatform/rocSPARSE/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocSPARSE/blob/develop/LICENSE.md) |
-| [rocThrust](https://github.com/ROCmSoftwarePlatform/rocThrust/) | [Apache 2.0](https://github.com/ROCmSoftwarePlatform/rocThrust/blob/develop/LICENSE) |
-| [rocWMMA](https://github.com/ROCmSoftwarePlatform/rocWMMA/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocWMMA/blob/develop/LICENSE.md) |
-| [rocm-cmake](https://github.com/RadeonOpenCompute/rocm-cmake/) | [MIT](https://github.com/RadeonOpenCompute/rocm-cmake/blob/develop/LICENSE) |
-| [rocm_bandwidth_test](https://github.com/RadeonOpenCompute/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/rocm_bandwidth_test/blob/master/LICENSE.txt) |
-| [rocm_smi_lib](https://github.com/RadeonOpenCompute/rocm_smi_lib/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/License.txt) |
-| [rocminfo](https://github.com/RadeonOpenCompute/rocminfo/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/rocminfo/blob/master/License.txt) |
-| [rocprofiler](https://github.com/ROCm-Developer-Tools/rocprofiler/) | [MIT](https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/LICENSE) |
-| [rocr_debug_agent](https://github.com/ROCm-Developer-Tools/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm-Developer-Tools/rocr_debug_agent/blob/master/LICENSE.txt) |
-| [roctracer](https://github.com/ROCm-Developer-Tools/roctracer/) | [MIT](https://github.com/ROCm-Developer-Tools/roctracer/blob/amd-master/LICENSE) |
-| rocm-llvm-alt | [AMD Proprietary License](https://www.amd.com/en/support/amd-software-eula)
-
-Open sourced ROCm components are released via public GitHub
-repositories, packages on https://repo.radeon.com and other distribution channels.
-Proprietary products are only available on https://repo.radeon.com. Currently, only
-one component of ROCm, rocm-llvm-alt is governed by a proprietary license.
-Proprietary components are organized in a proprietary subdirectory in the package
-repositories to distinguish from open sourced packages.
-
-The additional terms and conditions below apply to your use of ROCm technical
-documentation.
-
-©2023 Advanced Micro Devices, Inc. All rights reserved.
-
-The information presented in this document is for informational purposes only
-and may contain technical inaccuracies, omissions, and typographical errors. The
-information contained herein is subject to change and may be rendered inaccurate
-for many reasons, including but not limited to product and roadmap changes,
-component and motherboard version changes, new model and/or product releases,
-product differences between differing manufacturers, software changes, BIOS
-flashes, firmware upgrades, or the like. Any computer system has risks of
-security vulnerabilities that cannot be completely prevented or mitigated. AMD
-assumes no obligation to update or otherwise correct or revise this information.
-However, AMD reserves the right to revise this information and to make changes
-from time to time to the content hereof without obligation of AMD to notify any
-person of such revisions or changes.
-
-THIS INFORMATION IS PROVIDED “AS IS.” AMD MAKES NO REPRESENTATIONS OR WARRANTIES
-WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY
-INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD
-SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT,
-MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE
-LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER
-CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN,
-EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-
-AMD, the AMD Arrow logo, ROCm, and combinations thereof are trademarks of
-Advanced Micro Devices, Inc. Other product names used in this publication are
-for identification purposes only and may be trademarks of their respective
-companies.
-
-## Package licensing
-
-```{attention}
-AQL Profiler and AOCC CPU optimization are both provided in binary form, each
-subject to the license agreement enclosed in the directory for the binary and is
-available here: `/opt/rocm/share/doc/rocm-llvm-alt/EULA`. By using, installing,
-copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
-the terms and conditions of this license agreement. If you do not agree to the
-terms of this agreement, do not install, copy or use the AQL Profiler and/or the
-AOCC CPU Optimizations.
-```
-
-For the rest of the ROCm packages, you can find the licensing information at the
-following location: `/opt/rocm/share/doc/<component-name>/`
-
-For example, you can fetch the licensing information of the `_amd_comgr_`
-component (Code Object Manager) from the `amd_comgr` folder. A file named
-`LICENSE.txt` contains the license details at:
-`/opt/rocm-5.4.3/share/doc/amd_comgr/LICENSE.txt`
--- a/docs/about/whats-new/whats-new.md
+++ b/docs/about/whats-new/whats-new.md
@@ -1,93 +0,0 @@
-# What's new in ROCm?
-
-ROCm is now supported on Windows.
-
-## Windows support
-
-Starting with ROCm 5.5, the HIP SDK brings a subset of ROCm to developers on Windows.
-The collection of features enabled on Windows is referred to as the HIP SDK.
-These features allow developers to use the HIP runtime, HIP math libraries
-and HIP Primitive libraries. The following table shows the differences
-between Windows and Linux releases.
-
-|Component|Linux|Windows|
-|---------|-----|-------|
-|Driver|Radeon Software for Linux |AMD Software Pro Edition|
-|Compiler|`hipcc`/`amdclang++`|`hipcc`/`clang++`|
-|Debugger|`rocgdb`|no debugger available|
-|Profiler|`rocprof`|[Radeon GPU Profiler](https://gpuopen.com/rgp/)|
-|Porting Tools|HIPIFY|Coming Soon|
-|Runtime|HIP (Open Sourced)|HIP (closed source)|
-|Math Libraries|Supported|Supported|
-|Primitives Libraries|Supported|Supported|
-|Communication Libraries|Supported|Not Available|
-|AI Libraries|MIOpen, MIGraphX|Not Available|
-|System Management|`rocm-smi-lib`, RDC, `rocminfo`|`amdsmi`, `hipInfo`|
-|AI Frameworks|PyTorch, TensorFlow, etc.|Not Available|
-|CMake HIP Language|Enabled|Unsupported|
-|Visual Studio| Not applicable| Plugin Available|
-|HIP Ray Tracing| Supported|Supported|
-
-AMD is continuing to invest in Windows support and AMD plans to release enhanced
-features in subsequent revisions.
-
-```{note}
-The 5.5 Windows Installer collectively groups the Math and Primitives
-libraries.
-```
-
-```{note}
-GPU support on Windows and Linux may differ. You must refer to
-Windows and Linux GPU support tables separately.
-```
-
-```{note}
-HIP Ray Tracing is not distributed via ROCm in Linux.
-```
-
-## ROCm release versioning
-
-Linux OS releases set the canonical version numbers for ROCm. Windows will
-follow Linux version numbers as Windows releases are based on Linux ROCm
-releases. However, not all Linux ROCm releases will have a corresponding Windows
-release. The following table shows the ROCm releases on Windows and Linux. Releases
-with both Windows and Linux are referred to as a joint release. Releases with
-only Linux support are referred to as a skipped release from the Windows
-perspective.
-
-|Release version|Linux|Windows|
-|---------------|-----|-------|
-|5.5|✅|✅|
-|5.6|✅|❌|
-
-ROCm Linux releases are versioned with following the Major.Minor.Patch
-version number system. Windows releases will only be versioned with Major.Minor.
-
-In general, Windows releases will trail Linux releases. Software developers that
-wish to support both Linux and Windows using a single ROCm version should
-refrain from upgrading ROCm unless there is a joint release.
-
-## Windows documentation implications
-
-The ROCm documentation website contains both Windows and Linux documentation.
-Just below each article title, a convenient article information section states
-whether the page applies to Linux only, Windows only or both OSes. To find the
-exact Windows documentation for a release of the HIP SDK, please view the ROCm documentation with the same
-Major.Minor version number while ignoring the Patch version. The Patch version
-only matters for Linux releases.  For convenience,
-Windows documentation will continue to be included in the overall ROCm
-documentation for the skipped Windows releases.
-
-Windows release notes will contain only information pertinent to Windows.
-The software developer must read all the previous ROCm release notes (including)
-skipped ROCm versions on Windows for information on all the changes present in
-the Windows release.
-
-## Windows builds from source
-
-Not all source code required to build Windows from source is available under a
-permissive open source license. Build instructions on Windows is only provided
-for projects that can be built from source on Windows using a toolchain that
-has closed source build prerequisites. The ROCm manifest file is not valid for
-Windows. AMD does not release a manifest or tag our components in Windows.
-Users may use corresponding Linux tags to build on Windows.
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -0,0 +1,127 @@
+.. meta::
+    :description: ROCm compatibility matrix
+    :keywords: AMD, GPU, architecture, hardware, compatibility, requirements
+
+**************************************************************************************
+Compatibility matrix
+**************************************************************************************
+
+Use this matrix to view the ROCm compatibility across successive major and minor releases.
+
+
+.. container:: format-big-table
+
+  .. csv-table:: 
+      :header: "ROCm Version", "6.1.0", "6.0.0"
+      :stub-columns: 1
+
+      :doc:`Operating Systems <rocm-install-on-linux:reference/system-requirements>`, "Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3"
+      ,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
+      ,"RHEL 9.4 [#red-hat94]_, 9.3, 9.2","RHEL 9.3, 9.2"
+      ,"RHEL 8.9, 8.8","RHEL 8.9, 8.8"
+      ,"SLES 15 SP5, SP4","SLES 15 SP5, SP4"
+      ,CentOS 7.9,CentOS 7.9
+      ,,
+      :doc:`GFX Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3
+      ,CDNA2,CDNA2
+      ,CDNA,CDNA
+      ,RDNA3,RDNA3
+      ,RDNA2,RDNA2
+      ,,
+      :doc:`GFX Card <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100
+      ,gfx1030,gfx1030
+      ,gfx942 [#]_, gfx942 [#]_
+      ,gfx90a,gfx90a
+      ,gfx908,gfx908
+      ,,
+      ECOSYSTEM SUPPORT:,,
+      :doc:`PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>`,"2.1, 2.0, 1.13","2.1, 2.0, 1.13"
+      :doc:`TensorFlow <rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`,"2.15, 2.14, 2.13","2.14, 2.13, 2.12"
+      :doc:`JAX <rocm-install-on-linux:how-to/3rd-party/jax-install>`,0.4.26,0.4.26
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.17.3,1.14.1
+      ,,
+      3RD PARTY COMMUNICATION LIBS:,,
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.2.0,>=1.2.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.14.1,>=1.14.1
+      ,,
+      3RD PARTY ALGORITHM LIBS:,,
+      Thrust,2.1.0,2.0.1
+      CUB,2.1.0,2.0.1
+      ,,
+      ML & COMPUTER VISION LIBS:,,
+      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0
+      :doc:`MIGraphX <amdmigraphx:index>`,2.9.0,2.8.0
+      :doc:`MIOpen <miopen:index>`,3.1.0,3.0.0
+      :doc:`MIVisionX <mivisionx:index>`,2.5.0,2.5.0
+      :doc:`rocDecode <rocdecode:index>`,0.5.0,N/A
+      :doc:`ROCm Performance Primitives (RPP) <rpp:index>`,1.5.0,1.4.0
+      ,,
+      COMMUNICATION:,,
+      :doc:`RCCL <rccl:index>`,2.18.6,2.18.3
+      ,,
+      MATH LIBS:,,
+      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0
+      :doc:`hipBLAS <hipblas:index>`,2.1.0,2.0.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.7.0,0.6.0
+      :doc:`hipFFT <hipfft:index>`,1.0.14,1.0.13
+      :doc:`hipFORT <hipfort:index>`,0.4.0,0.4.0
+      :doc:`hipRAND <hiprand:index>`,2.10.16,2.10.16
+      :doc:`hipSOLVER <hipsolver:index>`,2.1.0,2.0.0
+      :doc:`hipSPARSE <hipsparse:index>`,3.0.1,3.0.0
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.1.0,0.1.0
+      :doc:`rocALUTION <rocalution:index>`,3.1.1,3.0.3
+      :doc:`rocBLAS <rocblas:index>`,4.1.0,4.0.0
+      :doc:`rocFFT <rocfft:index>`,1.0.27,1.0.23
+      :doc:`rocRAND <rocrand:index>`,3.0.1,2.10.17
+      :doc:`rocSOLVER <rocsolver:index>`,3.25.0,3.24.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.1.2,3.0.2
+      :doc:`rocWMMA <rocwmma:index>`,1.4.0,1.3.0
+      `Tensile <https://github.com/ROCm/Tensile>`_,4.40.0,4.39.0
+      ,,
+      PRIMITIVES:,,
+      :doc:`hipCUB <hipcub:index>`,3.1.0,3.0.0
+      :doc:`hipTensor <hiptensor:index>`,1.2.0,1.1.0
+      :doc:`rocPRIM <rocprim:index>`,3.1.0,3.0.0
+      :doc:`rocThrust <rocthrust:index>`,3.0.1,3.0.0
+      ,,
+      SUPPORT LIBS:,,
+      `hipother <https://github.com/ROCm/hipother>`_,6.1.40091,6.0.32830
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.12.0,0.11.0
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.1.0,6.0.0
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,20240125.3.30,20231016.2.245
+      ,,
+      TOOLS:,,
+      :doc:`AMD SMI <amdsmi:index>`,24.4.1,23.4.2
+      :doc:`HIPIFY <hipify:index>`,17.0.0,17.0.0
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.71.0,0.71.0
+      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60100,2.0.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.3.0,N/A
+      :doc:`ROCTracer <roctracer:index>`,4.1.60100,4.1.0
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0
+      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,14.1.0,13.2.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.0.0,6.0.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,rocm-6.1.0,rocm-6.0.0
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.3,2.0.3
+      :doc:`TransferBench <transferbench:index>`,1.48,1.46
+      ,,
+      COMPILERS:,,
+      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,0.5.0,0.5.0
+      `Flang <https://github.com/ROCm/flang>`_,17.0.0.24103,17.0.0.23483
+      `llvm-project <https://github.com/ROCm/llvm-project>`_,17.0.0.24103,17.0.0.23483
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,17.0.0.24103,17.0.0.23483
+      ,,
+      RUNTIMES:,,
+      :doc:`HIP <hip:index>`,6.1.40091,6.0.32830
+      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0
+      :doc:`ROCR-Runtime <rocr-runtime:index>`,1.13.0,1.12.0
+
+
+.. rubric:: Footnotes
+
+.. [#red-hat94] **For ROCm 6.1** - RHEL 9.4 is supported only on AMD Instinct MI300A.
+.. [#] **For ROCm 6.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
+.. [#] **For ROCm 6.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
+
+
--- a/docs/compatibility/precision-support.rst
+++ b/docs/compatibility/precision-support.rst
@@ -0,0 +1,565 @@
+.. meta::
+  :description: Supported data types in ROCm
+  :keywords: int8, float8, float8 (E4M3), float8 (E5M2), bfloat8, float16, half, bfloat16, tensorfloat32, float,
+   float32, float64, double, AMD, ROCm, AMDGPU
+
+*************************************************************
+Precision support
+*************************************************************
+
+Use the following sections to identify data types and HIP types ROCm™ supports.
+
+Integral types
+==========================================
+
+The signed and unsigned integral types that are supported by ROCm are listed in the following table,
+together with their corresponding HIP type and a short description.
+
+
+.. list-table::
+    :header-rows: 1
+    :widths: 15,35,50
+
+    *
+      - Type name
+      - HIP type
+      - Description
+    *
+      - int8
+      - ``int8_t``, ``uint8_t``
+      - A signed or unsigned 8-bit integer
+    *
+      - int16
+      - ``int16_t``, ``uint16_t``
+      - A signed or unsigned 16-bit integer
+    *
+      - int32
+      - ``int32_t``, ``uint32_t``
+      - A signed or unsigned 32-bit integer
+    *
+      - int64
+      - ``int64_t``, ``uint64_t``
+      - A signed or unsigned 64-bit integer
+
+Floating-point types
+==========================================
+
+The floating-point types that are supported by ROCm are listed in the following table, together with
+their corresponding HIP type and a short description.
+
+.. image:: ../data/about/compatibility/floating-point-data-types.png
+    :alt: Supported floating-point types
+
+.. list-table::
+    :header-rows: 1
+    :widths: 15,15,70
+
+    *
+      - Type name
+      - HIP type
+      - Description
+    *
+      - float8 (E4M3)
+      - ``-``
+      - An 8-bit floating-point number that mostly follows IEEE-754 conventions and **S1E4M3** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_, with expanded range and with no infinity or signed zero. NaN is represented as negative zero.
+    *
+      - float8 (E5M2)
+      - ``-``
+      - An 8-bit floating-point number that mostly follows IEEE-754 conventions and **S1E5M2** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_, with expanded range and with no infinity or signed zero. NaN is represented as negative zero.
+    *
+      - float16
+      - ``half``
+      - A 16-bit floating-point number that conforms to the IEEE 754-2008 half-precision storage format.
+    *
+      - bfloat16
+      - ``bfloat16``
+      - A shortened 16-bit version of the IEEE 754 single-precision storage format.
+    *
+      - tensorfloat32
+      - ``-``
+      - A floating-point number that occupies 32 bits or less of storage, providing improved range compared to half (16-bit) format, at (potentially) greater throughput than single-precision (32-bit) formats.
+    *
+      - float32
+      - ``float``
+      - A 32-bit floating-point number that conforms to the IEEE 754 single-precision storage format.
+    *
+      - float64
+      - ``double``
+      - A 64-bit floating-point number that conforms to the IEEE 754 double-precision storage format.
+
+.. note::
+
+  * The float8 and tensorfloat32 types are internal types used in calculations in Matrix Cores and can be stored in any type of the same size.
+  * The encodings for FP8 (E5M2) and FP8 (E4M3) that are natively supported by MI300 differ from the FP8 (E5M2) and FP8 (E4M3) encodings used in H100 (`FP8 Formats for Deep Learning <https://arxiv.org/abs/2209.05433>`_).
+  * In some AMD documents and articles, float8 (E5M2) is referred to as bfloat8.
+
+ROCm support icons
+==========================================
+
+In the following sections, we use icons to represent the level of support. These icons, described in the
+following table, are also used on the library data type support pages.
+
+.. list-table::
+    :header-rows: 1
+
+    *
+      - Icon
+      - Definition
+    *
+      - ❌
+      - Not supported
+
+    *
+      - ⚠️
+      - Partial support
+
+    *
+      - ✅
+      - Full support
+
+.. note::
+
+  * Full support means that the type is supported natively or with hardware emulation.
+  * Native support means that the operations for that type are implemented in hardware. Types that are not natively supported are emulated with the available hardware. The performance of non-natively supported types can differ from the full instruction throughput rate. For example, 16-bit integer operations can be performed on the 32-bit integer ALUs at full rate; however, 64-bit integer operations might need several instructions on the 32-bit integer ALUs.
+  * Any type can be emulated by software, but this page does not cover such cases.
+
+Hardware type support
+==========================================
+
+AMD GPU hardware support for data types is listed in the following tables.
+
+Compute units support
+-------------------------------------------------------------------------------
+
+The following table lists data type support for compute units.
+
+.. tab-set::
+
+  .. tab-item:: Integral types
+    :sync: integral-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Type name
+        - int8
+        - int16
+        - int32
+        - int64
+      *
+        - MI100
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - MI200 series
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - MI300 series
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+
+  .. tab-item:: Floating-point types
+    :sync: floating-point-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Type name
+        - float8 (E4M3)
+        - float8 (E5M2)
+        - float16
+        - bfloat16
+        - tensorfloat32
+        - float32
+        - float64
+      *
+        - MI100
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ❌
+        - ✅
+        - ✅
+      *
+        - MI200 series
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ❌
+        - ✅
+        - ✅
+      *
+        - MI300 series
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ❌
+        - ✅
+        - ✅
+
+Matrix core support
+-------------------------------------------------------------------------------
+
+The following table lists data type support for AMD GPU matrix cores.
+
+.. tab-set::
+
+  .. tab-item:: Integral types
+    :sync: integral-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Type name
+        - int8
+        - int16
+        - int32
+        - int64
+      *
+        - MI100
+        - ✅
+        - ❌
+        - ❌
+        - ❌
+      *
+        - MI200 series
+        - ✅
+        - ❌
+        - ❌
+        - ❌
+      *
+        - MI300 series
+        - ✅
+        - ❌
+        - ❌
+        - ❌
+
+  .. tab-item:: Floating-point types
+    :sync: floating-point-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Type name
+        - float8 (E4M3)
+        - float8 (E5M2)
+        - float16
+        - bfloat16
+        - tensorfloat32
+        - float32
+        - float64
+      *
+        - MI100
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ❌
+        - ✅
+        - ❌
+      *
+        - MI200 series
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ❌
+        - ✅
+        - ✅
+      *
+        - MI300 series
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+
+Atomic operations support
+-------------------------------------------------------------------------------
+
+The following table lists data type support for atomic operations.
+
+.. tab-set::
+
+  .. tab-item:: Integral types
+    :sync: integral-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Type name
+        - int8
+        - int16
+        - int32
+        - int64
+      *
+        - MI100
+        - ❌
+        - ❌
+        - ✅
+        - ❌
+      *
+        - MI200 series
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+      *
+        - MI300 series
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+
+  .. tab-item:: Floating-point types
+    :sync: floating-point-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Type name
+        - float8 (E4M3)
+        - float8 (E5M2)
+        - float16
+        - bfloat16
+        - tensorfloat32
+        - float32
+        - float64
+      *
+        - MI100
+        - ❌
+        - ❌
+        - ✅
+        - ❌
+        - ❌
+        - ✅
+        - ❌
+      *
+        - MI200 series
+        - ❌
+        - ❌
+        - ✅
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+      *
+        - MI300 series
+        - ❌
+        - ❌
+        - ✅
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+
+.. note::
+
+  For cases that are not natively supported, you can emulate atomic operations using software.
+  Software-emulated atomic operations have a high negative performance impact when they frequently
+  access the same memory address.
+
+Data Type support in ROCm Libraries
+==========================================
+
+ROCm library support for int8, float8 (E4M3), float8 (E5M2), int16, float16, bfloat16, int32,
+tensorfloat32, float32, int64, and float64 is listed in the following tables.
+
+Libraries input/output type support
+-------------------------------------------------------------------------------
+
+The following tables list ROCm library support for specific input and output data types. For a detailed
+description, refer to the corresponding library data type support page.
+
+.. tab-set::
+
+  .. tab-item:: Integral types
+    :sync: integral-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Library input/output data type name
+        - int8
+        - int16
+        - int32
+        - int64
+      *
+        - hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
+        - ✅/✅
+        - ❌/❌
+        - ❌/❌
+        - ❌/❌
+      *
+        - rocRAND (:doc:`details <rocrand:data-type-support>`)
+        - -/✅
+        - -/✅
+        - -/✅
+        - -/✅
+      *
+        - hipRAND (:doc:`details <hiprand:data-type-support>`)
+        - -/✅
+        - -/✅
+        - -/✅
+        - -/✅
+      *
+        - rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
+        - ✅/✅
+        - ✅/✅
+        - ✅/✅
+        - ✅/✅
+      *
+        - hipCUB (:doc:`details <hipcub:data-type-support>`)
+        - ✅/✅
+        - ✅/✅
+        - ✅/✅
+        - ✅/✅
+      *
+        - rocThrust (:doc:`details <rocthrust:data-type-support>`)
+        - ✅/✅
+        - ✅/✅
+        - ✅/✅
+        - ✅/✅
+
+  .. tab-item:: Floating-point types
+    :sync: floating-point-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Library input/output data type name
+        - float8 (E4M3)
+        - float8 (E5M2)
+        - float16
+        - bfloat16
+        - tensorfloat32
+        - float32
+        - float64
+      *
+        - hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
+        - ❌/❌
+        - ❌/❌
+        - ✅/✅
+        - ✅/✅
+        - ❌/❌
+        - ❌/❌
+        - ❌/❌
+      *
+        - rocRAND (:doc:`details <rocrand:data-type-support>`)
+        - -/❌
+        - -/❌
+        - -/✅
+        - -/❌
+        - -/❌
+        - -/✅
+        - -/✅
+      *
+        - hipRAND (:doc:`details <hiprand:data-type-support>`)
+        - -/❌
+        - -/❌
+        - -/✅
+        - -/❌
+        - -/❌
+        - -/✅
+        - -/✅
+      *
+        - rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
+        - ❌/❌
+        - ❌/❌
+        - ✅/✅
+        - ✅/✅
+        - ❌/❌
+        - ✅/✅
+        - ✅/✅
+      *
+        - hipCUB (:doc:`details <hipcub:data-type-support>`)
+        - ❌/❌
+        - ❌/❌
+        - ✅/✅
+        - ✅/✅
+        - ❌/❌
+        - ✅/✅
+        - ✅/✅
+      *
+        - rocThrust (:doc:`details <rocthrust:data-type-support>`)
+        - ❌/❌
+        - ❌/❌
+        - ⚠️/⚠️
+        - ⚠️/⚠️
+        - ❌/❌
+        - ✅/✅
+        - ✅/✅
+
+
+Libraries internal calculations type support
+-------------------------------------------------------------------------------
+
+The following tables list ROCm library support for specific internal data types. For a detailed
+description, refer to the corresponding library data type support page.
+
+.. tab-set::
+
+  .. tab-item:: Integral types
+    :sync: integral-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Library internal data type name
+        - int8
+        - int16
+        - int32
+        - int64
+      *
+        - hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
+        - ❌
+        - ❌
+        - ✅
+        - ❌
+
+
+  .. tab-item:: Floating-point types
+    :sync: floating-point-type
+
+    .. list-table::
+      :header-rows: 1
+
+      *
+        - Library internal data type name
+        - float8 (E4M3)
+        - float8 (E5M2)
+        - float16
+        - bfloat16
+        - tensorfloat32
+        - float32
+        - float64
+      *
+        - hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ✅
+        - ❌
--- a/docs/conceptual/More-about-how-ROCm-uses-PCIe-Atomics.rst
+++ b/docs/conceptual/More-about-how-ROCm-uses-PCIe-Atomics.rst
@@ -1,36 +1,61 @@
-===========================
-How ROCm uses PCIe atomics
-===========================
+.. meta::
+   :description: How ROCm uses PCIe atomics
+   :keywords: PCIe, PCIe atomics, atomics, BAR memory, AMD, ROCm

+*****************************************************************************
+How ROCm uses PCIe atomics
+*****************************************************************************

 ROCm PCIe feature and overview of BAR memory
-======================================================================
+================================================================

+ROCm is an extension of HSA platform architecture, so it shares the queuing model, memory model,
+signaling and synchronization protocols. Platform atomics are integral to perform queuing and
+signaling memory operations where there may be multiple-writers across CPU and GPU agents.

-ROCm is an extension of HSA platform architecture, so it shares the queueing model, memory model, signaling and synchronization protocols. Platform atomics are integral to perform queuing and signaling memory operations where there may be multiple-writers across CPU and GPU agents.
+The full list of HSA system architecture platform requirements is here:
+`HSA Sys Arch Features <http://hsafoundation.com/wp-content/uploads/2021/02/HSA-SysArch-1.2.pdf>`_.

-The full list of HSA system architecture platform requirements are here: `HSA Sys Arch Features <http://hsafoundation.com/wp-content/uploads/2021/02/HSA-SysArch-1.2.pdf>`_.
+AMD ROCm Software uses the new PCI Express 3.0 (Peripheral Component Interconnect Express [PCIe]
+3.0) features for atomic read-modify-write transactions which extends inter-processor synchronization
+mechanisms to IO to support the defined set of HSA capabilities needed for queuing and signaling
+memory operations.

-The ROCm Platform uses the new PCI Express 3.0 (PCIe 3.0) features for Atomic Read-Modify-Write Transactions which extends inter-processor synchronization mechanisms to IO to support the defined set of HSA capabilities needed for queuing and signaling memory operations.
-
-The new PCIe AtomicOps operate as completers for ``CAS`` (Compare and Swap), ``FetchADD``, ``SWAP`` atomics. The AtomicsOps are initiated by the
-I/O device which support 32-bit, 64-bit and 128-bit operand which target address have to be naturally aligned to operation sizes.
+The new PCIe atomic operations operate as completers for ``CAS`` (Compare and Swap), ``FetchADD``,
+and ``SWAP`` atomics. The atomic operations are initiated by I/O devices that support 32-bit, 64-bit,
+and 128-bit operands whose target addresses have to be naturally aligned to the operation sizes.

 For ROCm the Platform atomics are used in ROCm in the following ways:

-   * Update HSA queue’s read_dispatch_id: 64 bit atomic add used by the command processor on the GPU agent to update the packet ID it 	  processed.
-   * Update HSA queue’s write_dispatch_id: 64 bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions.
-   * Update HSA Signals – 64bit atomic ops are used for CPU & GPU synchronization.
+  * Update HSA queue's read_dispatch_id: 64 bit atomic add used by the command processor on the
+    GPU agent to update the packet ID it processed.
+  * Update HSA queue's write_dispatch_id: 64 bit atomic add used by the CPU and GPU agent to
+    support multi-writer queue insertions.
+  * Update HSA Signals -- 64bit atomic ops are used for CPU & GPU synchronization.

-The PCIe 3.0 AtomicOp feature allows atomic transactions to be requested by, routed through and completed by PCIe components. Routing and completion does not require software support. Component support for each is detectable via the DEVCAP2 register. Upstream bridges need to have AtomicOp routing enabled or the Atomic Operations will fail even though PCIe endpoint and PCIe I/O devices has the capability to Atomics Operations.
+The PCIe 3.0 atomic operations feature allows atomic transactions to be requested by, routed through
+and completed by PCIe components. Routing and completion do not require software support.
+Component support for each is detectable via the Device Capabilities 2 (DevCap2) register. Upstream
+bridges need to have atomic operations routing enabled or the atomic operations will fail even though
+PCIe endpoint and PCIe I/O devices have the capability to perform atomic operations.

-To do AtomicOp routing capability between two or more Root Ports, each associated Root Port must indicate that capability via the AtomicOp routing supported bit in the Device Capabilities 2 register.
+To support atomic operations routing between two or more Root Ports, each associated Root Port
+must indicate that capability via the atomic operations routing supported bit in the DevCap2 register.

-If your system has a PCIe Express Switch it needs to support AtomicsOp routing. AtomicOp requests are permitted only if a component’s ``DEVCTL2.ATOMICOP_REQUESTER_ENABLE`` field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 Routing is supported, AtomicOp Routing Support=0 routing is not supported.
+If your system has a PCI Express switch, it needs to support atomic operations routing. Atomic
+operations requests are permitted only if a component's ``DEVCTL2.ATOMICOP_REQUESTER_ENABLE``
+field is set. These requests can only be serviced if the upstream components support atomic operation
+completion and/or routing to a component which does. Atomic operations routing support=1, routing
+is supported; atomic operations routing support=0, routing is not supported.

-An atomic operation is a non-posted transaction supporting 32-bit and 64-bit address formats, there must be a response for Completion containing the result of the operation. Errors associated with the operation (uncorrectable error accessing the target location or carrying out the Atomic operation) are signaled to the requester by setting the Completion Status field in the completion descriptor, they are set to to Completer Abort (CA) or Unsupported Request (UR).
+An atomic operation is a non-posted transaction supporting 32-bit and 64-bit address formats; there
+must be a response for Completion containing the result of the operation. Errors associated with the
+operation (uncorrectable error accessing the target location or carrying out the atomic operation) are
+signaled to the requester by setting the Completion Status field in the completion descriptor to
+Completer Abort (CA) or Unsupported Request (UR).

-To understand more about how PCIe atomic operations work, see `PCIe atomics <https://pcisig.com/specifications/pciexpress/specifications/ECN_Atomic_Ops_080417.pdf>`_
+To understand more about how PCIe atomic operations work, see
+`PCIe atomics <https://pcisig.com/specifications/pciexpress/specifications/ECN_Atomic_Ops_080417.pdf>`_

 `Linux Kernel Patch to pci_enable_atomic_request <https://patchwork.kernel.org/project/linux-pci/patch/1443110390-4080-1-git-send-email-jay@jcornwall.me/>`_

@@ -38,57 +63,60 @@ There are also a number of papers which talk about these new capabilities:

  * `Atomic Read Modify Write Primitives by Intel <https://www.intel.es/content/dam/doc/white-paper/atomic-read-modify-write-primitives-i-o-devices-paper.pdf>`_
  * `PCI express 3 Accelerator White paper by Intel <https://www.intel.sg/content/dam/doc/white-paper/pci-express3-accelerator-white-paper.pdf>`_
-  * `Intel PCIe Generation 3 Hotchips Paper <https://www.hotchips.org/wp-content/uploads/hc_archives/hc21/1_sun/HC21.23.1.SystemInterconnectTutorial-Epub/HC21.23.131.Ajanovic-Intel-PCIeGen3.pdf>`_
-  * `PCIe Generation 4 Base Specification includes Atomics Operation <https://astralvx.com/storage/2020/11/PCI_Express_Base_4.0_Rev0.3_February19-2014.pdf>`_
+  * `PCIe Generation 4 Base Specification includes atomic operations <https://astralvx.com/storage/2020/11/PCI_Express_Base_4.0_Rev0.3_February19-2014.pdf>`_
+  * `Xilinx PCIe Ultrascale White paper <https://docs.xilinx.com/v/u/8OZSA2V1b1LLU2rRCDVGQw>`_

-Other I/O devices with PCIe atomics support
+Other I/O devices with PCIe atomics support:

-   * `Mellanox ConnectX-5 InfiniBand Card <http://www.mellanox.com/related-docs/prod_adapter_cards/PB_ConnectX-5_VPI_Card.pdf>`_
-   * `Cray Aries Interconnect <http://www.hoti.org/hoti20/slides/Bob_Alverson.pdf>`_
-   * `Xilinx PCIe Ultrascale White paper <https://docs.xilinx.com/v/u/8OZSA2V1b1LLU2rRCDVGQw>`_
-   * `Xilinx 7 Series Devices <https://docs.xilinx.com/v/u/1nfXeFNnGpA0ywyykvWHWQ>`_
+  * Mellanox ConnectX-5 InfiniBand Card
+  * Cray Aries Interconnect
+  * Xilinx 7 Series Devices

 Future bus technology with richer I/O atomics operation Support

  * GenZ

-New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPU’s with PCIe Generation 3.0 support.
+New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPUs
+with PCIe Generation 3.0 support.

-  * `Mellanox Bluefield SOC <https://docs.nvidia.com/networking/display/BlueFieldSWv25111213/BlueField+Software+Overview>`_
-  * `Cavium Thunder X2 <https://en.wikichip.org/wiki/cavium/thunderx2>`_
+  * Mellanox Bluefield SOC
+  * Cavium Thunder X2

-In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets:
+In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU
+originates two writes to two different targets:

-  | 1. write to another GPU memory,
+* Write to another GPU memory
+* Write to system memory to indicate transfer complete

-  | 2. then write to system memory to indicate transfer complete.
-
-They are routed off to different ends of the computer but we want to make sure the write to system memory to indicate transfer complete occurs AFTER P2P write to GPU has complete.
+They are routed off to different ends of the computer but we want to make sure the write to system
+memory to indicate transfer complete occurs AFTER the P2P write to the GPU has completed.

 BAR memory overview
-***************************************************************************************************
-On a Xeon E5 based system in the BIOS we can turn on above 4GB PCIe addressing, if so he need to set MMIO Base address ( MMIOH Base) and Range ( MMIO High Size) in the BIOS.
+----------------------------------------------------------------------------------------------------
+On a Xeon E5 based system, we can turn on above 4GB PCIe addressing in the BIOS. If so, we need to set
+memory-mapped input/output (MMIO) base address (MMIOH base) and range (MMIO high size) in the BIOS.

-In SuperMicro system in the system bios you need to see the following
+On a Supermicro system, you need to see the following in the system BIOS:

-   * Advanced->PCIe/PCI/PnP configuration-> Above 4G Decoding = Enabled
+  * Advanced->PCIe/PCI/PnP Configuration-\>Above 4G Decoding = Enabled
+  * Advanced->PCIe/PCI/PnP Configuration-\>MMIOH Base = 512G
+  * Advanced->PCIe/PCI/PnP Configuration-\>MMIO High Size = 256G

-   * Advanced->PCIe/PCI/PnP Configuration->MMIOH Base = 512G
-
-   * Advanced->PCIe/PCI/PnP Configuration->MMIO High Size = 256G
-
-When we support Large Bar Capability there is a Large Bar Vbios which also disable the IO bar.
+When we support Large BAR Capability there is a Large BAR VBIOS which also disables the IO BAR.

 For GFX9 and Vega10 which have Physical Address up 44 bit and 48 bit Virtual address.

-   * BAR0-1 registers: 64bit, prefetchable, GPU memory. 8GB or 16GB depending on Vega10 SKU. Must be placed < 2^44 to support P2P  	access from other Vega10.
-   * BAR2-3 registers: 64bit, prefetchable, Doorbell. Must be placed < 2^44 to support P2P access from other Vega10.
-   * BAR4 register: Optional, not a boot device.
-   * BAR5 register: 32bit, non-prefetchable, MMIO. Must be placed < 4GB.
+  * BAR0-1 registers: 64bit, prefetchable, GPU memory. 8GB or 16GB depending on Vega10 SKU. Must
+    be placed \< 2^44 to support P2P access from other Vega10.
+  * BAR2-3 registers: 64bit, prefetchable, Doorbell. Must be placed \< 2^44 to support P2P access from
+    other Vega10.
+  * BAR4 register: Optional, not a boot device.
+  * BAR5 register: 32bit, non-prefetchable, MMIO. Must be placed \< 4GB.

-Here is how our base address register (BAR) works on GFX 8 GPU’s with 40 bit Physical Address Limit ::
+Here is how our base address register (BAR) works on GFX 8 GPUs with 40 bit Physical Address Limit ::

-  11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO Series] (rev c1)
+  11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO
+  Series] (rev c1)

  Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0b35

@@ -106,40 +134,23 @@ Here is how our base address register (BAR) works on GFX 8 GPU’s with 40 bit P

 Legend:

-1 : GPU Frame Buffer BAR – In this example it happens to be 256M, but typically this will be size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs.
+1 : GPU Frame Buffer BAR -- In this example it happens to be 256M, but typically this will be size of the
+GPU memory (typically 4GB+). This BAR has to be placed \< 2^40 to allow peer-to-peer access from
+other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed \< 2^44 to allow peer-to-peer
+access from other GFX9 AMD GPUs.

-2 : Doorbell BAR – The size of the BAR is typically will be < 10MB (currently fixed at 2MB) for this generation GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs.
+2 : Doorbell BAR -- The size of the BAR will typically be \< 10MB (currently fixed at 2MB) for this
+generation of GPUs. This BAR has to be placed \< 2^40 to allow peer-to-peer access from other current
+generation AMD GPUs.

-3 : IO BAR - This is for legacy VGA and boot device support, but since this the GPUs in this project are not VGA devices (headless), this is not a concern even if the SBIOS does not setup.
+3 : IO BAR -- This is for legacy VGA and boot device support, but since the GPUs in this project are
+not VGA devices (headless), this is not a concern even if the SBIOS does not set it up.

-4 : MMIO BAR – This is required for the AMD Driver SW to access the configuration registers. Since the reminder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB.
+4 : MMIO BAR -- This is required for the AMD Driver SW to access the configuration registers. Since the
+remainder of the BAR available is only 1 DWORD (32bit), this is placed \< 4GB. This is fixed at 256KB.

-5 : Expansion ROM – This is required for the AMD Driver SW to access the GPU’s video-bios. This is currently fixed at 128KB.
+5 : Expansion ROM -- This is required for the AMD Driver SW to access the GPU video-bios. This is
+currently fixed at 128KB.

-Excerpts from 'Overview of Changes to PCI Express 3.0'
-================================================================
-By Mike Jackson, Senior Staff Architect, MindShare, Inc.
-***************************************************************************************************
-Atomic operations – goal:
-***************************************************************************************************
-Support SMP-type operations across a PCIe network to allow for things like offloading tasks between CPU cores and accelerators like a GPU. The spec says this enables advanced synchronization mechanisms that are particularly useful with multiple producers or consumers that need to be synchronized in a non-blocking fashion. Three new atomic non-posted requests were added, plus the corresponding completion (the address must be naturally aligned with the operand size or the TLP is malformed):
-
-  * Fetch and Add – uses one operand as the “add” value. Reads the target location, adds the operand, and then writes the result back 	  to the original location.
-
-  * Unconditional Swap – uses one operand as the “swap” value. Reads the target location and then writes the swap value to it.
-
-  * Compare and Swap – uses 2 operands: first data is compare value, second is swap value. Reads the target location, checks it     	against the compare value and, if equal, writes the swap value to the target location.
-
-  * AtomicOpCompletion – new completion to give the result so far atomic request and indicate that the atomicity of the transaction 	has been maintained.
-
-Since atomic operations are not locked they don't have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide “lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic.” The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation.
-
-Atomic operations can go from device to device, device to host, or host to device. Each completer indicates whether it supports this capability and guarantees atomic access if it does. The ability to route atomic operations is also indicated in the registers for a given port.
-
-ID-based ordering – goal:
-***************************************************************************************************
-Improve performance by avoiding stalls caused by ordering rules. For example, posted writes are never normally allowed to pass each other in a queue, but if they are requested by different functions, we can have some confidence that the requests are not dependent on each other. The previously reserved Attribute bit [2] is now combined with the RO bit to indicate ID ordering with or without relaxed ordering.
-
-This only has meaning for memory requests, and is reserved for Configuration or IO requests. Completers are not required to copy this bit into a completion, and only use the bit if their enable bit is set for this operation.
-
-To read more on PCIe Gen 3 new options https://www.mindshare.com/files/resources/PCIe%203-0.pdf
+For more information, you can review
+`Overview of Changes to PCI Express 3.0 <https://www.mindshare.com/files/resources/PCIe%203-0.pdf>`_.
--- a/docs/conceptual/ai-migraphx-optimization.md
+++ b/docs/conceptual/ai-migraphx-optimization.md
@@ -1,3 +1,10 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="Inference optimization with MIGraphX">
+  <meta name="keywords" content="Inference optimization, MIGraphX, deep-learning, MIGraphX
+  installation, AMD, ROCm">
+</head>
+
 # Inference optimization with MIGraphX

 The following sections cover inferencing and introduces [MIGraphX](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/).
@@ -48,15 +55,15 @@ The header files and libraries are installed under `/opt/rocm-\<version\>`, wher

 There are two ways to build the MIGraphX sources.

-* [Use the ROCm build tool](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-the-rocm-build-tool-rbuild) - This approach uses `[rbuild](https://github.com/RadeonOpenCompute/rbuild)` to install the prerequisites and build the libraries with just one command.
+* [Use the ROCm build tool](https://github.com/ROCm/AMDMIGraphX#use-the-rocm-build-tool-rbuild) - This approach uses [`rbuild`](https://github.com/ROCm/rbuild) to install the prerequisites and build the libraries with just one command.

  or

-* [Use CMake](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-cmake-to-build-migraphx) - This approach uses a script to install the prerequisites, then uses CMake to build the source.
+* [Use CMake](https://github.com/ROCm/AMDMIGraphX#use-cmake-to-build-migraphx) - This approach uses a script to install the prerequisites, then uses CMake to build the source.

 For detailed steps on building from source and installing dependencies, refer to the following `README` file:

-[https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#building-from-source](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#building-from-source)
+[https://github.com/ROCm/AMDMIGraphX#building-from-source](https://github.com/ROCm/AMDMIGraphX#building-from-source)

 ### Option 3: use docker

@@ -65,7 +72,7 @@ To use Docker, follow these steps:
 1. The easiest way to set up the development environment is to use Docker. To build Docker from scratch, first clone the MIGraphX repository by running:

    ```bash
-    git clone --recursive https://github.com/ROCmSoftwarePlatform/AMDMIGraphX
+    git clone --recursive https://github.com/ROCm/AMDMIGraphX
    ```

 2. The repository contains a Dockerfile from which you can build a Docker image as:
@@ -209,23 +216,23 @@ Follow these steps:
    ./inception_inference
    ```

-```{note}
+:::{note}
    Set `LD_LIBRARY_PATH` to `/opt/rocm/lib` if required during the build. Additional examples can be found in the MIGraphX repository under the `/examples/` directory.
-```
+:::

 ## Tuning MIGraphX

 MIGraphX uses MIOpen kernels to target AMD GPU. For the model compiled with MIGraphX, tune MIOpen to pick the best possible kernel implementation. The MIOpen tuning results in a significant performance boost. Tuning can be done by setting the environment variable `MIOPEN_FIND_ENFORCE=3`.

-```{note}
+:::{note}
    The tuning process can take a long time to finish.
-```
+:::

 **Example:** The average inference time of the inception model example shown previously over 100 iterations using untuned kernels is 0.01383ms. After tuning, it reduces to 0.00459ms, which is a 3x improvement. This result is from ROCm v4.5 on a MI100 GPU.

-```{note}
+:::{note}
    The results may vary depending on the system configurations.
-```
+:::

 For reference, the following code snippet shows inference runs for only the first 10 iterations for both tuned and untuned kernels:

--- a/docs/conceptual/ai-pytorch-inception.md
+++ b/docs/conceptual/ai-pytorch-inception.md
@@ -1,3 +1,10 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="Inception V3 with PyTorch">
+  <meta name="keywords" content="PyTorch, Inception V3, deep-learning, training data, optimization
+  algorithm, AMD, ROCm">
+</head>
+
 # Deep learning: Inception V3 with PyTorch

 ## Deep learning training
@@ -15,6 +22,7 @@ Training occurs in multiple phases for every batch of training data. the followi
 :::{table} Types of Training Phases
 :name: training-phases
 :widths: auto
+
 | Types of Phases   |     |
 | ----------------- | --- |
 | Forward Pass      | The input features are fed into the model, whose parameters may be randomly initialized initially. Activations (outputs) of each layer are retained during this pass to help in the loss gradient computation during the backward pass. |
@@ -28,6 +36,7 @@ Training is different from inference, particularly from the hardware perspective
 :::{table} Training vs. Inference
 :name: training-inference
 :widths: auto
+
 | Training | Inference |
 | ----------- | ----------- |
 | Training is measured in hours/days. | The inference is measured in minutes. |
@@ -36,7 +45,7 @@ Training is different from inference, particularly from the hardware perspective
 | Data for training is available on the disk before the training process and is generally significant. The training performance is measured by how fast the data batches can be processed. | Inference data usually arrive stochastically, which may be batched to improve performance. Inference performance is generally measured in throughput speed to process the batch of data and the delay in responding to the input (latency). |
 :::

-Different quantization data types are typically chosen between training (FP32, BF16) and inference (FP16, INT8). The computation hardware has different specializations from other datatypes, leading to improvement in performance if a faster datatype can be selected for the corresponding task.
+Different quantization data types are typically chosen between training (FP32, BF16) and inference (FP16, INT8). The computation hardware has different specializations from other data types, leading to improvement in performance if a faster data type can be selected for the corresponding task.

 ## Case studies

@@ -56,7 +65,7 @@ This example is adapted from the PyTorch research hub page on [Inception V3](htt

 Follow these steps:

-1. Run the PyTorch ROCm-based Docker image or refer to the section [Installing PyTorch](../install/pytorch-install.md) for setting up a PyTorch environment on ROCm.
+1. Run the PyTorch ROCm-based Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.

    ```dockerfile
    docker run -it -v $HOME:/data --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G rocm/pytorch:latest
@@ -146,7 +155,7 @@ The previous section focused on downloading and using the Inception V3 model for

 Follow these steps:

-1. Run the PyTorch ROCm Docker image or refer to the section [Installing PyTorch](../install/pytorch-install.md) for setting up a PyTorch environment on ROCm.
+1. Run the PyTorch ROCm Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.

    ```dockerfile
    docker pull rocm/pytorch:latest
@@ -208,9 +217,9 @@ Follow these steps:

 7. Set parameters to guide the training process.

-    ```{note}
+    :::{note}
    The device is set to `"cuda"`. In PyTorch, `"cuda"` is a generic keyword to denote a GPU.
-    ```
+    :::

    ```py
    device = "cuda"
@@ -270,9 +279,9 @@ Follow these steps:
    lr_gamma = 0.1
    ```

-    ```{note}
+    :::{note}
    One training epoch is when the neural network passes an entire dataset forward and backward.
-    ```
+    :::

    ```py
    epochs = 90
@@ -333,9 +342,9 @@ Follow these steps:
    )
    ```

-    ```{note}
+    :::{note}
    Use torchvision to obtain the Inception V3 model. Use the pre-trained model weights to speed up training.
-    ```
+    :::

    ```py
    print("Creating model")
@@ -672,7 +681,7 @@ The dataset has 60,000 images you will use to train the network and 10,000 to ev

 Access the source code from the following repository:

-[https://github.com/ROCmSoftwarePlatform/tensorflow_fashionmnist/blob/main/fashion_mnist.py](https://github.com/ROCmSoftwarePlatform/tensorflow_fashionmnist/blob/main/fashion_mnist.py)
+[https://github.com/ROCm/tensorflow_fashionmnist/blob/main/fashion_mnist.py](https://github.com/ROCm/tensorflow_fashionmnist/blob/main/fashion_mnist.py)

 To understand the code step by step, follow these steps:

@@ -869,7 +878,7 @@ To understand the code step by step, follow these steps:
        thisplot[true_label].set_color('blue')
        ```

-    9. With the model trained, you can use it to make predictions about some images. Review the 0-th image predictions and the prediction array. Correct prediction labels are blue, and incorrect prediction labels are red. The number gives the percentage (out of 100) for the predicted label.
+    9. With the model trained, you can use it to make predictions about some images. Review the 0<sup>th</sup> image predictions and the prediction array. Correct prediction labels are blue, and incorrect prediction labels are red. The number gives the percentage (out of 100) for the predicted label.

        ```py
        i = 0
@@ -1155,9 +1164,10 @@ To prepare the data for training, follow these steps:
    print("Accuracy: ", accuracy)
    ```

-    ```{note}
-    model.fit() returns a History object that contains a dictionary with everything that happened during training.
-    ```
+    :::{note}
+    `model.fit()` returns a History object that contains a dictionary with everything that happened during
+    training.
+    :::

    ```py
    history_dict = history.history
--- a/docs/conceptual/cmake-packages.rst
+++ b/docs/conceptual/cmake-packages.rst
@@ -1,34 +1,40 @@
-***********
+.. meta::
+   :description: Using CMake
+   :keywords: CMake, dependencies, HIP, C++, AMD, ROCm
+
+*********************************
 Using CMake
-***********
+*********************************

 Most components in ROCm support CMake. Projects depending on header-only or
 library components typically require CMake 3.5 or higher whereas those wanting
-to make use of CMake's HIP language support will require CMake 3.21 or higher.
+to make use of the CMake HIP language support will require CMake 3.21 or higher.

 Finding dependencies
 ====================

 .. note::
-   For a complete
-   reference on how to deal with dependencies in CMake, refer to the CMake docs
-   on `find_package
-   <https://cmake.org/cmake/help/latest/command/find_package.html>`_ and the
-   `Using Dependencies Guide
-   <https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html>`_
-   to get an overview of CMake's related facilities.
+
+  For a complete
+  reference on how to deal with dependencies in CMake, refer to the CMake docs
+  on `find_package
+  <https://cmake.org/cmake/help/latest/command/find_package.html>`_ and the
+  `Using Dependencies Guide
+  <https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html>`_
+  to get an overview of CMake-related facilities.

 In short, CMake supports finding dependencies in two ways:

-*  In Module mode, it consults a file ``Find<PackageName>.cmake`` which tries to
-   find the component in typical install locations and layouts. CMake ships a
-   few dozen such scripts, but users and projects may ship them as well.
-*  In Config mode, it locates a file named ``<packagename>-config.cmake`` or
-   ``<PackageName>Config.cmake`` which describes the installed component in all
-   regards needed to consume it.
+* In Module mode, it consults a file ``Find<PackageName>.cmake`` which tries to find the component
+  in typical install locations and layouts. CMake ships a few dozen such scripts, but users and projects
+  may ship them as well.
+
+* In Config mode, it locates a file named ``<packagename>-config.cmake`` or
+  ``<PackageName>Config.cmake`` which describes the installed component in all regards needed to
+  consume it.

 ROCm predominantly relies on Config mode, one notable exception being the Module
-driving the compilation of HIP programs on Nvidia runtimes. As such, when
+driving the compilation of HIP programs on NVIDIA runtimes. As such, when
 dependencies are not found in standard system locations, one either has to
 instruct CMake to search for package config files in additional folders using
 the ``CMAKE_PREFIX_PATH`` variable (a semi-colon separated list of file system
@@ -40,9 +46,9 @@ it to your CMake configuration command on the command line via
 ``-D CMAKE_PREFIX_PATH=....`` . AMD packaged ROCm installs can typically be
 added to the config file search paths such as:

-  Windows: ``-D CMAKE_PREFIX_PATH=${env:HIP_PATH}``
+*  Windows: ``-D CMAKE_PREFIX_PATH=${env:HIP_PATH}``

-  Linux: ``-D CMAKE_PREFIX_PATH=/opt/rocm``
+*  Linux: ``-D CMAKE_PREFIX_PATH=/opt/rocm``

 ROCm provides the respective *config-file* packages, and this enables
 ``find_package`` to be used directly. ROCm does not require any Find module as
@@ -50,14 +56,16 @@ the *config-file* packages are shipped with the upstream projects, such as
 rocPRIM and other ROCm libraries.

 For a complete guide on where and how ROCm may be installed on a system, refer
-to the installation guides for `Linux <../install/linux/install.html>`_ and
-`Windows <../install/windows/install.html>`_.
+to the installation guides for
+`Linux <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html>`_
+and
+`Windows <https://rocm.docs.amd.com/projects/install-on-windows/en/latest/index.html>`_.

 Using HIP in CMake
 ==================

 ROCm components providing a C/C++ interface support consumption via any
-C/C++ toolchain that CMake knows how to drive. ROCm also supports CMake's HIP
+C/C++ toolchain that CMake knows how to drive. ROCm also supports the CMake HIP
 language features, allowing users to program using the HIP single-source
 programming model. When a program (or translation-unit) uses the HIP API without
 compiling any GPU device code, HIP can be treated in CMake as a simple C/C++
@@ -70,22 +78,22 @@ Source code written in the HIP dialect of C++ typically uses the `.hip`
 extension. When the HIP CMake language is enabled, it will automatically
 associate such source files with the HIP toolchain being used.

-::
+.. code-block:: cmake

-    cmake_minimum_required(VERSION 3.21) # HIP language support requires 3.21
-    cmake_policy(VERSION 3.21.3...3.27)
-    project(MyProj LANGUAGES HIP)
-    add_executable(MyApp Main.hip)
+  cmake_minimum_required(VERSION 3.21) # HIP language support requires 3.21
+  cmake_policy(VERSION 3.21.3...3.27)
+  project(MyProj LANGUAGES HIP)
+  add_executable(MyApp Main.hip)

 Should you have existing CUDA code that is from the source compatible subset of
 HIP, you can tell CMake that despite their `.cu` extension, they're HIP sources.
 Do note that this mostly facilitates compiling kernel code-only source files,
 as host-side CUDA API won't compile in this fashion.

-::
+.. code-block:: cmake

-    add_library(MyLib MyLib.cu)
-    set_source_files_properties(MyLib.cu PROPERTIES LANGUAGE HIP)
+  add_library(MyLib MyLib.cu)
+  set_source_files_properties(MyLib.cu PROPERTIES LANGUAGE HIP)

 CMake itself only hosts part of the HIP language support, such as defining
 HIP-specific properties, etc. while the other half ships with the HIP
@@ -97,6 +105,10 @@ there's a catch-all, last resort variable consulted locating this file,
 ``-D CMAKE_HIP_COMPILER_ROCM_ROOT:PATH=`` which should be set the root of the
 ROCm installation.

+.. note::
+    Imported targets defined by ``hip-lang-config.cmake`` are for internal use
+    only.
+
 If the user doesn't provide a semi-colon delimited list of device architectures
 via ``CMAKE_HIP_ARCHITECTURES``, CMake will select some sensible default. It is
 advised though that if a user knows what devices they wish to target, then set
@@ -110,45 +122,57 @@ Illustrated in the example below is a C++ application using MIOpen from CMake.
 It calls ``find_package(miopen)``, which provides the ``MIOpen`` imported
 target. This can be linked with ``target_link_libraries``

-::
+.. code-block:: cmake

-    cmake_minimum_required(VERSION 3.5) # find_package(miopen) requires 3.5
-    cmake_policy(VERSION 3.5...3.27)
-    project(MyProj LANGUAGES CXX)
-    find_package(miopen)
-    add_library(MyLib ...)
-    target_link_libraries(MyLib PUBLIC MIOpen)
+  cmake_minimum_required(VERSION 3.5) # find_package(miopen) requires 3.5
+  cmake_policy(VERSION 3.5...3.27)
+  project(MyProj LANGUAGES CXX)
+  find_package(miopen)
+  add_library(MyLib ...)
+  target_link_libraries(MyLib PUBLIC MIOpen)

 .. note::
-    Most libraries are designed as host-only API, so using a GPU device
-    compiler is not necessary for downstream projects unless they use GPU device
-    code.
+
+  Most libraries are designed as host-only API, so using a GPU device
+  compiler is not necessary for downstream projects unless they use GPU device
+  code.

 Consuming the HIP API in C++ code
 ---------------------------------

-Use the HIP API without compiling the GPU device code. As there is no GPU code,
-any C or C++ compiler can be used. The ``find_package(hip)`` provides the
-``hip::host`` imported target to use HIP in this context.
+Consuming the HIP API without compiling single-source GPU device code can be
+done using any C++ compiler. The ``find_package(hip)`` provides the
+``hip::host`` imported target to use HIP in this scenario.

-::
+.. code-block:: cmake

-    cmake_minimum_required(VERSION 3.5) # find_package(hip) requires 3.5
-    cmake_policy(VERSION 3.5...3.27)
-    project(MyProj LANGUAGES CXX)
-    find_package(hip REQUIRED)
-    add_executable(MyApp ...)
-    target_link_libraries(MyApp PRIVATE hip::host)
+  cmake_minimum_required(VERSION 3.5) # find_package(hip) requires 3.5
+  cmake_policy(VERSION 3.5...3.27)
+  project(MyProj LANGUAGES CXX)
+  find_package(hip REQUIRED)
+  add_executable(MyApp ...)
+  target_link_libraries(MyApp PRIVATE hip::host)
+
+When mixing such ``CXX`` sources with ``HIP`` sources holding device code, link
+only to ``hip::host``. If HIP sources don't have ``.hip`` as their extension, use
+``set_source_files_properties(<hip_sources>... PROPERTIES LANGUAGE HIP)`` on them.
+Linking to ``hip::host`` will set all the necessary flags for the ``CXX`` sources
+while ``HIP`` sources inherit all flags from the built-in language support.
+Having HIP sources in a target will turn the |LINK_LANG|_ into ``HIP``.
+
+.. |LINK_LANG| replace:: ``LINKER_LANGUAGE``
+.. _LINK_LANG: https://cmake.org/cmake/help/latest/prop_tgt/LINKER_LANGUAGE.html

 Compiling device code in C++ language mode
 ------------------------------------------

 .. attention::
-    The workflow detailed here is considered legacy and is shown for
-    understanding's sake. It pre-dates the existence of HIP language support in
-    CMake. If source code has HIP device code in it, it is a HIP source file
-    and should be compiled as such. Only resort to the method below if your
-    HIP-enabled CMake codepath can't mandate CMake version 3.21.
+
+  The workflow detailed here is considered legacy and is shown for
+  understanding's sake. It pre-dates the existence of HIP language support in
+  CMake. If source code has HIP device code in it, it is a HIP source file
+  and should be compiled as such. Only resort to the method below if your
+  HIP-enabled CMake code path can't mandate CMake version 3.21.

 If code uses the HIP API and compiles GPU device code, it requires using a
 device compiler. The compiler for CMake can be set using either the
@@ -160,34 +184,34 @@ compiler that supports AMD GPU targets, which is usually Clang.
 The ``find_package(hip)`` provides the ``hip::device`` imported target to add
 all the flags necessary for device compilation.

-::
+.. code-block:: cmake

-    cmake_minimum_required(VERSION 3.8) # cxx_std_11 requires 3.8
-    cmake_policy(VERSION 3.8...3.27)
-    project(MyProj LANGUAGES CXX)
-    find_package(hip REQUIRED)
-    add_library(MyLib ...)
-    target_link_libraries(MyLib PRIVATE hip::device)
-    target_compile_features(MyLib PRIVATE cxx_std_11)
+  cmake_minimum_required(VERSION 3.8) # cxx_std_11 requires 3.8
+  cmake_policy(VERSION 3.8...3.27)
+  project(MyProj LANGUAGES CXX)
+  find_package(hip REQUIRED)
+  add_library(MyLib ...)
+  target_link_libraries(MyLib PRIVATE hip::device)
+  target_compile_features(MyLib PRIVATE cxx_std_11)

 .. note::
-    Compiling for the GPU device requires at least C++11.

-This project can then be configured with for eg.
+  Compiling for the GPU device requires at least C++11.

-  Windows: ``cmake -D CMAKE_CXX_COMPILER:PATH=${env:HIP_PATH}\bin\clang++.exe``
+This project can then be configured with the following CMake commands:

-  Linux: ``cmake -D CMAKE_CXX_COMPILER:PATH=/opt/rocm/bin/amdclang++``
+*  Windows: ``cmake -D CMAKE_CXX_COMPILER:PATH=${env:HIP_PATH}\bin\clang++.exe``
+*  Linux: ``cmake -D CMAKE_CXX_COMPILER:PATH=/opt/rocm/bin/amdclang++``

 Which use the device compiler provided from the binary packages of
-`ROCm HIP SDK <https://www.amd.com/en/developer/rocm-hub.html>`_ and
+`ROCm HIP SDK <https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html>`_ and
 `repo.radeon.com <https://repo.radeon.com>`_ respectively.

-When using the CXX language support to compile HIP device code, selecting the
+When using the ``CXX`` language support to compile HIP device code, selecting the
 target GPU architectures is done via setting the ``GPU_TARGETS`` variable.
 ``CMAKE_HIP_ARCHITECTURES`` only exists when the HIP language is enabled. By
 default, this is set to some subset of the currently supported architectures of
-AMD ROCm. It can be set to eg. ``-D GPU_TARGETS="gfx1032;gfx1035"``.
+AMD ROCm. It can be set with a CMake option such as ``-D GPU_TARGETS="gfx1032;gfx1035"``.

 ROCm CMake packages
 -------------------
@@ -252,13 +276,12 @@ options.

 IDEs supporting CMake (Visual Studio, Visual Studio Code, CLion, etc.) all came
 up with their own way to register command-line fragments of different purpose in
-a setup'n'forget fashion for quick assembly using graphical front-ends. This is
+a setup-and-forget fashion for quick assembly using graphical front-ends. This is
 all nice, but configurations aren't portable, nor can they be reused in
-Continuous Intergration (CI) pipelines. CMake has condensed existing practice
+Continuous Integration (CI) pipelines. CMake has condensed existing practice
 into a portable JSON format that works in all IDEs and can be invoked from any
 command line. This is
-`CMake Presets <https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html>`_
-.
+`CMake Presets <https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html>`_.

 There are two types of preset files: one supplied by the project, called
 ``CMakePresets.json`` which is meant to be committed to version control,
@@ -275,109 +298,110 @@ Following is an example ``CMakeUserPresets.json`` file which actually compiles
 the `amd/rocm-examples <https://github.com/amd/rocm-examples>`_ suite of sample
 applications on a typical ROCm installation:

-::
+.. code-block:: json

-    {
-      "version": 3,
-      "cmakeMinimumRequired": {
-        "major": 3,
-        "minor": 21,
-        "patch": 0
+  {
+    "version": 3,
+    "cmakeMinimumRequired": {
+      "major": 3,
+      "minor": 21,
+      "patch": 0
+    },
+    "configurePresets": [
+      {
+        "name": "layout",
+        "hidden": true,
+        "binaryDir": "${sourceDir}/build/${presetName}",
+        "installDir": "${sourceDir}/install/${presetName}"
      },
-      "configurePresets": [
-        {
-          "name": "layout",
-          "hidden": true,
-          "binaryDir": "${sourceDir}/build/${presetName}",
-          "installDir": "${sourceDir}/install/${presetName}"
-        },
-        {
-          "name": "generator-ninja-multi-config",
-          "hidden": true,
-          "generator": "Ninja Multi-Config"
-        },
-        {
-          "name": "toolchain-makefiles-c/c++-amdclang",
-          "hidden": true,
-          "cacheVariables": {
-            "CMAKE_C_COMPILER": "/opt/rocm/bin/amdclang",
-            "CMAKE_CXX_COMPILER": "/opt/rocm/bin/amdclang++",
-            "CMAKE_HIP_COMPILER": "/opt/rocm/bin/amdclang++"
-          }
-        },
-        {
-          "name": "clang-strict-iso-high-warn",
-          "hidden": true,
-          "cacheVariables": {
-            "CMAKE_C_FLAGS": "-Wall -Wextra -pedantic",
-            "CMAKE_CXX_FLAGS": "-Wall -Wextra -pedantic",
-            "CMAKE_HIP_FLAGS": "-Wall -Wextra -pedantic"
-          }
-        },
-        {
-          "name": "ninja-mc-rocm",
-          "displayName": "Ninja Multi-Config ROCm",
-          "inherits": [
-            "layout",
-            "generator-ninja-multi-config",
-            "toolchain-makefiles-c/c++-amdclang",
-            "clang-strict-iso-high-warn"
-          ]
+      {
+        "name": "generator-ninja-multi-config",
+        "hidden": true,
+        "generator": "Ninja Multi-Config"
+      },
+      {
+        "name": "toolchain-makefiles-c/c++-amdclang",
+        "hidden": true,
+        "cacheVariables": {
+          "CMAKE_C_COMPILER": "/opt/rocm/bin/amdclang",
+          "CMAKE_CXX_COMPILER": "/opt/rocm/bin/amdclang++",
+          "CMAKE_HIP_COMPILER": "/opt/rocm/bin/amdclang++"
        }
-      ],
-      "buildPresets": [
-        {
-          "name": "ninja-mc-rocm-debug",
-          "displayName": "Debug",
-          "configuration": "Debug",
-          "configurePreset": "ninja-mc-rocm"
-        },
-        {
-          "name": "ninja-mc-rocm-release",
-          "displayName": "Release",
-          "configuration": "Release",
-          "configurePreset": "ninja-mc-rocm"
-        },
-        {
-          "name": "ninja-mc-rocm-debug-verbose",
-          "displayName": "Debug (verbose)",
-          "configuration": "Debug",
-          "configurePreset": "ninja-mc-rocm",
-          "verbose": true
-        },
-        {
-          "name": "ninja-mc-rocm-release-verbose",
-          "displayName": "Release (verbose)",
-          "configuration": "Release",
-          "configurePreset": "ninja-mc-rocm",
-          "verbose": true
+      },
+      {
+        "name": "clang-strict-iso-high-warn",
+        "hidden": true,
+        "cacheVariables": {
+          "CMAKE_C_FLAGS": "-Wall -Wextra -pedantic",
+          "CMAKE_CXX_FLAGS": "-Wall -Wextra -pedantic",
+          "CMAKE_HIP_FLAGS": "-Wall -Wextra -pedantic"
        }
-      ],
-      "testPresets": [
-        {
-          "name": "ninja-mc-rocm-debug",
-          "displayName": "Debug",
-          "configuration": "Debug",
-          "configurePreset": "ninja-mc-rocm",
-          "execution": {
-            "jobs": 0
-          }
-        },
-        {
-          "name": "ninja-mc-rocm-release",
-          "displayName": "Release",
-          "configuration": "Release",
-          "configurePreset": "ninja-mc-rocm",
-          "execution": {
-            "jobs": 0
-          }
+      },
+      {
+        "name": "ninja-mc-rocm",
+        "displayName": "Ninja Multi-Config ROCm",
+        "inherits": [
+          "layout",
+          "generator-ninja-multi-config",
+          "toolchain-makefiles-c/c++-amdclang",
+          "clang-strict-iso-high-warn"
+        ]
+      }
+    ],
+    "buildPresets": [
+      {
+        "name": "ninja-mc-rocm-debug",
+        "displayName": "Debug",
+        "configuration": "Debug",
+        "configurePreset": "ninja-mc-rocm"
+      },
+      {
+        "name": "ninja-mc-rocm-release",
+        "displayName": "Release",
+        "configuration": "Release",
+        "configurePreset": "ninja-mc-rocm"
+      },
+      {
+        "name": "ninja-mc-rocm-debug-verbose",
+        "displayName": "Debug (verbose)",
+        "configuration": "Debug",
+        "configurePreset": "ninja-mc-rocm",
+        "verbose": true
+      },
+      {
+        "name": "ninja-mc-rocm-release-verbose",
+        "displayName": "Release (verbose)",
+        "configuration": "Release",
+        "configurePreset": "ninja-mc-rocm",
+        "verbose": true
+      }
+    ],
+    "testPresets": [
+      {
+        "name": "ninja-mc-rocm-debug",
+        "displayName": "Debug",
+        "configuration": "Debug",
+        "configurePreset": "ninja-mc-rocm",
+        "execution": {
+          "jobs": 0
        }
-      ]
-    }
+      },
+      {
+        "name": "ninja-mc-rocm-release",
+        "displayName": "Release",
+        "configuration": "Release",
+        "configurePreset": "ninja-mc-rocm",
+        "execution": {
+          "jobs": 0
+        }
+      }
+    ]
+  }

 .. note::
-    Getting presets to work reliably on Windows requires some CMake improvements
-    and/or support from compiler vendors. (Refer to
-    `Add support to the Visual Studio generators <https://gitlab.kitware.com/cmake/cmake/-/issues/24245>`_
-    and `Sourcing environment scripts <https://gitlab.kitware.com/cmake/cmake/-/issues/21619>`_
-    .)
+
+  Getting presets to work reliably on Windows requires some CMake improvements
+  and/or support from compiler vendors. (Refer to
+  `Add support to the Visual Studio generators <https://gitlab.kitware.com/cmake/cmake/-/issues/24245>`_
+  and
+  `Sourcing environment scripts <https://gitlab.kitware.com/cmake/cmake/-/issues/21619>`_.)
--- a/docs/conceptual/compiler-disambiguation.md
+++ b/docs/conceptual/compiler-disambiguation.md
@@ -1,3 +1,9 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="ROCm compilers disambiguation">
+  <meta name="keywords" content="compilers, compiler naming, AMD, ROCm">
+</head>
+
 # ROCm compilers disambiguation

 ROCm ships multiple compilers of varying origins and purposes. This article
@@ -7,9 +13,9 @@ disambiguates compiler naming used throughout the documentation.

 | Term | Description |
 | - | - |
-| `amdclang++` | Clang/LLVM-based compiler that is part of `rocm-llvm` package. The source code is available at <a href="https://github.com/RadeonOpenCompute/llvm-project" target="_blank">https://github.com/RadeonOpenCompute/llvm-project</a>. |
+| `amdclang++` | Clang/LLVM-based compiler that is part of `rocm-llvm` package. The source code is available at <a href="https://github.com/ROCm/llvm-project" target="_blank">https://github.com/ROCm/llvm-project</a>. |
 | AOCC | Closed-source clang-based compiler that includes additional CPU optimizations. Offered as part of ROCm via the `rocm-llvm-alt` package. See for details, <a href="https://developer.amd.com/amd-aocc/" target="_blank">https://developer.amd.com/amd-aocc/</a>. |
 | HIP-Clang | Informal term for the `amdclang++` compiler |
-| HIPIFY | Tools including `hipify-clang` and `hipify-perl`, used to automatically translate CUDA source code into portable HIP C++. The source code is available at <a href="https://github.com/ROCm-Developer-Tools/HIPIFY" target="_blank">https://github.com/ROCm-Developer-Tools/HIPIFY</a> |
-| `hipcc` | HIP compiler driver. A utility that invokes `clang` or `nvcc` depending on the target and passes the appropriate include and library options for the target compiler and HIP infrastructure. The source code is available at <a href="https://github.com/ROCm-Developer-Tools/HIPCC" target="_blank">https://github.com/ROCm-Developer-Tools/HIPCC</a>. |
+| HIPIFY | Tools including `hipify-clang` and `hipify-perl`, used to automatically translate CUDA source code into portable HIP C++. The source code is available at <a href="https://github.com/ROCm/HIPIFY" target="_blank">https://github.com/ROCm/HIPIFY</a>. |
+| `hipcc` | HIP compiler driver. A utility that invokes `clang` or `nvcc` depending on the target and passes the appropriate include and library options for the target compiler and HIP infrastructure. The source code is available at <a href="https://github.com/ROCm/HIPCC" target="_blank">https://github.com/ROCm/HIPCC</a>. |
 | ROCmCC | Clang/LLVM-based compiler. ROCmCC in itself is not a binary but refers to the overall compiler. |
--- a/docs/conceptual/compiler-topics.md
+++ b/docs/conceptual/compiler-topics.md
@@ -0,0 +1,14 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="AMD ROCm documentation">
+  <meta name="keywords" content="documentation, guides, installation, compatibility, support,
+  reference, ROCm, AMD">
+</head>
+
+# Using compiler features
+
+The following topics describe using specific features of the compilation tools:
+
+* [Using AddressSanitizer](./using-gpu-sanitizer.md)
+* [Compiler disambiguation](./compiler-disambiguation.md)
+* [OpenMP support in ROCm](../about/compatibility/openmp.md)
--- a/docs/conceptual/file-reorg.md
+++ b/docs/conceptual/file-reorg.md
@@ -1,8 +1,15 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="ROCm Linux Filesystem Hierarchy Standard reorganization">
+  <meta name="keywords" content="FHS, Linux Filesystem Hierarchy Standard, directory structure,
+  AMD, ROCm">
+</head>
+
 # ROCm Linux Filesystem Hierarchy Standard reorganization

 ## Introduction

-The ROCm platform has adopted the Linux Filesystem Hierarchy Standard (FHS) [https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html](https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html) in order to to ensure ROCm is consistent with standard open source conventions. The following sections specify how current and future releases of ROCm adhere to FHS, how the previous ROCm file system is supported, and how improved versioning specifications are applied to ROCm.
+ROCm software has adopted the Linux Filesystem Hierarchy Standard (FHS) [https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html](https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html) in order to ensure ROCm is consistent with standard open source conventions. The following sections specify how current and future releases of ROCm adhere to FHS, how the previous ROCm file system is supported, and how improved versioning specifications are applied to ROCm.

 ## Adopting the FHS

@@ -152,7 +159,7 @@ correct header file and use correct search paths.

 ## Changes in versioning specifications

-In order to better manage ROCm dependencies specification and allow smoother releases of ROCm while avoiding dependency conflicts, the ROCm platform shall adhere to the following scheme when numbering and incrementing ROCm files versions:
+In order to better manage ROCm dependency specifications and allow smoother releases of ROCm while avoiding dependency conflicts, ROCm software shall adhere to the following scheme when numbering and incrementing ROCm file versions:

 rocm-\<ver\>, where \<ver\> = \<x.y.z\>

--- a/docs/conceptual/gpu-arch.md
+++ b/docs/conceptual/gpu-arch.md
@@ -1,26 +1,47 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="GPU architecture">
+  <meta name="keywords" content="GPU architecture, architecture support, MI200, MI250, RDNA,
+  MI100, AMD Instinct">
+</head>
+
+(gpu-arch-documentation)=
+
 # GPU architecture documentation

 :::::{grid} 1 1 2 2
 :gutter: 1

+:::{grid-item-card}
+**AMD Instinct MI300 series**
+
+Review hardware aspects of the AMD Instinct™ MI300 series of GPU accelerators and the CDNA™ 3
+architecture.
+
+* [AMD Instinct™ MI300 microarchitecture](./gpu-arch/mi300.md)
+* [AMD Instinct MI300/CDNA3 ISA](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf)
+* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf)
+* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
+:::
+
 :::{grid-item-card}
 **AMD Instinct MI200 series**

-Review hardware aspects of the AMD Instinct™ MI200 series of GPU
-accelerators and the CDNA™ 2 architecture.
+Review hardware aspects of the AMD Instinct™ MI200 series of GPU accelerators and the CDNA™ 2
+architecture.

 * [AMD Instinct™ MI250 microarchitecture](./gpu-arch/mi250.md)
 * [AMD Instinct MI200/CDNA2 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
 * [White paper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf)
-* [Performance counters](./gpu-arch/mi200-performance-counters.md)
+* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)

 :::

 :::{grid-item-card}
 **AMD Instinct MI100**

-Review hardware aspects of the AMD Instinct™ MI100
-accelerators and the CDNA™ 1 architecture that is the foundation of these GPUs.
+Review hardware aspects of the AMD Instinct™ MI100 series of GPU accelerators and the CDNA™ 1
+architecture.

 * [AMD Instinct™ MI100 microarchitecture](./gpu-arch/mi100.md)
 * [AMD Instinct MI100/CDNA1 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
--- a/docs/conceptual/gpu-arch/mi100.md
+++ b/docs/conceptual/gpu-arch/mi100.md
@@ -1,3 +1,9 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="AMD Instinct MI100 microarchitecture">
+  <meta name="keywords" content="Instinct, MI100, microarchitecture, AMD, ROCm">
+</head>
+
 # AMD Instinct™ MI100 microarchitecture

 The following image shows the node-level architecture of a system that
--- a/docs/conceptual/gpu-arch/mi200-performance-counters.md
+++ b/docs/conceptual/gpu-arch/mi200-performance-counters.md
@@ -1,455 +0,0 @@
-# MI200 performance counters and metrics
-<!-- markdownlint-disable no-duplicate-header -->
-
-This document lists and describes the hardware performance counters and the derived metrics available on the AMD Instinct™ MI200 GPU. All hardware performance monitors and the derived performance metrics are accessible via the AMD ROCm™ Profiler tool.
-
-## MI200 performance counters list
-
-```{note}
-Preliminary validation of all MI200 performance counters is in progress. Those with “[*]” appended to the names require further evaluation.
-```
-
-### GRBM
-
-#### GRBM counters
-
-| Hardware Counter   | Unit   | Definition |
-|--------------------|--------| ------------------------------------------------------|
-| `grbm_count`       | Cycles | Free-running GPU clock |
-| `grbm_gui_active`  | Cycles | GPU active cycles |
-| `grbm_cp_busy`     | Cycles | Any of the command processor (CPC/CPF) blocks are busy. |
-| `grbm_spi_busy`    | Cycles | Any of the shader processor inputs (SPI) are busy in the shader engine(s). |
-| `grbm_ta_busy`     | Cycles | Any of the texture addressing units are busy in the shader engine(s). |
-| `grbm_tc_busy`     | Cycles | Any of the texture cache blocks (TCP/TCI/TCA/TCC) are busy. |
-| `grbm_cpc_busy`    | Cycles | The command processor - compute (CPC) is busy. |
-| `grbm_cpf_busy`    | Cycles | The command processor - fetcher (CPF) is busy. |
-| `grbm_utcl2_busy`  | Cycles | The unified translation cache - level 2 (UTCL2) block is busy. |
-| `grbm_ea_busy`     | Cycles | The efficiency arbiter (EA) block is busy. |
-
-### Command processor
-
-The command processor counters are further classified into fetcher and compute.
-
-#### CPF
-
-##### CPF counters
-
-| Hardware Counter                     | Unit   | Definition                                                   |
-|--------------------------------------|--------|--------------------------------------------------------------|
-| `cpf_cmp_utcl1_stall_on_translation` | Cycles | One of the compute UTCL1s is stalled waiting on translation. |
-| `cpf_cpf_stat_idle[∗]`               | Cycles | CPF idle                                                   |
-| `cpf_cpf_stat_stall`                 | Cycles | CPF stall                                                  |
-| `cpf_cpf_tciu_busy`                  | Cycles | CPF TCIU interface busy                                    |
-| `cpf_cpf_tciu_idle`                  | Cycles | CPF TCIU interface idle                                    |
-| `cpf_cpf_tciu_stall[∗]`              | Cycles | CPF TCIU interface is stalled waiting on free tags.        |
-
-#### CPC
-
-##### CPC counters
-
-| Hardware Counter                 | Unit   | Definition                                          |
-| ---------------------------------| -------| --------------------------------------------------- |
-| `cpc_me1_busy_for_packet_decode` | Cycles | CPC ME1 busy decoding packets                       |
-| `cpc_utcl1_stall_on_translation` | Cycles | One of the UTCL1s is stalled waiting on translation |
-| `cpc_cpc_stat_busy`              | Cycles | CPC busy                                            |
-| `cpc_cpc_stat_idle`              | Cycles | CPC idle                                            |
-| `cpc_cpc_stat_stall`             | Cycles | CPC stalled                                         |
-| `cpc_cpc_tciu_busy`              | Cycles | CPC TCIU interface busy                             |
-| `cpc_cpc_tciu_idle`              | Cycles | CPC TCIU interface idle                             |
-| `cpc_cpc_utcl2iu_busy`           | Cycles | CPC UTCL2 interface busy                            |
-| `cpc_cpc_utcl2iu_idle`           | Cycles | CPC UTCL2 interface idle                            |
-| `cpc_cpc_utcl2iu_stall[∗]`       | Cycles | CPC UTCL2 interface stalled waiting                 |
-| `cpc_me1_dci0_spi_busy`          | Cycles | CPC ME1 Processor busy                              |
-
-### SPI
-
-#### SPI counters
-
-| Hardware Counter             | Unit        | Definition                                                   |
-| :----------------------------| :-----------| -----------------------------------------------------------: |
-| `spi_csn_busy`                 | Cycles      | Number of clocks with outstanding waves                      |
-| `spi_csn_window_valid`         | Cycles      | Clock count enabled by perfcounter_start event               |
-| `spi_csn_num_threadgroups`     | Workgroups  | Total number of dispatched workgroups                        |
-| `spi_csn_wave`                 | Wavefronts  | Total number of dispatched wavefronts                        |
-| `spi_ra_req_no_alloc`          | Cycles      | Arb cycles with requests but no allocation (need to multiply this value by 4) |
-| `spi_ra_req_no_alloc_csn`      | Cycles      | Arb cycles with CSn req and no CSn alloc (need to multiply this value by 4) |
-| `spi_ra_res_stall_csn`         | Cycles      | Arb cycles with CSn req and no CSn fits (need to multiply this value by 4) |
-| `spi_ra_tmp_stall_csn[∗]`      | Cycles      | Cycles where CSn wants to req but does not fit in temp space |
-| `spi_ra_wave_simd_full_csn`    | SIMD-cycles | Sum of SIMD where WAVE cannot take csn wave when not fits    |
-| `spi_ra_vgpr_simd_full_csn[∗]` | SIMD-cycles | Sum of SIMD where VGPR cannot take csn wave when not fits    |
-| `spi_ra_sgpr_simd_full_csn[∗]` | SIMD-cycles | Sum of SIMD where SGPR cannot take csn wave when not fits    |
-| `spi_ra_lds_cu_full_csn`       | CUs         | Sum of CU where LDS cannot take csn wave when not fits       |
-| `spi_ra_bar_cu_full_csn[∗]`    | CUs         | Sum of CU where BARRIER cannot take csn wave when not fits   |
-| `spi_ra_bulky_cu_full_csn[∗]`  | CUs         | Sum of CU where BULKY cannot take csn wave when not fits     |
-| `spi_ra_tglim_cu_full_csn[∗]`  | Cycles      | Cycles where csn wants to req but all CUs are at tg_limit    |
-| `spi_ra_wvlim_cu_full_csn[∗]`  | Cycles      | Number of clocks csn is stalled due to WAVE LIMIT            |
-| `spi_vwc_csc_wr`               | Cycles      | Number of clocks to write CSC waves to VGPRs (need to multiply this value by 4) |
-| `spi_swc_csc_wr`               | Cycles      | Number of clocks to write CSC waves to SGPRs (need to multiply this value by 4) |
-
-### Compute unit
-
-The compute unit counters are further classified into instruction mix, MFMA operation counters, level counters, wavefront counters, wavefront cycle counters, local data share counters, and others.
-
-#### Instruction mix
-
-| Hardware Counter        | Unit   | Definition                                                               |
-| :-----------------------| :-----:| -----------------------------------------------------------------------: |
-| `sq_insts`                | Instr | Number of instructions issued                                             |
-| `sq_insts_valu`           | Instr | Number of VALU instructions issued, including MFMA                        |
-| `sq_insts_valu_add_f16`   | Instr | Number of VALU F16 Add instructions issued                                |
-| `sq_insts_valu_mul_f16`   | Instr | Number of VALU F16 Multiply instructions issued                           |
-| `sq_insts_valu_fma_f16`   | Instr | Number of VALU F16 FMA instructions issued                                |
-| `sq_insts_valu_trans_f16` | Instr | Number of VALU F16 Transcendental instructions issued                     |
-| `sq_insts_valu_add_f32`   | Instr | Number of VALU F32 Add instructions issued                                |
-| `sq_insts_valu_mul_f32`   | Instr | Number of VALU F32 Multiply instructions issued                           |
-| `sq_insts_valu_fma_f32`   | Instr | Number of VALU F32 FMA instructions issued                                |
-| `sq_insts_valu_trans_f32` | Instr | Number of VALU F32 Transcendental instructions issued                     |
-| `sq_insts_valu_add_f64`   | Instr | Number of VALU F64 Add instructions issued                                |
-| `sq_insts_valu_mul_f64`   | Instr | Number of VALU F64 Multiply instructions issued                           |
-| `sq_insts_valu_fma_f64`   | Instr | Number of VALU F64 FMA instructions issued                                |
-| `sq_insts_valu_trans_f64` | Instr | Number of VALU F64 Transcendental instructions issued                     |
-| `sq_insts_valu_int32`     | Instr | Number of VALU 32-bit integer instructions issued (signed or unsigned)    |
-| `sq_insts_valu_int64`     | Instr | Number of VALU 64-bit integer instructions issued (signed or unsigned)    |
-| `sq_insts_valu_cvt`       | Instr | Number of VALU Conversion instructions issued                             |
-| `sq_insts_valu_mfma_i8`   | Instr | Number of 8-bit Integer MFMA instructions issued                          |
-| `sq_insts_valu_mfma_f16`  | Instr | Number of F16 MFMA instructions issued                                    |
-| `sq_insts_valu_mfma_bf16` | Instr | Number of BF16 MFMA instructions issued                                   |
-| `sq_insts_valu_mfma_f32`  | Instr | Number of F32 MFMA instructions issued                                    |
-| `sq_insts_valu_mfma_f64`  | Instr | Number of F64 MFMA instructions issued                                    |
-| `sq_insts_mfma`           | Instr | Number of MFMA instructions issued                                        |
-| `sq_insts_vmem_wr`        | Instr | Number of VMEM write instructions issued                                  |
-| `sq_insts_vmem_rd`        | Instr | Number of VMEM read instructions issued                                   |
-| `sq_insts_vmem`           | Instr | Number of VMEM instructions issued, including both FLAT and buffer instructions |
-| `sq_insts_salu`           | Instr | Number of SALU instructions issued                                        |
-| `sq_insts_smem`           | Instr | Number of SMEM instructions issued                                        |
-| `sq_insts_smem_norm`      | Instr | Number of SMEM instructions issued to normalize to match `smem_level`. Used in measuring SMEM latency |
-| `sq_insts_flat`           | Instr | Number of FLAT instructions issued                                        |
-| `sq_insts_flat_lds_only`  | Instr | Number of FLAT instructions issued that read/write only from/to LDS       |
-| `sq_insts_lds`            | Instr | Number of LDS instructions issued                                         |
-| `sq_insts_gds`            | Instr | Number of GDS instructions issued                                         |
-| `sq_insts_exp_gds`        | Instr | Number of EXP and GDS instructions excluding skipped export instructions issued |
-| `sq_insts_branch`         | Instr | Number of Branch instructions issued                                      |
-| `sq_insts_sendmsg`        | Instr | Number of SENDMSG instructions including s_endpgm issued                  |
-| `sq_insts_vskipped[∗]`    | Instr | Number of VSkipped instructions issued                                    |
-
-#### MFMA operation counters
-
-| Hardware Counter             | Unit  | Definition                                      |
-| :----------------------------| :-----| ----------------------------------------------: |
-| `sq_insts_valu_mfma_mops_I8`   | IOP   | Number of 8-bit integer MFMA ops in unit of 512 |
-| `sq_insts_valu_mfma_mops_F16`  | FLOP  | Number of F16 floating MFMA ops in unit of 512  |
-| `sq_insts_valu_mfma_mops_BF16` | FLOP  | Number of BF16 floating MFMA ops in unit of 512 |
-| `sq_insts_valu_mfma_mops_F32`  | FLOP  | Number of F32 floating MFMA ops in unit of 512  |
-| `sq_insts_valu_mfma_mops_F64`  | FLOP  | Number of F64 floating MFMA ops in unit of 512  |
-
-#### Level counters
-
-| Hardware Counter    | Unit  | Definition                             |
-| :-------------------| :-----| -------------------------------------: |
-| `sq_accum_prev`       | Count | Accumulated counter sample value where accumulation takes place once every four cycles |
-| `sq_accum_prev_hires` | Count | Accumulated counter sample value where accumulation takes place once every cycle |
-| `sq_level_waves`      | Waves | Number of inflight waves               |
-| `sq_insts_level_vmem` | Instr | Number of inflight VMEM instructions   |
-| `sq_insts_level_smem` | Instr | Number of inflight SMEM instructions   |
-| `sq_insts_level_lds`  | Instr | Number of inflight LDS instructions    |
-| `sq_ifetch_level`     | Instr | Number of inflight instruction fetches |
-
-#### Wavefront counters
-
-| Hardware Counter     | Unit  | Definition                                                        |
-| :--------------------| :-----| ----------------------------------------------------------------: |
-| `sq_waves`             | Waves | Number of wavefronts dispatched to SQs, including both new and restored wavefronts |
-| `sq_waves_saved[∗]`    | Waves | Number of context-saved wavefronts                                |
-| `sq_waves_restored[∗]` | Waves | Number of context-restored wavefronts                             |
-| `sq_waves_eq_64`       | Waves | Number of wavefronts with exactly 64 active threads sent to SQs   |
-| `sq_waves_lt_64`       | Waves | Number of wavefronts with less than 64 active threads sent to SQs |
-| `sq_waves_lt_48`       | Waves | Number of wavefronts with less than 48 active threads sent to SQs |
-| `sq_waves_lt_32`       | Waves | Number of wavefronts with less than 32 active threads sent to SQs |
-| `sq_waves_lt_16`       | Waves | Number of wavefronts with less than 16 active threads sent to SQs |
-
-#### Wavefront cycle counters
-
-| Hardware Counter         | Unit    | Definition                                                            |
-| :------------------------| :-------| --------------------------------------------------------------------: |
-| `sq_cycles`                | Cycles  | Free-running SQ clocks                                                |
-| `sq_busy_cycles`           | Cycles  | Number of cycles while SQ reports it to be busy                       |
-| `sq_busy_cu_cycles`        | Qcycles | Number of quad cycles each CU is busy                                 |
-| `sq_valu_mfma_busy_cycles` | Cycles  | Number of cycles the MFMA ALU is busy                                 |
-| `sq_wave_cycles`           | Qcycles | Number of quad cycles spent by waves in the CUs                       |
-| `sq_wait_any`              | Qcycles | Number of quad cycles spent waiting for anything                      |
-| `sq_wait_inst_any`         | Qcycles | Number of quad cycles spent waiting for an issued instruction         |
-| `sq_active_inst_any`       | Qcycles | Number of quad cycles spent by each wave to work on an instruction    |
-| `sq_active_inst_vmem`      | Qcycles | Number of quad cycles spent by each wave to work on a non-FLAT VMEM instruction |
-| `sq_active_inst_lds`       | Qcycles | Number of quad cycles spent by each wave to work on an LDS instruction |
-| `sq_active_inst_valu`      | Qcycles | Number of quad cycles spent by each wave to work on a VALU instruction |
-| `sq_active_inst_sca`       | Qcycles | Number of quad cycles spent by each wave to work on an SCA instruction |
-| `sq_active_inst_exp_gds`   | Qcycles | Number of quad cycles spent by each wave to work on EXP or GDS instruction |
-| `sq_active_inst_misc`      | Qcycles | Number of quad cycles spent by each wave to work on an MISC instruction, including branch and sendmsg |
-| `sq_active_inst_flat`      | Qcycles | Number of quad cycles spent by each wave to work on a FLAT instruction |
-| `sq_inst_cycles_vmem_wr`   | Qcycles | Number of quad cycles spent to send addr and cmd data for VMEM write instructions, including both FLAT and buffer |
-| `sq_inst_cycles_vmem_rd`   | Qcycles | Number of quad cycles spent to send addr and cmd data for VMEM read instructions, including both FLAT and buffer |
-| `sq_inst_cycles_smem`      | Qcycles | Number of quad cycles spent to execute scalar memory reads            |
-| `sq_inst_cycles_salu`      | Cycles  | Number of cycles spent to execute non-memory read scalar operations   |
-| `sq_thread_cycles_valu`    | Cycles  | Number of thread cycles spent to execute VALU operations              |
-
-#### Local data share
-
-| Hardware Counter           | Unit   | Definition                                                |
-| :--------------------------| :------| --------------------------------------------------------: |
-| `sq_lds_atomic_return`       | Cycles | Number of atomic return cycles in LDS                     |
-| `sq_lds_bank_conflict`       | Cycles | Number of cycles LDS is stalled by bank conflicts         |
-| `sq_lds_addr_conflict[∗]`    | Cycles | Number of cycles LDS is stalled by address conflicts      |
-| `sq_lds_unaligned_stalls[∗]` | Cycles | Number of cycles LDS is stalled processing flat unaligned load/store ops |
-| `sq_lds_mem_violations[∗]`   | Count  | Number of threads that have a memory violation in the LDS |
-
-#### Miscellaneous
-
-##### Miscellaneous counters
-
-| Hardware Counter | Unit    | Definition                                                |
-| :----------------| :-------| --------------------------------------------------------: |
-| `sq_ifetch`        | Count   | Number of fetch requests from L1I cache, in 32-byte width |
-| `sq_items`         | Threads | Number of valid threads                                   |
-
-### L1I and sL1D caches
-
-#### L1I and sL1D caches
-
-| Hardware Counter             | Unit   | Definition                                                        |
-| :----------------------------| :------| ----------------------------------------------------------------: |
-| `sqc_icache_req`               | Req    | Number of L1I cache requests                                      |
-| `sqc_icache_hits`              | Count  | Number of L1I cache lookup-hits                                   |
-| `sqc_icache_misses`            | Count  | Number of L1I cache non-duplicate lookup-misses                   |
-| `sqc_icache_misses_duplicate`  | Count  | Number of L1I cache duplicate lookup misses whose previous lookup miss on the same cache line is not fulfilled yet |
-| `sqc_dcache_req`               | Req    | Number of sL1D cache requests                                       |
-| `sqc_dcache_input_valid_readb` | Cycles | Number of cycles while SQ input is valid but sL1D cache is not ready |
-| `sqc_dcache_hits`              | Count  | Number of sL1D cache lookup-hits                                  |
-| `sqc_dcache_misses`            | Count  | Number of sL1D non-duplicate lookup-misses                        |
-| `sqc_dcache_misses_duplicate`  | Count  | Number of sL1D duplicate lookup-misses                            |
-| `sqc_dcache_req_read_1`        | Req    | Number of read requests in a single 32-bit data word, DWORD (DW)  |
-| `sqc_dcache_req_read_2`        | Req    | Number of read requests in 2 DW                                   |
-| `sqc_dcache_req_read_4`        | Req    | Number of read requests in 4 DW                                   |
-| `sqc_dcache_req_read_8`        | Req    | Number of read requests in 8 DW                                   |
-| `sqc_dcache_req_read_16`       | Req    | Number of read requests in 16 DW                                  |
-| `sqc_dcache_atomic[∗]`         | Req    | Number of atomic requests                                         |
-| `sqc_tc_req`                   | Req    | Number of L2 cache requests that were issued by instruction and constant caches |
-| `sqc_tc_inst_req`              | Req    | Number of instruction cache line requests to L2 cache             |
-| `sqc_tc_data_read_req`         | Req    | Number of data read requests to the L2 cache                      |
-| `sqc_tc_data_write_req[∗]`     | Req    | Number of data write requests to the L2 cache                     |
-| `sqc_tc_data_atomic_req[∗]`    | Req    | Number of data atomic requests to the L2 cache                    |
-| `sqc_tc_stall[∗]`              | Cycles | Number of cycles while the valid requests to L2 cache are stalled |
-
-### Vector L1 cache subsystem
-
-The vector L1 cache subsystem counters are further classified into texture addressing unit, texture data unit, vector L1D cache, and texture cache arbiter.
-
-#### Texture addressing unit
-
-##### Texture addressing unit counters
-
-| Hardware Counter                 | Unit   | Definition                                        |
-| :--------------------------------| :------| ------------------------------------------------: |
-| `ta_ta_busy`                       | Cycles | Texture addressing unit busy cycles                                    |
-| `ta_total_wavefronts`              | Instr  | Number of wavefront instructions                  |
-| `ta_buffer_wavefronts`             | Instr  | Number of buffer wavefront instructions           |
-| `ta_buffer_read_wavefronts`        | Instr  | Number of buffer read wavefront instructions      |
-| `ta_buffer_write_wavefronts`       | Instr  | Number of buffer write wavefront instructions     |
-| `ta_buffer_atomic_wavefronts[∗]`   | Instr  | Number of buffer atomic wavefront instructions    |
-| `ta_buffer_total_cycles`           | Cycles | Number of buffer cycles, including read and write |
-| `ta_buffer_coalesced_read_cycles`  | Cycles | Number of coalesced buffer read cycles            |
-| `ta_buffer_coalesced_write_cycles` | Cycles | Number of coalesced buffer write cycles           |
-| `ta_addr_stalled_by_tc`            | Cycles | Number of cycles texture addressing unit address is stalled by TCP     |
-| `ta_data_stalled_by_tc`            | Cycles | Number of cycles texture addressing unit data is stalled by TCP        |
-| `ta_addr_stalled_by_td_cycles[∗]`  | Cycles | Number of cycles texture addressing unit address is stalled by TD      |
-| `ta_flat_wavefronts`               | Instr  | Number of flat wavefront instructions             |
-| `ta_flat_read_wavefronts`          | Instr  | Number of flat read wavefront instructions        |
-| `ta_flat_write_wavefronts`         | Instr  | Number of flat write wavefront instructions       |
-| `ta_flat_atomic_wavefronts`        | Instr  | Number of flat atomic wavefront instructions      |
-
-#### Texture data unit
-
-##### Texture data unit counters
-
-| Hardware Counter         | Unit  | Definition                                           |
-| :------------------------| :-----| ---------------------------------------------------: |
-| `td_td_busy`               | Cycle | TD busy cycles                                       |
-| `td_tc_stall`              | Cycle | Number of cycles TD is stalled by TCP                |
-| `td_spi_stall[∗]`          | Cycle | Number of cycles TD is stalled by SPI                |
-| `td_load_wavefront`        | Instr | Number of wavefront instructions (read/write/atomic) |
-| `td_store_wavefront`       | Instr | Number of write wavefront instructions               |
-| `td_atomic_wavefront`      | Instr | Number of atomic wavefront instructions              |
-| `td_coalescable_wavefront` | Instr | Number of coalescable instructions                   |
-
-#### Vector L1D cache
-
-| Hardware Counter                    | Unit   | Definition                                                  |
-| :-----------------------------------| :------| ----------------------------------------------------------: |
-| `tcp_gate_en1`                        | Cycles | Number of cycles vL1D interface clocks are turned on      |
-| `tcp_gate_en2`                        | Cycles | Number of cycles vL1D core clocks are turned on           |
-| `tcp_td_tcp_stall_cycles`             | Cycles | Number of cycles TD stalls vL1D                           |
-| `tcp_tcr_tcp_stall_cycles`            | Cycles | Number of cycles TCR stalls vL1D                           |
-| `tcp_read_tagconflict_stall_cycles`   | Cycles | Number of cycles tagram conflict stalls on a read          |
-| `tcp_write_tagconflict_stall_cycles`  | Cycles | Number of cycles tagram conflict stalls on a write         |
-| `tcp_atomic_tagconflict_stall_cycles` | Cycles | Number of cycles tagram conflict stalls on an atomic       |
-| `tcp_pending_stall_cycles`            | Cycles | Number of cycles vL1D cache is stalled due to data pending from L2 cache |
-| `tcp_ta_tcp_state_read`               | Req    | Number of wavefront instruction requests to vL1D           |
-| `tcp_volatile[∗]`                     | Req    | Number of L1 volatile pixels/buffers from texture addressing unit               |
-| `tcp_total_accesses`                  | Req    | Number of vL1D accesses                                    |
-| `tcp_total_read`                      | Req    | Number of vL1D read accesses                               |
-| `tcp_total_write`                     | Req    | Number of vL1D write accesses                              |
-| `tcp_total_atomic_with_ret`           | Req    | Number of vL1D atomic with return                          |
-| `tcp_total_atomic_without_ret`        | Req    | Number of vL1D atomic without return                       |
-| `tcp_total_writeback_invalidates`     | Count  | Number of vL1D writebacks and Invalidates                  |
-| `tcp_utcl1_request`                   | Req    | Number of address translation requests to UTCL1            |
-| `tcp_utcl1_translation_hit`           | Req    | Number of UTCL1 translation hits                            |
-| `tcp_utcl1_translation_miss`          | Req    | Number of UTCL1 translation misses                          |
-| `tcp_utcl1_permission_miss`           | Req    | Number of UTCL1 permission misses                           |
-| `tcp_total_cache_accesses`            | Req    | Number of vL1D cache accesses                               |
-| `tcp_tcp_latency`                     | Cycles | Accumulated wave access latency to vL1D over all wavefronts |
-| `tcp_tcc_read_req_latency`            | Cycles | Accumulated vL1D-L2 request latency over all wavefronts for reads and atomics with return |
-| `tcp_tcc_write_req_latency`           | Cycles | Accumulated vL1D-L2 request latency over all wavefronts for writes and atomics without return |
-| `tcp_tcc_read_req`                    | Req    | Number of read requests to L2 cache                        |
-| `tcp_tcc_write_req`                   | Req    | Number of write requests to L2 cache                       |
-| `tcp_tcc_atomic_with_ret_req`         | Req    | Number of atomic requests to L2 cache with return          |
-| `tcp_tcc_atomic_without_ret_req`      | Req    | Number of atomic requests to L2 cache without return       |
-| `tcp_tcc_nc_read_req`                 | Req    | Number of NC read requests to L2 cache                     |
-| `tcp_tcc_uc_read_req`                 | Req    | Number of UC read requests to L2 cache                     |
-| `tcp_tcc_cc_read_req`                 | Req    | Number of CC read requests to L2 cache                     |
-| `tcp_tcc_rw_read_req`                 | Req    | Number of RW read requests to L2 cache                     |
-| `tcp_tcc_nc_write_req`                | Req    | Number of NC write requests to L2 cache                    |
-| `tcp_tcc_uc_write_req`                | Req    | Number of UC write requests to L2 cache                    |
-| `tcp_tcc_cc_write_req`                | Req    | Number of CC write requests to L2 cache                    |
-| `tcp_tcc_rw_write_req`                | Req    | Number of RW write requests to L2 cache                    |
-| `tcp_tcc_nc_atomic_req`               | Req    | Number of NC atomic requests to L2 cache                   |
-| `tcp_tcc_uc_atomic_req`               | Req    | Number of UC atomic requests to L2 cache                   |
-| `tcp_tcc_cc_atomic_req`               | Req    | Number of CC atomic requests to L2 cache                   |
-| `tcp_tcc_rw_atomic_req`               | Req    | Number of RW atomic requests to L2 cache                   |
-
-#### TCA
-
-| Hardware Counter | Unit   | Definition                                  |
-| :----------------| :------| ------------------------------------------: |
-| `tca_cycle`        | Cycles | TCA cycles                                  |
-| `tca_busy`         | Cycles | Number of cycles TCA has a pending request  |
-
-### L2 cache access
-
-#### L2 cache access counters
-
-| Hardware Counter                 | Unit   | Definition                                                     |
-| :--------------------------------| :------| -------------------------------------------------------------: |
-| `tcc_cycle`                        |Cycle   | L2 cache free-running clocks                                  |
-| `tcc_busy`                         |Cycle   | L2 cache busy cycles                                          |
-| `tcc_req`                          |Req     | Number of L2 cache requests                                   |
-| `tcc_streaming_req[∗]`             |Req     | Number of L2 cache streaming requests                         |
-| `tcc_NC_req`                       |Req     | Number of NC requests                                         |
-| `tcc_UC_req`                       |Req     | Number of UC requests                                         |
-| `tcc_CC_req`                       |Req     | Number of CC requests                                         |
-| `tcc_RW_req`                       |Req     | Number of RW requests                                         |
-| `tcc_probe`                        |Req     | Number of L2 cache probe requests                             |
-| `tcc_probe_all[∗]`                 |Req     | Number of external probe requests with EA_TCC_preq_all == 1   |
-| `tcc_read_req`                     |Req     | Number of L2 cache read requests                              |
-| `tcc_write_req`                    |Req     | Number of L2 cache write requests                             |
-| `tcc_atomic_req`                   |Req     | Number of L2 cache atomic requests                            |
-| `tcc_hit`                          |Req     | Number of L2 cache lookup-hits                                |
-| `tcc_miss`                         |Req     | Number of L2 cache lookup-misses                              |
-| `tcc_writeback`                    |Req     | Number of lines written back to main memory, including writebacks of dirty lines and uncached write/atomic requests |
-| `tcc_ea_wrreq`                     |Req     | Total number of 32-byte and 64-byte write requests to EA      |
-| `tcc_ea_wrreq_64B`                 |Req     | Total number of 64-byte write requests to EA                  |
-| `tcc_ea_wr_uncached_32B`           |Req     | Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request is counted as 2. |
-| `tcc_ea_wrreq_stall`               | Cycles | Number of cycles a write request was stalled                  |
-| `tcc_ea_wrreq_io_credit_stall[∗]`  | Cycles | Number of cycles an EA write request runs out of IO credits   |
-| `tcc_ea_wrreq_gmi_credit_stall[∗]` | Cycles | Number of cycles an EA write request runs out of GMI credits  |
-| `tcc_ea_wrreq_dram_credit_stall`   | Cycles | Number of cycles an EA write request runs out of DRAM credits |
-| `tcc_too_many_ea_wrreqs_stall[∗]`  | Cycles | Number of cycles the L2 cache reaches maximum number of pending EA write requests |
-| `tcc_ea_wrreq_level`               | Req    | Accumulated number of L2 cache-EA write requests in flight    |
-| `tcc_ea_atomic`                    | Req    | Number of 32-byte and 64-byte atomic requests to EA           |
-| `tcc_ea_atomic_level`              | Req    | Accumulated number of L2 cache-EA atomic requests in flight   |
-| `tcc_ea_rdreq`                     | Req    | Total number of 32-byte and 64-byte read requests to EA       |
-| `tcc_ea_rdreq_32B`                 | Req    | Total number of 32-byte read requests to EA                   |
-| `tcc_ea_rd_uncached_32B`           | Req    | Number of 32-byte L2 cache-EA read due to uncached traffic. A 64-byte request is counted as 2. |
-| `tcc_ea_rdreq_io_credit_stall[∗]`  | Cycles | Number of cycles read request interface runs out of IO credits  |
-| `tcc_ea_rdreq_gmi_credit_stall[∗]` | Cycles | Number of cycles read request interface runs out of GMI credits |
-| `tcc_ea_rdreq_dram_credit_stall`   | Cycles | Number of cycles read request interface runs out of DRAM credits |
-| `tcc_ea_rdreq_level`               | Req    | Accumulated number of L2 cache-EA read requests in flight     |
-| `tcc_ea_rdreq_dram`                | Req    | Number of 32-byte and 64-byte read requests to HBM            |
-| `tcc_ea_wrreq_dram`                | Req    | Number of 32-byte and 64-byte write requests to HBM           |
-| `tcc_tag_stall`                    | Cycles | Number of cycles the normal request pipeline in the tag was stalled for any reason |
-| `tcc_normal_writeback`             | Req    | Number of L2 cache normal writeback                           |
-| `tcc_all_tc_op_wb_writeback[∗]`    | Req    | Number of instruction-triggered writeback requests            |
-| `tcc_normal_evict`                 | Req    | Number of L2 cache normal evictions                           |
-| `tcc_all_tc_op_inv_evict[∗]`       | Req    | Number of instruction-triggered eviction requests             |
-
-## MI200 derived metrics list
-
-### Derived metrics on MI200 GPUs
-
-| Derived Metric   | Description                                                                            |
-| :----------------| -------------------------------------------------------------------------------------: |
-| `VFetchInsts`      | The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory               |
-| `VWriteInsts`      | The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory                 |
-| `FlatVMemInsts`    | The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch |
-| `LDSInsts`         | The average number of LDS read/write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS |
-| `FlatLDSInsts`     | The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control) |
-| `VALUUtilization`  | The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence) |
-| `VALUBusy`         | The percentage of GPU time vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal) |
-| `SALUBusy`         | The percentage of GPU time scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal) |
-| `MemWrites32B`     | The total number of effective 32B write transactions to the memory                      |
-| `L2CacheHit`       | The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal) |
-| `MemUnitStalled`   | The percentage of GPU time the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad) |
-| `WriteUnitStalled` | The percentage of GPU time the write unit is stalled. Value range: 0% to 100% (bad)      |
-| `LDSBankConflict`  | The percentage of GPU time LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad) |
-
-## MI200 acronyms
-
-| Abbreviation | Meaning                                                                           |
-| :------------| --------------------------------------------------------------------------------: |
-| `ALU`          | Arithmetic logic unit |
-| `Arb`          | Arbiter |
-| `BF16`        | Brain floating point – 16 |
-| `CC`           | Coherently cached |
-| `CP`           | Command processor |
-| `CPC`         | Command processor – compute |
-| `CPF`         | Command processor – fetcher |
-| `CS`           | Compute shader |
-| `CSC`         | Compute shader controller |
-| `CSn`          | Compute Shader, the n-th pipe |
-| `CU`           | Compute unit |
-| `DW`           | 32-bit data word, DWORD |
-| `EA`           | Efficiency arbiter |
-| `F16`          | Half-precision floating point |
-| `FLAT`       | FLAT instructions allow read/write/atomic access to a generic memory address pointer, which can resolve to any of the following physical memories:<br>•   Global Memory<br>•   Scratch (“private”)<br>•   LDS (“shared”)<br>•   Invalid – MEM_VIOL TrapStatus |
-| `FMA`          | Fused multiply-add |
-| `GDS`          | Global data share |
-| `GRBM`         | Graphics register bus manager |
-| `HBM`          | High bandwidth memory |
-| `Instr`        | Instructions |
-| `IOP`          | Integer operation |
-| `L2`           | Level-2 cache |
-| `LDS`          | Local data share |
-| `ME1`          | Micro-engine, running packet processing firmware on CPC |
-| `MFMA`         | Matrix fused multiply-add |
-| `NC`           | Noncoherently cached |
-| `RW`           | Coherently cached with write |
-| `SALU`         | Scalar ALU |
-| `SGPR`         | Scalar GPR |
-| `SIMD`         | Single instruction multiple data |
-| `sL1D`         | Scalar Level-1 data cache |
-| `SMEM`         | Scalar memory |
-| `SPI`          | Shader processor input |
-| `SQ`           | Sequencer |
-| `TA`           | Texture addressing unit |
-| `TC`           | Texture cache |
-| `TCA`          | Texture cache arbiter |
-| `TCC`          | Texture cache per channel, known as L2 cache |
-| `TCIU`         | Texture cache interface unit, command processor’s interface to memory system |
-| `TCP`          | Texture cache per pipe, known as vector L1 cache |
-| `TCR`          | Texture cache router |
-| `TD`           | Texture data unit |
-| `UC`           | Uncached |
-| `UTCL1`        | Unified translation cache – level 1 |
-| `UTCL2`        | Unified translation cache – level 2 |
-| `VALU`         | Vector ALU |
-| `VGPR`         | Vector GPR |
-| `vL1D`         | Vector level 1 data cache |
-| `VMEM`         | Vector memory |
--- a/docs/conceptual/gpu-arch/mi250.md
+++ b/docs/conceptual/gpu-arch/mi250.md
@@ -1,3 +1,9 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="AMD Instinct MI250 microarchitecture">
+  <meta name="keywords" content="Instinct, MI250, microarchitecture, AMD, ROCm">
+</head>
+
 # AMD Instinct™ MI250 microarchitecture

 The microarchitecture of the AMD Instinct MI250 accelerators is based on the
@@ -27,8 +33,8 @@ Units (CU). The MI250 GCD has 104 active CUs. Each compute unit is further
 subdivided into four SIMD units that process SIMD instructions of 16 data
 elements per instruction (for the FP64 data type). This enables the CU to
 process 64 work items (a so-called “wavefront”) at a peak clock frequency of 1.7
-GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 45.3
-TFLOPS for vector instructions. The MI250 compute units also provide specialized
+GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 22.6
+TFLOPS for vector instructions. This equates to 45.3 TFLOPS for vector instructions for both GCDs together. The MI250 compute units also provide specialized
 execution units (also called matrix cores), which are geared toward executing
 matrix operations like matrix-matrix multiplications. For FP64, the peak
 performance of these units amounts to 90.5 TFLOPS.
--- a/docs/conceptual/gpu-arch/mi300-mi200-performance-counters.rst
+++ b/docs/conceptual/gpu-arch/mi300-mi200-performance-counters.rst
@@ -0,0 +1,758 @@
+.. meta::
+  :description: MI300 and MI200 series performance counters and metrics
+  :keywords: MI300, MI200, performance counters, command processor counters
+
+***************************************************************************************************
+MI300 and MI200 series performance counters and metrics
+***************************************************************************************************
+
+This document lists and describes the hardware performance counters and derived metrics available
+for the AMD Instinct™ MI300 and MI200 GPUs. You can also access this information using the
+:doc:`ROCProfiler tool <rocprofiler:rocprofv1>`.
+
+MI300 and MI200 series performance counters
+===============================================================
+
+Series performance counters include the following categories:
+
+* :ref:`command-processor-counters`
+* :ref:`graphics-register-bus-manager-counters`
+* :ref:`spi-counters`
+* :ref:`compute-unit-counters`
+* :ref:`l1i-and-sl1d-cache-counters`
+* :ref:`vector-l1-cache-subsystem-counters`
+* :ref:`l2-cache-access-counters`
+
+The following sections provide additional details for each category.
+
+.. note::
+
+  Preliminary validation of all MI300 and MI200 series performance counters is in progress. Those with
+  an asterisk (*) require further evaluation.
+
+.. _command-processor-counters:
+
+Command processor counters
+---------------------------------------------------------------------------------------------------------------
+
+Command processor counters are further classified into command processor-fetcher and command
+processor-compute.
+
+Command processor-fetcher counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``CPF_CMP_UTCL1_STALL_ON_TRANSLATION``", "Cycles", "Number of cycles one of the compute unified translation caches (L1) is stalled waiting on translation"
+  "``CPF_CPF_STAT_BUSY``", "Cycles", "Number of cycles command processor-fetcher is busy"
+  "``CPF_CPF_STAT_IDLE``", "Cycles", "Number of cycles command processor-fetcher is idle"
+  "``CPF_CPF_STAT_STALL``", "Cycles", "Number of cycles command processor-fetcher is stalled"
+  "``CPF_CPF_TCIU_BUSY``", "Cycles", "Number of cycles command processor-fetcher texture cache interface unit interface is busy"
+  "``CPF_CPF_TCIU_IDLE``", "Cycles", "Number of cycles command processor-fetcher texture cache interface unit interface is idle"
+  "``CPF_CPF_TCIU_STALL``", "Cycles", "Number of cycles command processor-fetcher texture cache interface unit interface is stalled waiting on free tags"
+
+The texture cache interface unit is the interface between the command processor and the memory
+system.
+
+Command processor-compute counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``CPC_ME1_BUSY_FOR_PACKET_DECODE``", "Cycles", "Number of cycles command processor-compute micro engine is busy decoding packets"
+  "``CPC_UTCL1_STALL_ON_TRANSLATION``", "Cycles", "Number of cycles one of the unified translation caches (L1) is stalled waiting on translation"
+  "``CPC_CPC_STAT_BUSY``", "Cycles", "Number of cycles command processor-compute is busy"
+  "``CPC_CPC_STAT_IDLE``", "Cycles", "Number of cycles command processor-compute is idle"
+  "``CPC_CPC_STAT_STALL``", "Cycles", "Number of cycles command processor-compute is stalled"
+  "``CPC_CPC_TCIU_BUSY``", "Cycles", "Number of cycles command processor-compute texture cache interface unit interface is busy"
+  "``CPC_CPC_TCIU_IDLE``", "Cycles", "Number of cycles command processor-compute texture cache interface unit interface is idle"
+  "``CPC_CPC_UTCL2IU_BUSY``", "Cycles", "Number of cycles command processor-compute unified translation cache (L2) interface is busy"
+  "``CPC_CPC_UTCL2IU_IDLE``", "Cycles", "Number of cycles command processor-compute unified translation cache (L2) interface is idle"
+  "``CPC_CPC_UTCL2IU_STALL``", "Cycles", "Number of cycles command processor-compute unified translation cache (L2) interface is stalled"
+  "``CPC_ME1_DC0_SPI_BUSY``", "Cycles", "Number of cycles command processor-compute micro engine processor is busy"
+
+The micro engine runs packet-processing firmware on the command processor-compute.
+
+.. _graphics-register-bus-manager-counters:
+
+Graphics register bus manager counters
+---------------------------------------------------------------------------------------------------------------
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``GRBM_COUNT``", "Cycles","Number of free-running GPU cycles"
+  "``GRBM_GUI_ACTIVE``", "Cycles", "Number of GPU active cycles"
+  "``GRBM_CP_BUSY``", "Cycles", "Number of cycles any of the command processor blocks are busy"
+  "``GRBM_SPI_BUSY``", "Cycles", "Number of cycles any of the shader processor inputs are busy in the shader engines"
+  "``GRBM_TA_BUSY``", "Cycles", "Number of cycles any of the texture addressing units are busy in the shader engines"
+  "``GRBM_TC_BUSY``", "Cycles", "Number of cycles any of the texture cache blocks are busy"
+  "``GRBM_CPC_BUSY``", "Cycles", "Number of cycles the command processor-compute is busy"
+  "``GRBM_CPF_BUSY``", "Cycles", "Number of cycles the command processor-fetcher is busy"
+  "``GRBM_UTCL2_BUSY``", "Cycles", "Number of cycles the unified translation cache (Level 2 [L2]) block is busy"
+  "``GRBM_EA_BUSY``", "Cycles", "Number of cycles the efficiency arbiter block is busy"
+
+Texture cache blocks include:
+
+* Texture cache arbiter
+* Texture cache per pipe, also known as vector Level 1 (L1) cache
+* Texture cache per channel, also known as L2 cache
+* Texture cache interface
+
+.. _spi-counters:
+
+Shader processor input counters
+---------------------------------------------------------------------------------------------------------------
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SPI_CSN_BUSY``", "Cycles", "Number of cycles with outstanding waves"
+  "``SPI_CSN_WINDOW_VALID``", "Cycles", "Number of cycles enabled by ``perfcounter_start`` event"
+  "``SPI_CSN_NUM_THREADGROUPS``", "Workgroups", "Number of dispatched workgroups"
+  "``SPI_CSN_WAVE``", "Wavefronts", "Number of dispatched wavefronts"
+  "``SPI_RA_REQ_NO_ALLOC``", "Cycles", "Number of arbiter cycles with requests but no allocation"
+  "``SPI_RA_REQ_NO_ALLOC_CSN``", "Cycles", "Number of arbiter cycles with compute shader (n\ :sup:`th` pipe) requests but no compute shader (n\ :sup:`th` pipe) allocation"
+  "``SPI_RA_RES_STALL_CSN``", "Cycles", "Number of arbiter stall cycles due to shortage of compute shader (n\ :sup:`th` pipe) pipeline slots"
+  "``SPI_RA_TMP_STALL_CSN``", "Cycles", "Number of stall cycles due to shortage of temp space"
+  "``SPI_RA_WAVE_SIMD_FULL_CSN``", "SIMD-cycles", "Accumulated number of single instruction, multiple data (SIMD) per cycle affected by shortage of wave slots for compute shader (n\ :sup:`th` pipe) wave dispatch"
+  "``SPI_RA_VGPR_SIMD_FULL_CSN``", "SIMD-cycles", "Accumulated number of SIMDs per cycle affected by shortage of vector general-purpose register (VGPR) slots for compute shader (n\ :sup:`th` pipe) wave dispatch"
+  "``SPI_RA_SGPR_SIMD_FULL_CSN``", "SIMD-cycles", "Accumulated number of SIMDs per cycle affected by shortage of scalar general-purpose register (SGPR) slots for compute shader (n\ :sup:`th` pipe) wave dispatch"
+  "``SPI_RA_LDS_CU_FULL_CSN``", "CU", "Number of compute units affected by shortage of local data share (LDS) space for compute shader (n\ :sup:`th` pipe) wave dispatch"
+  "``SPI_RA_BAR_CU_FULL_CSN``", "CU", "Number of compute units with compute shader (n\ :sup:`th` pipe) waves waiting at a BARRIER"
+  "``SPI_RA_BULKY_CU_FULL_CSN``", "CU", "Number of compute units with compute shader (n\ :sup:`th` pipe) waves waiting for BULKY resource"
+  "``SPI_RA_TGLIM_CU_FULL_CSN``", "Cycles", "Number of compute shader (n\ :sup:`th` pipe) wave stall cycles due to restriction of ``tg_limit`` for thread group size"
+  "``SPI_RA_WVLIM_STALL_CSN``", "Cycles", "Number of cycles compute shader (n\ :sup:`th` pipe) is stalled due to ``WAVE_LIMIT``"
+  "``SPI_VWC_CSC_WR``", "Qcycles", "Number of quad-cycles taken to initialize VGPRs when launching waves"
+  "``SPI_SWC_CSC_WR``", "Qcycles", "Number of quad-cycles taken to initialize SGPRs when launching waves"
+
+.. _compute-unit-counters:
+
+Compute unit counters
+---------------------------------------------------------------------------------------------------------------
+
+The compute unit counters are further classified into instruction mix, matrix fused multiply-add (FMA)
+operation counters, level counters, wavefront counters, wavefront cycle counters, and LDS counters.
+
+Instruction mix
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SQ_INSTS``", "Instr", "Number of instructions issued"
+  "``SQ_INSTS_VALU``", "Instr", "Number of vector arithmetic logic unit (VALU) instructions including matrix FMA issued"
+  "``SQ_INSTS_VALU_ADD_F16``", "Instr", "Number of VALU half-precision floating-point (F16) ``ADD`` or ``SUB`` instructions issued"
+  "``SQ_INSTS_VALU_MUL_F16``", "Instr", "Number of VALU F16 Multiply instructions issued"
+  "``SQ_INSTS_VALU_FMA_F16``", "Instr", "Number of VALU F16 FMA or multiply-add instructions issued"
+  "``SQ_INSTS_VALU_TRANS_F16``", "Instr", "Number of VALU F16 Transcendental instructions issued"
+  "``SQ_INSTS_VALU_ADD_F32``", "Instr", "Number of VALU full-precision floating-point (F32) ``ADD`` or ``SUB`` instructions issued"
+  "``SQ_INSTS_VALU_MUL_F32``", "Instr", "Number of VALU F32 Multiply instructions issued"
+  "``SQ_INSTS_VALU_FMA_F32``", "Instr", "Number of VALU F32 FMA or multiply-add instructions issued"
+  "``SQ_INSTS_VALU_TRANS_F32``", "Instr", "Number of VALU F32 Transcendental instructions issued"
+  "``SQ_INSTS_VALU_ADD_F64``", "Instr", "Number of VALU F64 ``ADD`` or ``SUB`` instructions issued"
+  "``SQ_INSTS_VALU_MUL_F64``", "Instr", "Number of VALU F64 Multiply instructions issued"
+  "``SQ_INSTS_VALU_FMA_F64``", "Instr", "Number of VALU F64 FMA or multiply-add instructions issued"
+  "``SQ_INSTS_VALU_TRANS_F64``", "Instr", "Number of VALU F64 Transcendental instructions issued"
+  "``SQ_INSTS_VALU_INT32``", "Instr", "Number of VALU 32-bit integer instructions (signed or unsigned) issued"
+  "``SQ_INSTS_VALU_INT64``", "Instr", "Number of VALU 64-bit integer instructions (signed or unsigned) issued"
+  "``SQ_INSTS_VALU_CVT``", "Instr", "Number of VALU Conversion instructions issued"
+  "``SQ_INSTS_VALU_MFMA_I8``", "Instr", "Number of 8-bit Integer matrix FMA instructions issued"
+  "``SQ_INSTS_VALU_MFMA_F16``", "Instr", "Number of F16 matrix FMA instructions issued"
+  "``SQ_INSTS_VALU_MFMA_F32``", "Instr", "Number of F32 matrix FMA instructions issued"
+  "``SQ_INSTS_VALU_MFMA_F64``", "Instr", "Number of F64 matrix FMA instructions issued"
+  "``SQ_INSTS_MFMA``", "Instr", "Number of matrix FMA instructions issued"
+  "``SQ_INSTS_VMEM_WR``", "Instr", "Number of vector memory write instructions (including flat) issued"
+  "``SQ_INSTS_VMEM_RD``", "Instr", "Number of vector memory read instructions (including flat) issued"
+  "``SQ_INSTS_VMEM``", "Instr", "Number of vector memory instructions issued, including both flat and buffer instructions"
+  "``SQ_INSTS_SALU``", "Instr", "Number of scalar arithmetic logic unit (SALU) instructions issued"
+  "``SQ_INSTS_SMEM``", "Instr", "Number of scalar memory instructions issued"
+  "``SQ_INSTS_SMEM_NORM``", "Instr", "Number of scalar memory instructions normalized to match ``smem_level`` issued"
+  "``SQ_INSTS_FLAT``", "Instr", "Number of flat instructions issued"
+  "``SQ_INSTS_FLAT_LDS_ONLY``", "Instr", "**MI200 series only** Number of FLAT instructions that read/write only from/to LDS issued. Works only if ``EARLY_TA_DONE`` is enabled."
+  "``SQ_INSTS_LDS``", "Instr", "Number of LDS instructions issued **(MI200: includes flat; MI300: does not include flat)**"
+  "``SQ_INSTS_GDS``", "Instr", "Number of global data share instructions issued"
+  "``SQ_INSTS_EXP_GDS``", "Instr", "Number of EXP and global data share instructions excluding skipped export instructions issued"
+  "``SQ_INSTS_BRANCH``", "Instr", "Number of branch instructions issued"
+  "``SQ_INSTS_SENDMSG``", "Instr", "Number of ``SENDMSG`` instructions including ``s_endpgm`` issued"
+  "``SQ_INSTS_VSKIPPED``", "Instr", "Number of vector instructions skipped"
+
+Flat instructions allow read, write, and atomic access to a generic memory address pointer that can
+resolve to any of the following physical memories:
+
+* Global Memory
+* Scratch ("private")
+* LDS ("shared")
+* Invalid - ``MEM_VIOL`` TrapStatus
+
+Matrix fused multiply-add operation counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SQ_INSTS_VALU_MFMA_MOPS_I8``", "IOP", "Number of 8-bit integer matrix FMA ops in the unit of 512"
+  "``SQ_INSTS_VALU_MFMA_MOPS_F16``", "FLOP", "Number of F16 floating matrix FMA ops in the unit of 512"
+  "``SQ_INSTS_VALU_MFMA_MOPS_BF16``", "FLOP", "Number of BF16 floating matrix FMA ops in the unit of 512"
+  "``SQ_INSTS_VALU_MFMA_MOPS_F32``", "FLOP", "Number of F32 floating matrix FMA ops in the unit of 512"
+  "``SQ_INSTS_VALU_MFMA_MOPS_F64``", "FLOP", "Number of F64 floating matrix FMA ops in the unit of 512"
+
+Level counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. note::
+
+  All level counters must be followed by ``SQ_ACCUM_PREV_HIRES`` counter to measure average latency.
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SQ_ACCUM_PREV``", "Count", "Accumulated counter sample value where accumulation takes place once every four cycles"
+  "``SQ_ACCUM_PREV_HIRES``", "Count", "Accumulated counter sample value where accumulation takes place once every cycle"
+  "``SQ_LEVEL_WAVES``", "Waves", "Number of inflight waves"
+  "``SQ_INST_LEVEL_VMEM``", "Instr", "Number of inflight vector memory (including flat) instructions"
+  "``SQ_INST_LEVEL_SMEM``", "Instr", "Number of inflight scalar memory instructions"
+  "``SQ_INST_LEVEL_LDS``", "Instr", "Number of inflight LDS (including flat) instructions"
+  "``SQ_IFETCH_LEVEL``", "Instr", "Number of inflight instruction fetch requests from the cache"
+
+Use the following formulae to calculate latencies:
+
+* Vector memory latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_INSTS_VMEM``
+* Wave latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_WAVES``
+* LDS latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_INSTS_LDS``
+* Scalar memory latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_INSTS_SMEM_NORM``
+* Instruction fetch latency = ``SQ_ACCUM_PREV_HIRES`` divided by ``SQ_IFETCH``
+
+Wavefront counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SQ_WAVES``", "Waves", "Number of wavefronts dispatched to sequencers, including both new and restored wavefronts"
+  "``SQ_WAVES_SAVED``", "Waves", "Number of context-saved waves"
+  "``SQ_WAVES_RESTORED``", "Waves", "Number of context-restored waves sent to sequencers"
+  "``SQ_WAVES_EQ_64``", "Waves", "Number of wavefronts with exactly 64 active threads sent to sequencers"
+  "``SQ_WAVES_LT_64``", "Waves", "Number of wavefronts with less than 64 active threads sent to sequencers"
+  "``SQ_WAVES_LT_48``", "Waves", "Number of wavefronts with less than 48 active threads sent to sequencers"
+  "``SQ_WAVES_LT_32``", "Waves", "Number of wavefronts with less than 32 active threads sent to sequencers"
+  "``SQ_WAVES_LT_16``", "Waves", "Number of wavefronts with less than 16 active threads sent to sequencers"
+
+Wavefront cycle counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SQ_CYCLES``", "Cycles", "Clock cycles"
+  "``SQ_BUSY_CYCLES``", "Cycles", "Number of cycles while the sequencer reports itself to be busy"
+  "``SQ_BUSY_CU_CYCLES``", "Qcycles", "Number of quad-cycles each compute unit is busy"
+  "``SQ_VALU_MFMA_BUSY_CYCLES``", "Cycles", "Number of cycles the matrix FMA arithmetic logic unit (ALU) is busy"
+  "``SQ_WAVE_CYCLES``", "Qcycles", "Number of quad-cycles spent by waves in the compute units"
+  "``SQ_WAIT_ANY``", "Qcycles", "Number of quad-cycles spent waiting for anything"
+  "``SQ_WAIT_INST_ANY``", "Qcycles", "Number of quad-cycles spent waiting for any instruction to be issued"
+  "``SQ_ACTIVE_INST_ANY``", "Qcycles", "Number of quad-cycles spent by each wave to work on an instruction"
+  "``SQ_ACTIVE_INST_VMEM``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a vector memory instruction"
+  "``SQ_ACTIVE_INST_LDS``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on an LDS instruction"
+  "``SQ_ACTIVE_INST_VALU``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a VALU instruction"
+  "``SQ_ACTIVE_INST_SCA``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a SALU or scalar memory instruction"
+  "``SQ_ACTIVE_INST_EXP_GDS``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on an ``EXPORT`` or ``GDS`` instruction"
+  "``SQ_ACTIVE_INST_MISC``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a ``BRANCH`` or ``SENDMSG`` instruction"
+  "``SQ_ACTIVE_INST_FLAT``", "Qcycles", "Number of quad-cycles spent by the sequencer instruction arbiter to work on a flat instruction"
+  "``SQ_INST_CYCLES_VMEM_WR``", "Qcycles", "Number of quad-cycles spent to send addr and cmd data for vector memory write instructions"
+  "``SQ_INST_CYCLES_VMEM_RD``", "Qcycles", "Number of quad-cycles spent to send addr and cmd data for vector memory read instructions"
+  "``SQ_INST_CYCLES_SMEM``", "Qcycles", "Number of quad-cycles spent to execute scalar memory reads"
+  "``SQ_INST_CYCLES_SALU``", "Qcycles", "Number of quad-cycles spent to execute non-memory read scalar operations"
+  "``SQ_THREAD_CYCLES_VALU``", "Qcycles", "Number of quad-cycles spent to execute VALU operations on active threads"
+  "``SQ_WAIT_INST_LDS``", "Qcycles", "Number of quad-cycles spent waiting for LDS instruction to be issued"
+
+``SQ_THREAD_CYCLES_VALU`` is similar to ``INST_CYCLES_VALU``, but it's multiplied by the number of
+active threads.
+
+LDS counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SQ_LDS_ATOMIC_RETURN``", "Cycles", "Number of atomic return cycles in LDS"
+  "``SQ_LDS_BANK_CONFLICT``", "Cycles", "Number of cycles LDS is stalled by bank conflicts"
+  "``SQ_LDS_ADDR_CONFLICT``", "Cycles", "Number of cycles LDS is stalled by address conflicts"
+  "``SQ_LDS_UNALIGNED_STALL``", "Cycles", "Number of cycles LDS is stalled processing flat unaligned load or store operations"
+  "``SQ_LDS_MEM_VIOLATIONS``", "Count", "Number of threads that have a memory violation in the LDS"
+  "``SQ_LDS_IDX_ACTIVE``", "Cycles", "Number of cycles LDS is used for indexed operations"
+
+Miscellaneous counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SQ_IFETCH``", "Count", "Number of instruction fetch requests from L1i, in 32-byte width"
+  "``SQ_ITEMS``", "Threads", "Number of valid items per wave"
+
+.. _l1i-and-sl1d-cache-counters:
+
+L1 instruction cache (L1i) and scalar L1 data cache (L1d) counters
+---------------------------------------------------------------------------------------------------------------
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition"
+
+  "``SQC_ICACHE_REQ``", "Req", "Number of L1 instruction (L1i) cache requests"
+  "``SQC_ICACHE_HITS``", "Count", "Number of L1i cache hits"
+  "``SQC_ICACHE_MISSES``", "Count", "Number of non-duplicate L1i cache misses including uncached requests"
+  "``SQC_ICACHE_MISSES_DUPLICATE``", "Count", "Number of duplicate L1i cache misses whose previous lookup miss on the same cache line is not fulfilled yet"
+  "``SQC_DCACHE_REQ``", "Req", "Number of scalar L1d requests"
+  "``SQC_DCACHE_INPUT_VALID_READYB``", "Cycles", "Number of cycles while sequencer input is valid but scalar L1d is not ready"
+  "``SQC_DCACHE_HITS``", "Count", "Number of scalar L1d hits"
+  "``SQC_DCACHE_MISSES``", "Count", "Number of non-duplicate scalar L1d misses including uncached requests"
+  "``SQC_DCACHE_MISSES_DUPLICATE``", "Count", "Number of duplicate scalar L1d misses"
+  "``SQC_DCACHE_REQ_READ_1``", "Req", "Number of constant cache read requests in a single 32-bit data word"
+  "``SQC_DCACHE_REQ_READ_2``", "Req", "Number of constant cache read requests in two 32-bit data words"
+  "``SQC_DCACHE_REQ_READ_4``", "Req", "Number of constant cache read requests in four 32-bit data words"
+  "``SQC_DCACHE_REQ_READ_8``", "Req", "Number of constant cache read requests in eight 32-bit data words"
+  "``SQC_DCACHE_REQ_READ_16``", "Req", "Number of constant cache read requests in 16 32-bit data words"
+  "``SQC_DCACHE_ATOMIC``", "Req", "Number of atomic requests"
+  "``SQC_TC_REQ``", "Req", "Number of texture cache requests that were issued by instruction and constant caches"
+  "``SQC_TC_INST_REQ``", "Req", "Number of instruction requests to the L2 cache"
+  "``SQC_TC_DATA_READ_REQ``", "Req", "Number of data read requests to the L2 cache"
+  "``SQC_TC_DATA_WRITE_REQ``", "Req", "Number of data write requests to the L2 cache"
+  "``SQC_TC_DATA_ATOMIC_REQ``", "Req", "Number of data atomic requests to the L2 cache"
+  "``SQC_TC_STALL``", "Cycles", "Number of cycles while the valid requests to the L2 cache are stalled"
+
+.. _vector-l1-cache-subsystem-counters:
+
+Vector L1 cache subsystem counters
+---------------------------------------------------------------------------------------------------------------
+
+The vector L1 cache subsystem counters are further classified into texture addressing unit, texture data
+unit, vector L1d or texture cache per pipe, and texture cache arbiter counters.
+
+Texture addressing unit counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
+
+  "``TA_TA_BUSY[n]``", "Cycles", "Texture addressing unit busy cycles", "0-15"
+  "``TA_TOTAL_WAVEFRONTS[n]``", "Instr", "Number of wavefronts processed by texture addressing unit", "0-15"
+  "``TA_BUFFER_WAVEFRONTS[n]``", "Instr", "Number of buffer wavefronts processed by texture addressing unit", "0-15"
+  "``TA_BUFFER_READ_WAVEFRONTS[n]``", "Instr", "Number of buffer read wavefronts processed by texture addressing unit", "0-15"
+  "``TA_BUFFER_WRITE_WAVEFRONTS[n]``", "Instr", "Number of buffer write wavefronts processed by texture addressing unit", "0-15"
+  "``TA_BUFFER_ATOMIC_WAVEFRONTS[n]``", "Instr", "Number of buffer atomic wavefronts processed by texture addressing unit", "0-15"
+  "``TA_BUFFER_TOTAL_CYCLES[n]``", "Cycles", "Number of buffer cycles (including read and write) issued to texture cache", "0-15"
+  "``TA_BUFFER_COALESCED_READ_CYCLES[n]``", "Cycles", "Number of coalesced buffer read cycles issued to texture cache", "0-15"
+  "``TA_BUFFER_COALESCED_WRITE_CYCLES[n]``", "Cycles", "Number of coalesced buffer write cycles issued to texture cache", "0-15"
+  "``TA_ADDR_STALLED_BY_TC_CYCLES[n]``", "Cycles", "Number of cycles texture addressing unit address path is stalled by texture cache", "0-15"
+  "``TA_DATA_STALLED_BY_TC_CYCLES[n]``", "Cycles", "Number of cycles texture addressing unit data path is stalled by texture cache", "0-15"
+  "``TA_ADDR_STALLED_BY_TD_CYCLES[n]``", "Cycles", "Number of cycles texture addressing unit address path is stalled by texture data unit", "0-15"
+  "``TA_FLAT_WAVEFRONTS[n]``", "Instr", "Number of flat opcode wavefronts processed by texture addressing unit", "0-15"
+  "``TA_FLAT_READ_WAVEFRONTS[n]``", "Instr", "Number of flat opcode read wavefronts processed by texture addressing unit", "0-15"
+  "``TA_FLAT_WRITE_WAVEFRONTS[n]``", "Instr", "Number of flat opcode write wavefronts processed by texture addressing unit", "0-15"
+  "``TA_FLAT_ATOMIC_WAVEFRONTS[n]``", "Instr", "Number of flat opcode atomic wavefronts processed by texture addressing unit", "0-15"
+
+Texture data unit counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
+
+  "``TD_TD_BUSY[n]``", "Cycles", "Texture data unit busy cycles while it is processing or waiting for data", "0-15"
+  "``TD_TC_STALL[n]``", "Cycles", "Number of cycles texture data unit is stalled waiting for texture cache data", "0-15"
+  "``TD_SPI_STALL[n]``", "Cycles", "Number of cycles texture data unit is stalled by shader processor input", "0-15"
+  "``TD_LOAD_WAVEFRONT[n]``", "Instr", "Number of wavefront instructions (read, write, atomic)", "0-15"
+  "``TD_STORE_WAVEFRONT[n]``", "Instr", "Number of write wavefront instructions", "0-15"
+  "``TD_ATOMIC_WAVEFRONT[n]``", "Instr", "Number of atomic wavefront instructions", "0-15"
+  "``TD_COALESCABLE_WAVEFRONT[n]``", "Instr", "Number of coalescable wavefronts according to texture addressing unit", "0-15"
+
+Texture cache per pipe counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
+
+  "``TCP_GATE_EN1[n]``", "Cycles", "Number of cycles vector L1d interface clocks are turned on", "0-15"
+  "``TCP_GATE_EN2[n]``", "Cycles", "Number of cycles vector L1d core clocks are turned on", "0-15"
+  "``TCP_TD_TCP_STALL_CYCLES[n]``", "Cycles", "Number of cycles texture data unit stalls vector L1d", "0-15"
+  "``TCP_TCR_TCP_STALL_CYCLES[n]``", "Cycles", "Number of cycles texture cache router stalls vector L1d", "0-15"
+  "``TCP_READ_TAGCONFLICT_STALL_CYCLES[n]``", "Cycles", "Number of cycles tag RAM conflict stalls on a read", "0-15"
+  "``TCP_WRITE_TAGCONFLICT_STALL_CYCLES[n]``", "Cycles", "Number of cycles tag RAM conflict stalls on a write", "0-15"
+  "``TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES[n]``", "Cycles", "Number of cycles tag RAM conflict stalls on an atomic", "0-15"
+  "``TCP_PENDING_STALL_CYCLES[n]``", "Cycles", "Number of cycles vector L1d is stalled due to data pending from L2 Cache", "0-15"
+  "``TCP_TCP_TA_DATA_STALL_CYCLES``", "Cycles", "Number of cycles texture cache per pipe stalls texture addressing unit data interface", "NA"
+  "``TCP_TA_TCP_STATE_READ[n]``", "Req", "Number of state reads", "0-15"
+  "``TCP_VOLATILE[n]``", "Req", "Number of L1 volatile pixels or buffers from texture addressing unit", "0-15"
+  "``TCP_TOTAL_ACCESSES[n]``", "Req", "Number of vector L1d accesses. Equals ``TCP_PERF_SEL_TOTAL_READ`` + ``TCP_PERF_SEL_TOTAL_NONREAD``", "0-15"
+  "``TCP_TOTAL_READ[n]``", "Req", "Number of vector L1d read accesses", "0-15"
+  "``TCP_TOTAL_WRITE[n]``", "Req", "Number of vector L1d write accesses", "0-15"
+  "``TCP_TOTAL_ATOMIC_WITH_RET[n]``", "Req", "Number of vector L1d atomic requests with return", "0-15"
+  "``TCP_TOTAL_ATOMIC_WITHOUT_RET[n]``", "Req", "Number of vector L1d atomic requests without return", "0-15"
+  "``TCP_TOTAL_WRITEBACK_INVALIDATES[n]``", "Count", "Total number of vector L1d writebacks and invalidates", "0-15"
+  "``TCP_UTCL1_REQUEST[n]``", "Req", "Number of address translation requests to unified translation cache (L1)", "0-15"
+  "``TCP_UTCL1_TRANSLATION_HIT[n]``", "Req", "Number of unified translation cache (L1) translation hits", "0-15"
+  "``TCP_UTCL1_TRANSLATION_MISS[n]``", "Req", "Number of unified translation cache (L1) translation misses", "0-15"
+  "``TCP_UTCL1_PERMISSION_MISS[n]``", "Req", "Number of unified translation cache (L1) permission misses", "0-15"
+  "``TCP_TOTAL_CACHE_ACCESSES[n]``", "Req", "Number of vector L1d cache accesses including hits and misses", "0-15"
+  "``TCP_TCP_LATENCY[n]``", "Cycles", "**MI200 series only** Accumulated wave access latency to vL1D over all wavefronts", "0-15"
+  "``TCP_TCC_READ_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for reads and atomics with return", "0-15"
+  "``TCP_TCC_WRITE_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for writes and atomics without return", "0-15"
+  "``TCP_TCC_READ_REQ[n]``", "Req", "Number of read requests to L2 cache", "0-15"
+  "``TCP_TCC_WRITE_REQ[n]``", "Req", "Number of write requests to L2 cache", "0-15"
+  "``TCP_TCC_ATOMIC_WITH_RET_REQ[n]``", "Req", "Number of atomic requests to L2 cache with return", "0-15"
+  "``TCP_TCC_ATOMIC_WITHOUT_RET_REQ[n]``", "Req", "Number of atomic requests to L2 cache without return", "0-15"
+  "``TCP_TCC_NC_READ_REQ[n]``", "Req", "Number of non-coherently cached read requests to L2 cache", "0-15"
+  "``TCP_TCC_UC_READ_REQ[n]``", "Req", "Number of uncached read requests to L2 cache", "0-15"
+  "``TCP_TCC_CC_READ_REQ[n]``", "Req", "Number of coherently cached read requests to L2 cache", "0-15"
+  "``TCP_TCC_RW_READ_REQ[n]``", "Req", "Number of coherently cached with write read requests to L2 cache", "0-15"
+  "``TCP_TCC_NC_WRITE_REQ[n]``", "Req", "Number of non-coherently cached write requests to L2 cache", "0-15"
+  "``TCP_TCC_UC_WRITE_REQ[n]``", "Req", "Number of uncached write requests to L2 cache", "0-15"
+  "``TCP_TCC_CC_WRITE_REQ[n]``", "Req", "Number of coherently cached write requests to L2 cache", "0-15"
+  "``TCP_TCC_RW_WRITE_REQ[n]``", "Req", "Number of coherently cached with write write requests to L2 cache", "0-15"
+  "``TCP_TCC_NC_ATOMIC_REQ[n]``", "Req", "Number of non-coherently cached atomic requests to L2 cache", "0-15"
+  "``TCP_TCC_UC_ATOMIC_REQ[n]``", "Req", "Number of uncached atomic requests to L2 cache", "0-15"
+  "``TCP_TCC_CC_ATOMIC_REQ[n]``", "Req", "Number of coherently cached atomic requests to L2 cache", "0-15"
+  "``TCP_TCC_RW_ATOMIC_REQ[n]``", "Req", "Number of coherently cached with write atomic requests to L2 cache", "0-15"
+
+Note that:
+
+* ``TCP_TOTAL_READ[n]`` = ``TCP_PERF_SEL_TOTAL_HIT_LRU_READ`` + ``TCP_PERF_SEL_TOTAL_MISS_LRU_READ`` + ``TCP_PERF_SEL_TOTAL_MISS_EVICT_READ``
+* ``TCP_TOTAL_WRITE[n]`` = ``TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE`` + ``TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE``
+* ``TCP_TOTAL_WRITEBACK_INVALIDATES[n]`` = ``TCP_PERF_SEL_TOTAL_WBINVL1`` + ``TCP_PERF_SEL_TOTAL_WBINVL1_VOL`` + ``TCP_PERF_SEL_CP_TCP_INVALIDATE`` + ``TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL``
+
+Texture cache arbiter counters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. csv-table::
+  :header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
+
+  "``TCA_CYCLE[n]``", "Cycles", "Number of texture cache arbiter cycles", "0-31"
+  "``TCA_BUSY[n]``", "Cycles", "Number of cycles texture cache arbiter has a pending request", "0-31"
+
+.. _l2-cache-access-counters:
+
+L2 cache access counters
+---------------------------------------------------------------------------------------------------------------
+
+L2 cache is also known as texture cache per channel.
+
+.. tab-set::
+
+    .. tab-item:: MI300 hardware counter
+
+      .. csv-table::
+        :header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
+
+        "``TCC_CYCLE[n]``", "Cycles", "Number of L2 cache free-running clocks", "0-31"
+        "``TCC_BUSY[n]``", "Cycles", "Number of L2 cache busy cycles", "0-31"
+        "``TCC_REQ[n]``", "Req", "Number of L2 cache requests of all types (measured at the tag block)", "0-31"
+        "``TCC_STREAMING_REQ[n]``", "Req", "Number of L2 cache streaming requests (measured at the tag block)", "0-31"
+        "``TCC_NC_REQ[n]``", "Req", "Number of non-coherently cached requests (measured at the tag block)", "0-31"
+        "``TCC_UC_REQ[n]``", "Req", "Number of uncached requests. This is measured at the tag block", "0-31"
+        "``TCC_CC_REQ[n]``", "Req", "Number of coherently cached requests. This is measured at the tag block", "0-31"
+        "``TCC_RW_REQ[n]``", "Req", "Number of coherently cached with write requests. This is measured at the tag block", "0-31"
+        "``TCC_PROBE[n]``", "Req", "Number of probe requests", "0-31"
+        "``TCC_PROBE_ALL[n]``", "Req", "Number of external probe requests with ``EA_TCC_preq_all == 1``", "0-31"
+        "``TCC_READ[n]``", "Req", "Number of L2 cache read requests (includes compressed reads but not metadata reads)", "0-31"
+        "``TCC_WRITE[n]``", "Req", "Number of L2 cache write requests", "0-31"
+        "``TCC_ATOMIC[n]``", "Req", "Number of L2 cache atomic requests of all types", "0-31"
+        "``TCC_HIT[n]``", "Req", "Number of L2 cache hits", "0-31"
+        "``TCC_MISS[n]``", "Req", "Number of L2 cache misses", "0-31"
+        "``TCC_WRITEBACK[n]``", "Req", "Number of lines written back to the main memory, including writebacks of dirty lines and uncached write or atomic requests", "0-31"
+        "``TCC_EA0_WRREQ[n]``", "Req", "Number of 32-byte and 64-byte transactions going over the ``TC_EA_wrreq`` interface (doesn't include probe commands)", "0-31"
+        "``TCC_EA0_WRREQ_64B[n]``", "Req", "Total number of 64-byte transactions (write or ``CMPSWAP``) going over the ``TC_EA_wrreq`` interface", "0-31"
+        "``TCC_EA0_WR_UNCACHED_32B[n]``", "Req", "Number of 32 or 64-byte write or atomic going over the ``TC_EA_wrreq`` interface due to uncached traffic", "0-31"
+        "``TCC_EA0_WRREQ_STALL[n]``", "Cycles", "Number of cycles a write request is stalled", "0-31"
+        "``TCC_EA0_WRREQ_IO_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of input-output (IO) credits", "0-31"
+        "``TCC_EA0_WRREQ_GMI_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of GMI credits", "0-31"
+        "``TCC_EA0_WRREQ_DRAM_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of DRAM credits", "0-31"
+        "``TCC_TOO_MANY_EA_WRREQS_STALL[n]``", "Cycles", "Number of cycles the L2 cache is unable to send an efficiency arbiter write request due to it reaching its maximum capacity of pending efficiency arbiter write requests", "0-31"
+        "``TCC_EA0_WRREQ_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter write requests in flight", "0-31"
+        "``TCC_EA0_ATOMIC[n]``", "Req", "Number of 32-byte or 64-byte atomic requests going over the ``TC_EA_wrreq`` interface", "0-31"
+        "``TCC_EA0_ATOMIC_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter atomic requests in flight", "0-31"
+        "``TCC_EA0_RDREQ[n]``", "Req", "Number of 32-byte or 64-byte read requests to efficiency arbiter", "0-31"
+        "``TCC_EA0_RDREQ_32B[n]``", "Req", "Number of 32-byte read requests to efficiency arbiter", "0-31"
+        "``TCC_EA0_RD_UNCACHED_32B[n]``", "Req", "Number of 32-byte efficiency arbiter reads due to uncached traffic. A 64-byte request is counted as 2", "0-31"
+        "``TCC_EA0_RDREQ_IO_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of IO credits", "0-31"
+        "``TCC_EA0_RDREQ_GMI_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of GMI credits", "0-31"
+        "``TCC_EA0_RDREQ_DRAM_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of DRAM credits", "0-31"
+        "``TCC_EA0_RDREQ_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter read requests in flight", "0-31"
+        "``TCC_EA0_RDREQ_DRAM[n]``", "Req", "Number of 32-byte or 64-byte efficiency arbiter read requests to High Bandwidth Memory (HBM)", "0-31"
+        "``TCC_EA0_WRREQ_DRAM[n]``", "Req", "Number of 32-byte or 64-byte efficiency arbiter write requests to HBM", "0-31"
+        "``TCC_TAG_STALL[n]``", "Cycles", "Number of cycles the normal request pipeline in the tag is stalled for any reason", "0-31"
+        "``TCC_NORMAL_WRITEBACK[n]``", "Req", "Number of writebacks due to requests that are not writeback requests", "0-31"
+        "``TCC_ALL_TC_OP_WB_WRITEBACK[n]``", "Req", "Number of writebacks due to all ``TC_OP`` writeback requests", "0-31"
+        "``TCC_NORMAL_EVICT[n]``", "Req", "Number of evictions due to requests that are not invalidate or probe requests", "0-31"
+        "``TCC_ALL_TC_OP_INV_EVICT[n]``", "Req", "Number of evictions due to all ``TC_OP`` invalidate requests", "0-31"
+
+    .. tab-item:: MI200 hardware counter
+
+      .. csv-table::
+        :header: "Hardware counter", "Unit", "Definition", "Value range for ``n``"
+
+        "``TCC_CYCLE[n]``", "Cycles", "Number of L2 cache free-running clocks", "0-31"
+        "``TCC_BUSY[n]``", "Cycles", "Number of L2 cache busy cycles", "0-31"
+        "``TCC_REQ[n]``", "Req", "Number of L2 cache requests of all types (measured at the tag block)", "0-31"
+        "``TCC_STREAMING_REQ[n]``", "Req", "Number of L2 cache streaming requests (measured at the tag block)", "0-31"
+        "``TCC_NC_REQ[n]``", "Req", "Number of non-coherently cached requests (measured at the tag block)", "0-31"
+        "``TCC_UC_REQ[n]``", "Req", "Number of uncached requests. This is measured at the tag block", "0-31"
+        "``TCC_CC_REQ[n]``", "Req", "Number of coherently cached requests. This is measured at the tag block", "0-31"
+        "``TCC_RW_REQ[n]``", "Req", "Number of coherently cached with write requests. This is measured at the tag block", "0-31"
+        "``TCC_PROBE[n]``", "Req", "Number of probe requests", "0-31"
+        "``TCC_PROBE_ALL[n]``", "Req", "Number of external probe requests with ``EA_TCC_preq_all == 1``", "0-31"
+        "``TCC_READ[n]``", "Req", "Number of L2 cache read requests (includes compressed reads but not metadata reads)", "0-31"
+        "``TCC_WRITE[n]``", "Req", "Number of L2 cache write requests", "0-31"
+        "``TCC_ATOMIC[n]``", "Req", "Number of L2 cache atomic requests of all types", "0-31"
+        "``TCC_HIT[n]``", "Req", "Number of L2 cache hits", "0-31"
+        "``TCC_MISS[n]``", "Req", "Number of L2 cache misses", "0-31"
+        "``TCC_WRITEBACK[n]``", "Req", "Number of lines written back to the main memory, including writebacks of dirty lines and uncached write or atomic requests", "0-31"
+        "``TCC_EA_WRREQ[n]``", "Req", "Number of 32-byte and 64-byte transactions going over the ``TC_EA_wrreq`` interface (doesn't include probe commands)", "0-31"
+        "``TCC_EA_WRREQ_64B[n]``", "Req", "Total number of 64-byte transactions (write or ``CMPSWAP``) going over the ``TC_EA_wrreq`` interface", "0-31"
+        "``TCC_EA_WR_UNCACHED_32B[n]``", "Req", "Number of 32-byte write or atomic requests going over the ``TC_EA_wrreq`` interface due to uncached traffic. A 64-byte request will be counted as 2", "0-31"
+        "``TCC_EA_WRREQ_STALL[n]``", "Cycles", "Number of cycles a write request is stalled", "0-31"
+        "``TCC_EA_WRREQ_IO_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of input-output (IO) credits", "0-31"
+        "``TCC_EA_WRREQ_GMI_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of GMI credits", "0-31"
+        "``TCC_EA_WRREQ_DRAM_CREDIT_STALL[n]``", "Cycles", "Number of cycles an efficiency arbiter write request is stalled due to the interface running out of DRAM credits", "0-31"
+        "``TCC_TOO_MANY_EA_WRREQS_STALL[n]``", "Cycles", "Number of cycles the L2 cache is unable to send an efficiency arbiter write request due to it reaching its maximum capacity of pending efficiency arbiter write requests", "0-31"
+        "``TCC_EA_WRREQ_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter write requests in flight", "0-31"
+        "``TCC_EA_ATOMIC[n]``", "Req", "Number of 32-byte or 64-byte atomic requests going over the ``TC_EA_wrreq`` interface", "0-31"
+        "``TCC_EA_ATOMIC_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter atomic requests in flight", "0-31"
+        "``TCC_EA_RDREQ[n]``", "Req", "Number of 32-byte or 64-byte read requests to efficiency arbiter", "0-31"
+        "``TCC_EA_RDREQ_32B[n]``", "Req", "Number of 32-byte read requests to efficiency arbiter", "0-31"
+        "``TCC_EA_RD_UNCACHED_32B[n]``", "Req", "Number of 32-byte efficiency arbiter reads due to uncached traffic. A 64-byte request is counted as 2", "0-31"
+        "``TCC_EA_RDREQ_IO_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of IO credits", "0-31"
+        "``TCC_EA_RDREQ_GMI_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of GMI credits", "0-31"
+        "``TCC_EA_RDREQ_DRAM_CREDIT_STALL[n]``", "Cycles", "Number of cycles there is a stall due to the read request interface running out of DRAM credits", "0-31"
+        "``TCC_EA_RDREQ_LEVEL[n]``", "Req", "The accumulated number of efficiency arbiter read requests in flight", "0-31"
+        "``TCC_EA_RDREQ_DRAM[n]``", "Req", "Number of 32-byte or 64-byte efficiency arbiter read requests to High Bandwidth Memory (HBM)", "0-31"
+        "``TCC_EA_WRREQ_DRAM[n]``", "Req", "Number of 32-byte or 64-byte efficiency arbiter write requests to HBM", "0-31"
+        "``TCC_TAG_STALL[n]``", "Cycles", "Number of cycles the normal request pipeline in the tag is stalled for any reason", "0-31"
+        "``TCC_NORMAL_WRITEBACK[n]``", "Req", "Number of writebacks due to requests that are not writeback requests", "0-31"
+        "``TCC_ALL_TC_OP_WB_WRITEBACK[n]``", "Req", "Number of writebacks due to all ``TC_OP`` writeback requests", "0-31"
+        "``TCC_NORMAL_EVICT[n]``", "Req", "Number of evictions due to requests that are not invalidate or probe requests", "0-31"
+        "``TCC_ALL_TC_OP_INV_EVICT[n]``", "Req", "Number of evictions due to all ``TC_OP`` invalidate requests", "0-31"
+
+Note the following:
+
+* ``TCC_REQ[n]`` may be more than the number of requests arriving at the texture cache per channel,
+  but it's a good indication of the total amount of work that needs to be performed.
+* For ``TCC_EA0_WRREQ[n]``, atomics may travel over the same interface and are generally classified as
+  write requests.
+* CC mtypes can produce uncached requests, and those are included in
+  ``TCC_EA0_WR_UNCACHED_32B[n]``.
+* ``TCC_EA0_WRREQ_LEVEL[n]`` is primarily intended to measure average efficiency arbiter write latency.
+
+  * Average write latency = ``TCC_PERF_SEL_EA0_WRREQ_LEVEL`` divided by ``TCC_PERF_SEL_EA0_WRREQ``
+
+* ``TCC_EA0_ATOMIC_LEVEL[n]`` is primarily intended to measure average efficiency arbiter atomic
+  latency.
+
+  * Average atomic latency = ``TCC_PERF_SEL_EA0_WRREQ_ATOMIC_LEVEL`` divided by ``TCC_PERF_SEL_EA0_WRREQ_ATOMIC``
+
+* ``TCC_EA0_RDREQ_LEVEL[n]`` is primarily intended to measure average efficiency arbiter read latency.
+
+  * Average read latency = ``TCC_PERF_SEL_EA0_RDREQ_LEVEL`` divided by ``TCC_PERF_SEL_EA0_RDREQ``
+
+* Stalls can occur regardless of the need for a read to be performed.
+* Normally, stalls are measured exactly at one point in the pipeline; however, in the case of
+  ``TCC_TAG_STALL[n]``, probes can stall the pipeline at a variety of places. There is no single point that
+  can accurately measure the total stalls.
+
+MI300 and MI200 series derived metrics list
+==============================================================
+
+.. csv-table::
+  :header: "Hardware counter", "Definition"
+
+  "``ALUStalledByLDS``", "Percentage of GPU time ALU units are stalled due to the LDS input queue being full or the output queue not being ready (value range: 0% (optimal) to 100%)"
+  "``FetchSize``", "Total kilobytes fetched from the video memory; measured with all extra fetches and any cache or memory effects taken into account"
+  "``FlatLDSInsts``", "Average number of flat instructions that read from or write to LDS, run per work item (affected by flow control)"
+  "``FlatVMemInsts``", "Average number of flat instructions that read from or write to the video memory, run per work item (affected by flow control). Includes flat instructions that read from or write to scratch"
+  "``GDSInsts``", "Average number of global data share read or write instructions run per work item (affected by flow control)"
+  "``GPUBusy``", "Percentage of time GPU is busy"
+  "``L2CacheHit``", "Percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache (value range: 0% (no hit) to 100% (optimal))"
+  "``LDSBankConflict``", "Percentage of GPU time LDS is stalled by bank conflicts (value range: 0% (optimal) to 100%)"
+  "``LDSInsts``", "Average number of LDS read or write instructions run per work item (affected by flow control). Excludes flat instructions that read from or write to LDS."
+  "``MemUnitBusy``", "Percentage of GPU time the memory unit is active, which is measured with all extra fetches and writes and any cache or memory effects taken into account (value range: 0% to 100% (fetch-bound))"
+  "``MemUnitStalled``", "Percentage of GPU time the memory unit is stalled (value range: 0% (optimal) to 100%)"
+  "``MemWrites32B``", "Total number of effective 32B write transactions to the memory"
+  "``TCA_BUSY_sum``", "Total number of cycles texture cache arbiter has a pending request, over all texture cache arbiter instances"
+  "``TCA_CYCLE_sum``", "Total number of cycles over all texture cache arbiter instances"
+  "``SALUBusy``", "Percentage of GPU time scalar ALU instructions are processed (value range: 0% to 100% (optimal))"
+  "``SALUInsts``", "Average number of scalar ALU instructions run per work item (affected by flow control)"
+  "``SFetchInsts``", "Average number of scalar fetch instructions from the video memory run per work item (affected by flow control)"
+  "``VALUBusy``", "Percentage of GPU time vector ALU instructions are processed (value range: 0% to 100% (optimal))"
+  "``VALUInsts``", "Average number of vector ALU instructions run per work item (affected by flow control)"
+  "``VALUUtilization``", "Percentage of active vector ALU threads in a wave, where a lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64 (value range: 0% to 100% (optimal - no thread divergence))"
+  "``VFetchInsts``", "Average number of vector fetch instructions from the video memory run per work-item (affected by flow control); excludes flat instructions that fetch from video memory"
+  "``VWriteInsts``", "Average number of vector write instructions to the video memory run per work-item (affected by flow control); excludes flat instructions that write to video memory"
+  "``Wavefronts``", "Total wavefronts"
+  "``WRITE_REQ_32B``", "Total number of 32-byte effective memory writes"
+  "``WriteSize``", "Total kilobytes written to the video memory; measured with all extra fetches and any cache or memory effects taken into account"
+  "``WriteUnitStalled``", "Percentage of GPU time the write unit is stalled (value range: 0% (optimal) to 100%)"
+
+You can lower ``ALUStalledByLDS`` by reducing LDS bank conflicts or number of LDS accesses.
+You can lower ``MemUnitStalled`` by reducing the number or size of fetches and writes.
+``MemUnitBusy`` includes the stall time (``MemUnitStalled``).
+
+Hardware counters by and over all texture addressing unit instances
+---------------------------------------------------------------------------------------------------------------
+
+The following table shows the hardware counters *by* all texture addressing unit instances.
+
+.. csv-table::
+  :header: "Hardware counter", "Definition"
+
+  "``TA_BUFFER_WAVEFRONTS_sum``", "Total number of buffer wavefronts processed"
+  "``TA_BUFFER_READ_WAVEFRONTS_sum``", "Total number of buffer read wavefronts processed"
+  "``TA_BUFFER_WRITE_WAVEFRONTS_sum``", "Total number of buffer write wavefronts processed"
+  "``TA_BUFFER_ATOMIC_WAVEFRONTS_sum``", "Total number of buffer atomic wavefronts processed"
+  "``TA_BUFFER_TOTAL_CYCLES_sum``", "Total number of buffer cycles (including read and write) issued to texture cache"
+  "``TA_BUFFER_COALESCED_READ_CYCLES_sum``", "Total number of coalesced buffer read cycles issued to texture cache"
+  "``TA_BUFFER_COALESCED_WRITE_CYCLES_sum``", "Total number of coalesced buffer write cycles issued to texture cache"
+  "``TA_FLAT_READ_WAVEFRONTS_sum``", "Sum of flat opcode reads processed"
+  "``TA_FLAT_WRITE_WAVEFRONTS_sum``", "Sum of flat opcode writes processed"
+  "``TA_FLAT_WAVEFRONTS_sum``", "Total number of flat opcode wavefronts processed"
+  "``TA_FLAT_READ_WAVEFRONTS_sum``", "Total number of flat opcode read wavefronts processed"
+  "``TA_FLAT_ATOMIC_WAVEFRONTS_sum``", "Total number of flat opcode atomic wavefronts processed"
+  "``TA_TOTAL_WAVEFRONTS_sum``", "Total number of wavefronts processed"
+
+The following table shows the hardware counters *over* all texture addressing unit instances.
+
+.. csv-table::
+  :header: "Hardware counter", "Definition"
+
+  "``TA_ADDR_STALLED_BY_TC_CYCLES_sum``", "Total number of cycles texture addressing unit address path is stalled by texture cache"
+  "``TA_ADDR_STALLED_BY_TD_CYCLES_sum``", "Total number of cycles texture addressing unit address path is stalled by texture data unit"
+  "``TA_BUSY_avr``", "Average number of busy cycles"
+  "``TA_BUSY_max``", "Maximum number of texture addressing unit busy cycles"
+  "``TA_BUSY_min``", "Minimum number of texture addressing unit busy cycles"
+  "``TA_DATA_STALLED_BY_TC_CYCLES_sum``", "Total number of cycles texture addressing unit data path is stalled by texture cache"
+  "``TA_TA_BUSY_sum``", "Total number of texture addressing unit busy cycles"
+
+Hardware counters over all texture cache per channel instances
+---------------------------------------------------------------------------------------------------------------
+
+.. csv-table::
+  :header: "Hardware counter", "Definition"
+
+  "``TCC_ALL_TC_OP_WB_WRITEBACK_sum``", "Total number of writebacks due to all ``TC_OP`` writeback requests."
+  "``TCC_ALL_TC_OP_INV_EVICT_sum``", "Total number of evictions due to all ``TC_OP`` invalidate requests."
+  "``TCC_ATOMIC_sum``", "Total number of L2 cache atomic requests of all types."
+  "``TCC_BUSY_avr``", "Average number of L2 cache busy cycles."
+  "``TCC_BUSY_sum``", "Total number of L2 cache busy cycles."
+  "``TCC_CC_REQ_sum``", "Total number of coherently cached requests."
+  "``TCC_CYCLE_sum``", "Total number of L2 cache free-running clocks."
+  "``TCC_EA0_WRREQ_sum``", "Total number of 32-byte and 64-byte transactions going over the ``TC_EA0_wrreq`` interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands."
+  "``TCC_EA0_WRREQ_64B_sum``", "Total number of 64-byte transactions (write or ``CMPSWAP``) going over the ``TC_EA0_wrreq`` interface."
+  "``TCC_EA0_WR_UNCACHED_32B_sum``", "Total number of 32-byte write or atomic going over the ``TC_EA0_wrreq`` interface due to uncached traffic. Note that coherently cached mtypes can produce uncached requests, and those are included in this. A 64-byte request is counted as 2."
+  "``TCC_EA0_WRREQ_STALL_sum``", "Total number of cycles a write request is stalled, over all instances."
+  "``TCC_EA0_WRREQ_IO_CREDIT_STALL_sum``", "Total number of cycles an efficiency arbiter write request is stalled due to the interface running out of IO credits, over all instances."
+  "``TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum``", "Total number of cycles an efficiency arbiter write request is stalled due to the interface running out of GMI credits, over all instances."
+  "``TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum``", "Total number of cycles an efficiency arbiter write request is stalled due to the interface running out of DRAM credits, over all instances."
+  "``TCC_EA0_WRREQ_LEVEL_sum``", "Total number of efficiency arbiter write requests in flight."
+  "``TCC_EA0_RDREQ_LEVEL_sum``", "Total number of efficiency arbiter read requests in flight."
+  "``TCC_EA0_ATOMIC_sum``", "Total number of 32-byte or 64-byte atomic requests going over the ``TC_EA0_wrreq`` interface."
+  "``TCC_EA0_ATOMIC_LEVEL_sum``", "Total number of efficiency arbiter atomic requests in flight."
+  "``TCC_EA0_RDREQ_sum``", "Total number of 32-byte or 64-byte read requests to efficiency arbiter."
+  "``TCC_EA0_RDREQ_32B_sum``", "Total number of 32-byte read requests to efficiency arbiter."
+  "``TCC_EA0_RD_UNCACHED_32B_sum``", "Total number of 32-byte efficiency arbiter reads due to uncached traffic."
+  "``TCC_EA0_RDREQ_IO_CREDIT_STALL_sum``", "Total number of cycles there is a stall due to the read request interface running out of IO credits."
+  "``TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum``", "Total number of cycles there is a stall due to the read request interface running out of GMI credits."
+  "``TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum``", "Total number of cycles there is a stall due to the read request interface running out of DRAM credits."
+  "``TCC_EA0_RDREQ_DRAM_sum``", "Total number of 32-byte or 64-byte efficiency arbiter read requests to HBM."
+  "``TCC_EA0_WRREQ_DRAM_sum``", "Total number of 32-byte or 64-byte efficiency arbiter write requests to HBM."
+  "``TCC_HIT_sum``", "Total number of L2 cache hits."
+  "``TCC_MISS_sum``", "Total number of L2 cache misses."
+  "``TCC_NC_REQ_sum``", "Total number of non-coherently cached requests."
+  "``TCC_NORMAL_WRITEBACK_sum``", "Total number of writebacks due to requests that are not writeback requests."
+  "``TCC_NORMAL_EVICT_sum``", "Total number of evictions due to requests that are not invalidate or probe requests."
+  "``TCC_PROBE_sum``", "Total number of probe requests."
+  "``TCC_PROBE_ALL_sum``", "Total number of external probe requests with ``EA0_TCC_preq_all == 1``."
+  "``TCC_READ_sum``", "Total number of L2 cache read requests (including compressed reads but not metadata reads)."
+  "``TCC_REQ_sum``", "Total number of all types of L2 cache requests."
+  "``TCC_RW_REQ_sum``", "Total number of coherently cached with write requests."
+  "``TCC_STREAMING_REQ_sum``", "Total number of L2 cache streaming requests."
+  "``TCC_TAG_STALL_sum``", "Total number of cycles the normal request pipeline in the tag is stalled for any reason."
+  "``TCC_TOO_MANY_EA0_WRREQS_STALL_sum``", "Total number of cycles L2 cache is unable to send an efficiency arbiter write request due to it reaching its maximum capacity of pending efficiency arbiter write requests."
+  "``TCC_UC_REQ_sum``", "Total number of uncached requests."
+  "``TCC_WRITE_sum``", "Total number of L2 cache write requests."
+  "``TCC_WRITEBACK_sum``", "Total number of lines written back to the main memory including writebacks of dirty lines and uncached write or atomic requests."
+  "``TCC_WRREQ_STALL_max``", "Maximum number of cycles a write request is stalled."
+
+Hardware counters by, for, or over all texture cache per pipe instances
+----------------------------------------------------------------------------------------------------------------
+
+The following table shows the hardware counters *by* all texture cache per pipe instances.
+
+.. csv-table::
+  :header: "Hardware counter", "Definition"
+
+  "``TCP_TA_TCP_STATE_READ_sum``", "Total number of state reads by all texture cache per pipe instances"
+  "``TCP_TOTAL_CACHE_ACCESSES_sum``", "Total number of vector L1d accesses (including hits and misses)"
+  "``TCP_UTCL1_PERMISSION_MISS_sum``", "Total number of unified translation cache (L1) permission misses"
+  "``TCP_UTCL1_REQUEST_sum``", "Total number of address translation requests to unified translation cache (L1)"
+  "``TCP_UTCL1_TRANSLATION_MISS_sum``", "Total number of unified translation cache (L1) translation misses"
+  "``TCP_UTCL1_TRANSLATION_HIT_sum``", "Total number of unified translation cache (L1) translation hits"
+
+The following table shows the hardware counters *for* all texture cache per pipe instances.
+
+.. csv-table::
+  :header: "Hardware counter", "Definition"
+
+  "``TCP_TCC_READ_REQ_LATENCY_sum``", "Total vector L1d to L2 request latency over all wavefronts for reads and atomics with return"
+  "``TCP_TCC_WRITE_REQ_LATENCY_sum``", "Total vector L1d to L2 request latency over all wavefronts for writes and atomics without return"
+  "``TCP_TCP_LATENCY_sum``", "Total wave access latency to vector L1d over all wavefronts"
+
+The following table shows the hardware counters *over* all texture cache per pipe instances.
+
+.. csv-table::
+  :header: "Hardware counter", "Definition"
+
+  "``TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum``", "Total number of cycles tag RAM conflict stalls on an atomic"
+  "``TCP_GATE_EN1_sum``", "Total number of cycles vector L1d interface clocks are turned on"
+  "``TCP_GATE_EN2_sum``", "Total number of cycles vector L1d core clocks are turned on"
+  "``TCP_PENDING_STALL_CYCLES_sum``", "Total number of cycles vector L1d cache is stalled due to data pending from L2 Cache"
+  "``TCP_READ_TAGCONFLICT_STALL_CYCLES_sum``", "Total number of cycles tag RAM conflict stalls on a read"
+  "``TCP_TCC_ATOMIC_WITH_RET_REQ_sum``", "Total number of atomic requests to L2 cache with return"
+  "``TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum``", "Total number of atomic requests to L2 cache without return"
+  "``TCP_TCC_CC_READ_REQ_sum``", "Total number of coherently cached read requests to L2 cache"
+  "``TCP_TCC_CC_WRITE_REQ_sum``", "Total number of coherently cached write requests to L2 cache"
+  "``TCP_TCC_CC_ATOMIC_REQ_sum``", "Total number of coherently cached atomic requests to L2 cache"
+  "``TCP_TCC_NC_READ_REQ_sum``", "Total number of non-coherently cached read requests to L2 cache"
+  "``TCP_TCC_NC_WRITE_REQ_sum``", "Total number of non-coherently cached write requests to L2 cache"
+  "``TCP_TCC_NC_ATOMIC_REQ_sum``", "Total number of non-coherently cached atomic requests to L2 cache"
+  "``TCP_TCC_READ_REQ_sum``", "Total number of read requests to L2 cache"
+  "``TCP_TCC_RW_READ_REQ_sum``", "Total number of coherently cached with write read requests to L2 cache"
+  "``TCP_TCC_RW_WRITE_REQ_sum``", "Total number of coherently cached with write write requests to L2 cache"
+  "``TCP_TCC_RW_ATOMIC_REQ_sum``", "Total number of coherently cached with write atomic requests to L2 cache"
+  "``TCP_TCC_UC_READ_REQ_sum``", "Total number of uncached read requests to L2 cache"
+  "``TCP_TCC_UC_WRITE_REQ_sum``", "Total number of uncached write requests to L2 cache"
+  "``TCP_TCC_UC_ATOMIC_REQ_sum``", "Total number of uncached atomic requests to L2 cache"
+  "``TCP_TCC_WRITE_REQ_sum``", "Total number of write requests to L2 cache"
+  "``TCP_TCR_TCP_STALL_CYCLES_sum``", "Total number of cycles texture cache router stalls vector L1d"
+  "``TCP_TD_TCP_STALL_CYCLES_sum``", "Total number of cycles texture data unit stalls vector L1d"
+  "``TCP_TOTAL_ACCESSES_sum``", "Total number of vector L1d accesses"
+  "``TCP_TOTAL_READ_sum``", "Total number of vector L1d read accesses"
+  "``TCP_TOTAL_WRITE_sum``", "Total number of vector L1d write accesses"
+  "``TCP_TOTAL_ATOMIC_WITH_RET_sum``", "Total number of vector L1d atomic requests with return"
+  "``TCP_TOTAL_ATOMIC_WITHOUT_RET_sum``", "Total number of vector L1d atomic requests without return"
+  "``TCP_TOTAL_WRITEBACK_INVALIDATES_sum``", "Total number of vector L1d writebacks and invalidates"
+  "``TCP_VOLATILE_sum``", "Total number of L1 volatile pixels or buffers from texture addressing unit"
+  "``TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum``", "Total number of cycles tag RAM conflict stalls on a write"
+
+Hardware counters over all texture data unit instances
+--------------------------------------------------------
+
+.. csv-table::
+  :header: "Hardware counter", "Definition"
+
+  "``TD_ATOMIC_WAVEFRONT_sum``", "Total number of atomic wavefront instructions"
+  "``TD_COALESCABLE_WAVEFRONT_sum``", "Total number of coalescable wavefronts according to texture addressing unit"
+  "``TD_LOAD_WAVEFRONT_sum``", "Total number of wavefront instructions (read, write, atomic)"
+  "``TD_SPI_STALL_sum``", "Total number of cycles texture data unit is stalled by shader processor input"
+  "``TD_STORE_WAVEFRONT_sum``", "Total number of write wavefront instructions"
+  "``TD_TC_STALL_sum``", "Total number of cycles texture data unit is stalled waiting for texture cache data"
+  "``TD_TD_BUSY_sum``", "Total number of texture data unit busy cycles while it is processing or waiting for data"
--- a/docs/conceptual/gpu-arch/mi300.md
+++ b/docs/conceptual/gpu-arch/mi300.md
@@ -0,0 +1,122 @@
+# AMD Instinct™ MI300 series microarchitecture
+
+The AMD Instinct MI300 series accelerators are based on the AMD CDNA 3
+architecture which was designed to deliver leadership performance for HPC, artificial intelligence (AI), and machine
+learning (ML) workloads. The AMD Instinct MI300 series accelerators are well-suited for extreme scalability and compute performance, running
+on everything from individual servers to the world’s largest exascale supercomputers.
+
+With the MI300 series, AMD is introducing the Accelerator Complex Die (XCD), which contains the
+GPU computational elements of the processor along with the lower levels of the cache hierarchy.
+
+The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.
+
+```{figure} ../../data/conceptual/gpu-arch/image007.png
+---
+name: mi300-xcd
+align: center
+---
+XCD-level system architecture showing 40 Compute Units, each with a 32 KB L1 cache, a Unified Compute System with 4 ACE Compute Accelerators, a shared 4 MB L2 cache, and an HWS Hardware Scheduler.
+```
+
+On the XCD, four Asynchronous Compute Engines (ACEs) send compute shader workgroups to the
+Compute Units (CUs). The XCD has 40 CUs: 38 active CUs at the aggregate level and 2 disabled CUs for
+yield management. The CUs all share a 4 MB L2 cache that serves to coalesce all memory traffic for the
+die. With less than half of the CUs of the AMD Instinct MI200 Series compute die, the AMD CDNA™ 3
+XCD die is a smaller building block. However, it uses more advanced packaging and the processor
+can include 6 or 8 XCDs for up to 304 CUs, roughly 40% more than MI250X.
+
+The MI300 Series integrates up to 8 vertically stacked XCDs, 8 stacks of
+High-Bandwidth Memory 3 (HBM3) and 4 I/O dies (containing system
+infrastructure) using the AMD Infinity Fabric™ technology as interconnect.
+
+The Matrix Cores inside the CDNA 3 CUs have significant improvements, emphasizing AI and machine
+learning, enhancing throughput of existing data types while adding support for new data types.
+CDNA 2 Matrix Cores support FP16 and BF16, while offering INT8 for inference. Compared to MI250X
+accelerators, CDNA 3 Matrix Cores triple the performance for FP16 and BF16, while providing a
+performance gain of 6.8 times for INT8. FP8 has a performance gain of 16 times compared to FP32,
+while TF32 has a gain of 4 times compared to FP32.
+
+```{list-table} Peak-performance capabilities of the MI300X for different data types.
+:header-rows: 1
+:name: mi300x-perf-table
+
+*
+  - Computation and Data Type
+  - FLOPS/CLOCK/CU
+  - Peak TFLOPS
+*
+  - Matrix FP64
+  - 256
+  - 163.4
+*
+  - Vector FP64
+  - 128
+  - 81.7
+*
+  - Matrix FP32
+  - 256
+  - 163.4
+*
+  - Vector FP32
+  - 256
+  - 163.4
+*
+  - Matrix TF32
+  - 1024
+  - 653.7
+*
+  - Matrix FP16
+  - 2048
+  - 1307.4
+*
+  - Matrix BF16
+  - 2048
+  - 1307.4
+*
+  - Matrix FP8
+  - 4096
+  - 2614.9
+*
+  - Matrix INT8
+  - 4096
+  - 2614.9
+```
+
+The above table summarizes the aggregated peak performance of the AMD Instinct MI300X Open
+Compute Platform (OCP) Open Accelerator Modules (OAMs) for different data types and command
+processors. The middle column lists the peak performance (number of data elements processed in a
+single instruction) of a single compute unit if a SIMD (or matrix) instruction is submitted in each clock
+cycle. The third column lists the theoretical peak performance of the OAM. The theoretical aggregated
+peak memory bandwidth of the GPU is 5.3 TB per second.
+
+The following image shows the block diagram of the APU (left) and the OAM package (right) both
+connected via AMD Infinity Fabric™ network on-chip.
+
+```{figure} ../../data/conceptual/gpu-arch/image008.png
+---
+name: mi300-arch
+alt: MI300 series system architecture showing the MI300A APU and the MI300X OAM package
+align: center
+---
+MI300 series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, while the MI300X (right) has 8 XCDs.
+```
+
+## Node-level architecture
+
+```{figure} ../../data/conceptual/gpu-arch/image009.png
+---
+name: mi300-node
+
+align: center
+---
+MI300 series node-level architecture showing 8 fully interconnected MI300X OAM modules connected to (optional) PCIe switches via retimers and HGX connectors.
+```
+
+The image above shows the node-level architecture of a system with AMD EPYC processors in a
+dual-socket configuration and eight AMD Instinct MI300X accelerators. The MI300X OAMs attach to the
+host system via PCIe Gen 5 x16 links (yellow lines). The GPUs are using seven high-bandwidth,
+low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected 8-GPU system.
+
+<!---
+We need performance data about the P2P communication here.
+-->
--- a/docs/conceptual/gpu-isolation.md
+++ b/docs/conceptual/gpu-isolation.md
@@ -1,3 +1,10 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="GPU isolation techniques">
+  <meta name="keywords" content="GPU isolation techniques, UUID, universally unique identifier,
+  environment variables, virtual machines, AMD, ROCm">
+</head>
+
 # GPU isolation techniques

 Restricting the access of applications to a subset of GPUs, aka isolating
@@ -22,7 +29,7 @@ A list of device indices or {abbr}`UUID (universally unique identifier)`s
 that will be exposed to applications.

 Runtime
-: ROCm Platform Runtime. Applies to all applications using the user mode ROCm
+: ROCm Software Runtime. Applies to all applications using the user mode ROCm
  software stack.

 ```{code-block} shell
--- a/docs/conceptual/gpu-memory.md
+++ b/docs/conceptual/gpu-memory.md
@@ -1,9 +1,16 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="GPU memory">
+  <meta name="keywords" content="GPU memory, VRAM, video random access memory, pageable
+  memory, pinned memory, managed memory, AMD, ROCm">
+</head>
+
 # GPU memory

 For the HIP reference documentation, see:

-* {doc}`hip:.doxygen/docBin/html/group___memory`
-* {doc}`hip:.doxygen/docBin/html/group___memory_m`
+* {doc}`hip:doxygen/html/group___memory`
+* {doc}`hip:doxygen/html/group___memory_m`

 Host memory exists on the host (e.g. CPU) of the machine in random access memory (RAM).

@@ -170,8 +177,8 @@ Fine-grained memory implies that up-to-date data may be made visible to others r

 | API                     | Flag                         | Coherence      |
 |-------------------------|------------------------------|----------------|
-| `hipExtMallocWithFlags` | `hipHostMallocDefault`       | Fine-grained   |
-| `hipExtMallocWithFlags` | `hipDeviceMallocFinegrained` | Coarse-grained |
+| `hipExtMallocWithFlags` | `hipDeviceMallocDefault`     | Coarse-grained |
+| `hipExtMallocWithFlags` | `hipDeviceMallocFinegrained` | Fine-grained   |

 | API                     | `hipMemAdvise` argument      | Coherence      |
 |-------------------------|------------------------------|----------------|
--- a/docs/conceptual/using-gpu-sanitizer.md
+++ b/docs/conceptual/using-gpu-sanitizer.md
@@ -1,13 +1,22 @@
-# Using the LLVM ASan on a GPU (beta release)
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="Using the LLVM ASan on a GPU">
+  <meta name="keywords" content="LLVM, ASan, address sanitizer, AddressSanitizer, instrumented
+  libraries, instrumented applications, AMD, ROCm">
+</head>
+
+# Using the AddressSanitizer on a GPU (beta release)

 The LLVM AddressSanitizer (ASan) provides a process that allows developers to detect runtime addressing errors in applications and libraries. The detection is achieved using a combination of compiler-added instrumentation and runtime techniques, including function interception and replacement.

 Until now, the LLVM ASan process was only available for traditional purely CPU applications. However, ROCm has extended this mechanism to additionally allow the detection of some addressing errors on the GPU in heterogeneous applications. Ideally, developers should treat heterogeneous HIP and OpenMP applications exactly like pure CPU applications. However, this simplicity has not been achieved yet.
-
 This document provides documentation on using ROCm ASan.
+
 For information about LLVM ASan, see the [LLVM documentation](https://clang.llvm.org/docs/AddressSanitizer.html).

-**Note**: The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
+:::{note}
+The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
+:::

 ## Compiling for ASan

@@ -17,17 +26,28 @@ Recommendations for doing this are:

 * Compile as many application and dependent library sources as possible using an AMD-built clang-based compiler such as `amdclang++`.
 * Add the following options to the existing compiler and linker options:
+  
  * `-fsanitize=address` - enables instrumentation
-  * `-shared-libsan` - use shared version of runtime
-  * `-g` - add debug info for improved reporting
-* Explicitly use `xnack+` in the offload architecture option. For example, `--offload-arch=gfx90a:xnack+`
-Other architectures are allowed, but their device code will not be instrumented and a warning will be emitted.

+  * `-shared-libsan` - use shared version of runtime
+
+  * `-g` - add debug info for improved reporting
+
+* Explicitly use `xnack+` in the offload architecture option. For example, `--offload-arch=gfx90a:xnack+`
+
+Other architectures are allowed, but their device code will not be instrumented, and a warning will be issued.
+
+:::{tip}
 It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
+:::
+
+:::{note}
+When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
+:::

 ### About compilation time

-When `-fsanitize=address` is used, the LLVM compiler adds instrumentation code around every memory operation. This added code must be handled by all of the downstream components of the compiler toolchain and results in increased overall compilation time. This increase is especially evident in the AMDGPU device compiler and has in a few instances raised the compile time to an unacceptable level.
+When `-fsanitize=address` is used, the LLVM compiler adds instrumentation code around every memory operation. This added code must be handled by all downstream components of the compiler toolchain, and results in increased overall compilation time. This increase is especially evident in the AMDGPU device compiler and has in a few instances increased compile time to an unacceptable level.

 There are a few options if the compile time becomes unacceptable:

@@ -47,9 +67,9 @@ For a complete ROCm GPU Sanitizer installation, including packages, instrumented
 ## Using AMD-supplied ASan instrumented libraries

 ROCm releases have optional packages that contain additional ASan instrumented builds of the ROCm libraries (usually found in `/opt/rocm-<version>/lib`). The instrumented libraries have identical names to the regular uninstrumented libraries, and are located in `/opt/rocm-<version>/lib/asan`.
-These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with g++. The preexisting build options are used but, as described above, additional options are used: `-fsanitize=address`, `-shared-libsan` and `-g`.
+These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with `g++`. The preexisting build options are used but, as described above, additional options are used: `-fsanitize=address`, `-shared-libsan` and `-g`.

-These additional libraries avoid additional developer effort to locate repositories, identify the correct branch, check out the correct tags, and other efforts needed to build the libraries from the source. And they extend the ability of the process to detect addressing errors into the ROCm libraries themselves.
+These instrumented libraries avoid additional developer effort to locate repositories, identify the correct branch, check out the correct tags, and other efforts needed to build the libraries from the source. And they extend the ability of the process to detect addressing errors into the ROCm libraries themselves.

 When adjusting an application build to add instrumentation, linking against these instrumented libraries is unnecessary. For example, any `-L` `/opt/rocm-<version>/lib` compiler options need not be changed. However, the instrumented libraries should be used when the application is run. It is particularly important that the instrumented language runtimes, like `libamdhip64.so` and `librocm-core.so`, are used; otherwise, device invalid access detections may not be reported.

@@ -77,16 +97,25 @@ If it does not appear, when executed the application will quickly output an ASan

 * Ensure that the application `llvm-symbolizer` can be executed, and that it is located in `/opt/rocm-<version>/llvm/bin`. This executable is not strictly required, but if found is used to translate ("symbolize") a host-side instruction address into a more useful function name, file name, and line number (assuming the application has been built to include debug information).

-There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASAN runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. It must be noted that these options only affect the host ASAN runtime. The device runtime only currently supports the default settings for the few relevant options.
+There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASan runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. It must be noted that these options only affect the host ASan runtime. The device runtime only currently supports the default settings for the few relevant options.

-There are two `ASAN_OPTION` flags of particular note.
+There are three `ASAN_OPTIONS` flags of note.

 * `halt_on_error=0/1 default 1`.

-This tells the ASAN runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option and if an error is detected within one of them, but halt_on_error is set to 0, more undefined behavior will occur.
+  This tells the ASan runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option, so if an error is detected within one of them while `halt_on_error` is set to 0, more undefined behavior will occur.

 * `detect_leaks=0/1 default 1`.
-This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSAN). Unfortunately, for heterogeneous applications, this default will result in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSAN suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
+
+  This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSan). For heterogeneous applications, this default results in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSan suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
+
+* `quarantine_size_mb=N default 256`
+
+  This option defines the number of megabytes (MB) `N` of memory that the ASan runtime will hold after it is `freed` to detect use-after-free situations. This memory is unavailable for other purposes. The default of 256 MB may be too small to detect some use-after-free situations, especially given that the large size of many GPU memory allocations may push `freed` allocations out of quarantine before the attempted use.
+
+  :::{note}
+  Setting the value of `quarantine_size_mb` larger may enable more problematic uses to be detected, but at the cost of reducing memory available for other purposes.
+  :::

 ## Runtime overhead

@@ -101,11 +130,12 @@ before the address is actually accessed by a load, store, or atomic
 instruction.
 This checking involves an additional load to "shadow" memory which
 records whether the address is "poisoned" or not, and additional logic
-that decides whether to produce an detection report or not.
+that decides whether to produce a detection report or not.

 This extra runtime work can cause the application to slow down by
 a factor of three or more, depending on how many memory accesses are
 executed.
+
 For heterogeneous applications, the shadow memory must be accessible by all devices
 and this can mean that shadow accesses from some devices may be more costly
 than non-shadow accesses.
@@ -125,7 +155,7 @@ instrumentation.

 ## Runtime reporting

-It is not the intention of this document to provide a detailed explanation of all of the types of reports that can be output by the ASan runtime. Instead, the focus is on the differences between the standard reports for CPU issues, and reports for GPU issues.
+It is not the intention of this document to provide a detailed explanation of all types of reports that can be output by the ASan runtime. Instead, the focus is on the differences between the standard reports for CPU issues, and reports for GPU issues.

 An invalid address detection report for the CPU always starts with

@@ -172,7 +202,7 @@ or

 currently may include one or two surprising CPU side tracebacks mentioning :`hostcall`". This is due to how `malloc` and `free` are implemented for GPU code and these call stacks can be ignored.

-### Running with `rocgdb`
+## Running ASan with `rocgdb`

 `rocgdb` can be used to further investigate ASan detected errors, with some preparation.

@@ -189,7 +219,7 @@ This is solved by setting environment variable `LD_PRELOAD` to the path to the A
 amdclang++ -print-file-name=libclang_rt.asan-x86_64.so
 ```

-It is also recommended to set the environment variable `HIP_ENABLE_DEFERRED_LOADING=0` before debugging HIP applications.
+You should also set the environment variable `HIP_ENABLE_DEFERRED_LOADING=0` before debugging HIP applications.

 After starting `rocgdb` breakpoints can be set on the ASan runtime error reporting entry points of interest. For example, if an ASan error report includes

@@ -224,18 +254,180 @@ $ rocgdb <path to application>
 (gdb) c
 ```

-### Using ASan with a short HIP application
+## Using ASan with a short HIP application

-Refer to the following example to use ASan with a short HIP application,
+Consider the following simple and short demo of using the Address Sanitizer with a HIP application:

-https://github.com/Rmalavally/rocm-examples/blob/Rmalavally-patch-1/LLVM_ASAN/Using-Address-Sanitizer-with-a-Short-HIP-Application.md
+```C++

-### Known issues with using GPU sanitizer
+#include <cstdlib>
+#include <hip/hip_runtime.h>

-* Red zones must have limited size and it is possible for an invalid access to completely miss a red zone and not be detected.
+__global__ void
+set1(int *p)
+{
+    int i = blockDim.x*blockIdx.x + threadIdx.x;
+    p[i] = 1;
+}
+
+int
+main(int argc, char **argv)
+{
+    int m = std::atoi(argv[1]);
+    int n1 = std::atoi(argv[2]);
+    int n2 = std::atoi(argv[3]);
+    int c = std::atoi(argv[4]);
+    int *dp;
+    hipMalloc(&dp, m*sizeof(int));
+    hipLaunchKernelGGL(set1, dim3(n1), dim3(n2), 0, 0, dp);
+    int *hp = (int*)malloc(c * sizeof(int));
+    hipMemcpy(hp, dp, m*sizeof(int), hipMemcpyDeviceToHost);
+    hipDeviceSynchronize();
+    hipFree(dp);
+    free(hp);
+    std::puts("Done.");
+    return 0;
+}
+```
+
+This application will attempt to access invalid addresses for certain command line arguments. In particular, if `m < n1 * n2` some device threads will attempt to access
+unallocated device memory.
+
+Or, if `c < m`, the `hipMemcpy` function will copy past the end of the `malloc` allocated memory.
+
+**Note**: The `hipcc` compiler is used here for simplicity.
+
+Compiling without XNACK results in a warning.
+
+```bash
+$ hipcc -g --offload-arch=gfx90a:xnack- -fsanitize=address -shared-libsan mini.hip -o mini
+clang++: warning: ignoring '-fsanitize=address' option for offload arch 'gfx90a:xnack-', as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored]
+```
+
+The binary compiled above will run, but the GPU code will not be instrumented and the `m < n1 * n2` error will not be detected. Switching to `--offload-arch=gfx90a:xnack+` in the command above results in a warning-free compilation and an instrumented application. After setting `PATH`, `LD_LIBRARY_PATH` and `HSA_XNACK` as described earlier, a check of the binary with `ldd` yields the following,
+
+```bash
+$ ldd mini
+        linux-vdso.so.1 (0x00007ffd1a5ae000)
+        libclang_rt.asan-x86_64.so => /opt/rocm-6.1.0-99999/llvm/lib/clang/17.0.0/lib/linux/libclang_rt.asan-x86_64.so (0x00007fb9c14b6000)
+        libamdhip64.so.5 => /opt/rocm-6.1.0-99999/lib/asan/libamdhip64.so.5 (0x00007fb9bedd3000)
+        libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fb9beba8000)
+        libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fb9bea59000)
+        libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fb9bea3e000)
+        libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fb9be84a000)
+        libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fb9be844000)
+        libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fb9be821000)
+        librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fb9be817000)
+        libamd_comgr.so.2 => /opt/rocm-6.1.0-99999/lib/asan/libamd_comgr.so.2 (0x00007fb9b4382000)
+        libhsa-runtime64.so.1 => /opt/rocm-6.1.0-99999/lib/asan/libhsa-runtime64.so.1 (0x00007fb9b3b00000)
+        libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007fb9b3af3000)
+        /lib64/ld-linux-x86-64.so.2 (0x00007fb9c2027000)
+        libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fb9b3ad7000)
+        libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fb9b3aa7000)
+        libelf.so.1 => /lib/x86_64-linux-gnu/libelf.so.1 (0x00007fb9b3a89000)
+        libdrm.so.2 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm.so.2 (0x00007fb9b3a70000)
+        libdrm_amdgpu.so.1 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1 (0x00007fb9b3a62000)
+
+```
+
+This confirms that the address sanitizer runtime is linked in, and the ASan instrumented versions of the runtime libraries are used.
+Checking the `PATH` yields
+
+```bash
+$ which llvm-symbolizer
+/opt/rocm-6.1.0-99999/llvm/bin/llvm-symbolizer
+```
+
+Lastly, a check of the OS kernel version yields
+
+```bash
+$ uname -rv
+5.15.0-73-generic #80~20.04.1-Ubuntu SMP Wed May 17 14:58:14 UTC 2023
+```
+
+which indicates that the required HMM support (kernel version > 5.6) is available. This completes the necessary setup. Running with `m = 100`, `n1 = 11`, `n2 = 10` and `c = 100` should produce
+a report for an invalid access by the last 10 threads.
+
+```bash
+=================================================================
+==3141==ERROR: AddressSanitizer: heap-buffer-overflow on amdgpu device 0 at pc 0x7fb1410d2cc4
+WRITE of size 4 in workgroup id (10,0,0)
+  #0 0x7fb1410d2cc4 in set1(int*) at /home/dave/mini/mini.cpp:0:10
+
+Thread ids and accessed addresses:
+00 : 0x7fb14371d190 01 : 0x7fb14371d194 02 : 0x7fb14371d198 03 : 0x7fb14371d19c 04 : 0x7fb14371d1a0 05 : 0x7fb14371d1a4 06 : 0x7fb14371d1a8 07 : 0x7fb14371d1ac
+08 : 0x7fb14371d1b0 09 : 0x7fb14371d1b4
+
+0x7fb14371d190 is located 0 bytes after 400-byte region [0x7fb14371d000,0x7fb14371d190)
+allocated by thread T0 here:
+    #0 0x7fb151c76828 in hsa_amd_memory_pool_allocate /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:692:3
+    #1 ...
+
+    #12 0x7fb14fb99ec4 in hipMalloc /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:568:3
+    #13 0x226630 in hipError_t hipMalloc<int>(int**, unsigned long) /opt/rocm-6.1.0-99999/include/hip/hip_runtime_api.h:8367:12
+    #14 0x226630 in main /home/dave/mini/mini.cpp:19:5
+    #15 0x7fb14ef02082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
+
+Shadow bytes around the buggy address:
+  0x7fb14371cf00: ...
+
+=>0x7fb14371d180: 00 00[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa
+  0x7fb14371d200: ...
+
+Shadow byte legend (one shadow byte represents 8 application bytes):
+  Addressable:           00
+  Partially addressable: 01 02 03 04 05 06 07
+  Heap left redzone:       fa
+  ...
+==3141==ABORTING
+```
+
+Running with `m = 100`, `n1 = 10`, `n2 = 10` and `c = 99` should produce a report for an invalid copy.
+
+```shell
+=================================================================
+==2817==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x514000150dcc at pc 0x7f5509551aca bp 0x7ffc90a7ae50 sp 0x7ffc90a7a610
+WRITE of size 400 at 0x514000150dcc thread T0
+    #0 0x7f5509551ac9 in __asan_memcpy /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3
+    #1 ...
+
+    #9 0x7f5507462a28 in hipMemcpy_common(void*, void const*, unsigned long, hipMemcpyKind, ihipStream_t*) /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:637:10
+    #10 0x7f5507464205 in hipMemcpy /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:642:3
+    #11 0x226844 in main /home/dave/mini/mini.cpp:22:5
+    #12 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
+    #13 0x22605d in _start (/home/dave/mini/mini+0x22605d)
+
+0x514000150dcc is located 0 bytes after 396-byte region [0x514000150c40,0x514000150dcc)
+allocated by thread T0 here:
+    #0 0x7f5509553dcf in malloc /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
+    #1 0x226817 in main /home/dave/mini/mini.cpp:21:21
+    #2 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
+
+SUMMARY: AddressSanitizer: heap-buffer-overflow /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3 in __asan_memcpy
+Shadow bytes around the buggy address:
+  0x514000150b00: ...
+
+=>0x514000150d80: 00 00 00 00 00 00 00 00 00[04]fa fa fa fa fa fa
+  0x514000150e00: ...
+
+Shadow byte legend (one shadow byte represents 8 application bytes):
+  Addressable:           00
+  Partially addressable: 01 02 03 04 05 06 07
+  Heap left redzone:       fa
+  ...
+==2817==ABORTING
+```
+
+## Known issues with using GPU sanitizer
+
+* Red zones must have limited size. It is possible for an invalid access to completely miss a red zone and not be detected.

 * Lack of detection or false reports can be caused by the runtime not properly maintaining red zone shadows.

 * Lack of detection on the GPU might also be due to the implementation not instrumenting accesses to all GPU specific address spaces. For example, in the current implementation accesses to "private" or "stack" variables on the GPU are not instrumented, and accesses to HIP shared variables (also known as "local data store" or "LDS") are also not instrumented.

-* It can also be the case that a memory fault is hit for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside of any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
+* It can also be the case that a memory fault is reported for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
+
+* There is currently a bug which can result in memory faults being reported when running instrumented device code which makes use of `malloc`, `free`, `new`, or `delete`.
+
+* There is currently a bug which can result in undefined symbols being reported at compile time when instrumented device code makes use of `new` and `delete`.
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -8,15 +8,11 @@ import shutil
 import jinja2
 import os

-from rocm_docs import ROCmDocs
-
-# Environement to process Jinja templates.
+# Environment to process Jinja templates.
 jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))

 # Jinja templates to render out.
-templates = [
-
-]
+templates = []

 # Render templates and output files without the last extension.
 # For example: 'install.md.jinja' becomes 'install.md'.
@@ -25,7 +21,6 @@ for template in templates:
    with open(os.path.splitext(template)[0], 'w') as file:
        file.write(rendered)

-shutil.copy2('../CONTRIBUTING.md','./contribute/index.md')
 shutil.copy2('../RELEASE.md','./about/release-notes.md')
 # Keep capitalization due to similar linking on GitHub's markdown preview.
 shutil.copy2('../CHANGELOG.md','./about/CHANGELOG.md')
@@ -42,9 +37,9 @@ latex_elements = {
 # configurations for PDF output by Read the Docs
 project = "ROCm Documentation"
 author = "Advanced Micro Devices, Inc."
-copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
-version = "5.7.1"
-release = "5.7.1"
+copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
+version = "6.1.0"
+release = "6.1.0"
 setting_all_article_info = True
 all_article_info_os = ["linux", "windows"]
 all_article_info_author = ""
@@ -52,9 +47,14 @@ all_article_info_author = ""
 # pages with specific settings
 article_pages = [
    {
-        "file":"release",
+        "file":"about/release-notes",
        "os":["linux", "windows"],
-        "date":"2023-07-27"
+        "date":"2024-04-16"
+    },
+    {
+        "file":"about/CHANGELOG",
+        "os":["linux", "windows"],
+        "date":"2024-04-16"
    },

    {"file":"install/windows/install-quick", "os":["windows"]},
@@ -74,9 +74,6 @@ article_pages = [
    {"file":"install/windows/cli/index", "os":["windows"]},
    {"file":"install/windows/gui/index", "os":["windows"]},

-    {"file":"about/compatibility/linux-support", "os":["linux"]},
-    {"file":"about/compatibility/windows-support", "os":["windows"]},
-
    {"file":"about/compatibility/docker-image-support-matrix", "os":["linux"]},
    {"file":"about/compatibility/user-kernel-space-compat-matrix", "os":["linux"]},

@@ -87,21 +84,47 @@ article_pages = [
    {"file":"how-to/system-debugging", "os":["linux"]},
    {"file":"how-to/tuning-guides", "os":["linux", "windows"]},

-    {"file":"rocm-a-z", "os":["linux", "windows"]},
+    {"file":"how-to/rocm-for-ai/index", "os":["linux"]},
+    {"file":"how-to/rocm-for-ai/install", "os":["linux"]},
+    {"file":"how-to/rocm-for-ai/train-a-model", "os":["linux"]},
+    {"file":"how-to/rocm-for-ai/deploy-your-model", "os":["linux"]},
+    {"file":"how-to/rocm-for-ai/hugging-face-models", "os":["linux"]},

+    {"file":"how-to/rocm-for-hpc/index", "os":["linux"]},
+
+    {"file":"how-to/llm-fine-tuning-optimization/index", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/overview", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/fine-tuning-and-inference", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/multi-gpu-fine-tuning-and-inference", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/llm-inference-frameworks", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/model-acceleration-libraries", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/model-quantization", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/optimizing-with-composable-kernel", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/optimizing-triton-kernel", "os":["linux"]},
+    {"file":"how-to/llm-fine-tuning-optimization/profiling-and-debugging", "os":["linux"]},
 ]

 exclude_patterns = ['temp']

 external_toc_path = "./sphinx/_toc.yml"

-docs_core = ROCmDocs("ROCm Documentation")
-docs_core.setup()
+extensions = ["rocm_docs", "sphinx_reredirects"]

 external_projects_current_project = "rocm"

-for sphinx_var in ROCmDocs.SPHINX_VARS:
-    globals()[sphinx_var] = getattr(docs_core, sphinx_var)
+html_theme = "rocm_docs_theme"
+html_theme_options = {"flavor": "rocm-docs-home"}
+
+html_static_path = ["sphinx/static/css"]
+html_css_files = ["rocm_custom.css"]
+
+html_title = "ROCm Documentation"
+
 html_theme_options = {
    "link_main_doc": False
 }
+
+redirects = {
+     "reference/openmp/openmp": "../../about/compatibility/openmp.html"
+}
--- a/docs/contribute/building.md
+++ b/docs/contribute/building.md
@@ -1,3 +1,10 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="Building ROCm documentation">
+  <meta name="keywords" content="documentation, Visual Studio Code, GitHub, command line,
+  AMD, ROCm">
+</head>
+
 # Building documentation

 You can build our documentation via GitHub (in a pull request) or locally (using the command line or
@@ -24,11 +31,6 @@ Use the Python Virtual Environment (`venv`) and run the following commands from
 ```sh
 python3 -mvenv .venv

-# Windows
-.venv/Scripts/python -m pip install -r docs/sphinx/requirements.txt
-.venv/Scripts/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
-
-# Linux
 .venv/bin/python     -m pip install -r docs/sphinx/requirements.txt
 .venv/bin/python     -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
 ```
@@ -122,12 +124,12 @@ documentation locally using Visual Studio (VS) Code. Follow these steps to confi
      }
    ```

-    > (Implementation detail: two problem matchers were needed to be defined,
+    > Implementation detail: two problem matchers needed to be defined,
    > because VS Code doesn't tolerate some problem information being potentially
    > absent. While a single regex could match all types of errors, if a capture
    > group remains empty (the line number doesn't show up in all warning/error
    > messages) but the `pattern` references said empty capture group, VS Code
-    > discards the message completely.)
+    > discards the message completely.

 4. Configure the Python virtual environment (`venv`).

--- a/docs/contribute/contributing.md
+++ b/docs/contribute/contributing.md
@@ -0,0 +1,112 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="Contributing to ROCm">
+  <meta name="keywords" content="ROCm, contributing, contribute, maintainer, contributor">
+</head>
+
+# Contribute to ROCm documentation
+
+All ROCm projects are GitHub-based, so if you want to contribute, you can do so by:
+
+* [Submitting a pull request in the appropriate GitHub repository](#submit-a-pull-request)
+* [Creating an issue in the appropriate GitHub repository](#create-an-issue)
+* [Suggesting a new feature](#suggest-a-new-feature)
+
+```{important}
+By creating a pull request (PR), you agree to allow your contribution to be licensed under the terms of the
+LICENSE.txt file in the corresponding repository. Different repositories may use different licenses.
+```
+
+## Submit a pull request
+
+To make edits to our documentation via PR, follow these steps:
+
+1. Identify the repository and the file you want to update. For example, to update this page, you would
+  need to modify content located in this file:
+  `https://github.com/ROCm/ROCm/blob/develop/docs/contribute/contributing.md`
+
+2. (optional, but recommended) Fork the repository.
+
+3. Clone the repository locally and (optionally) add your fork. Select the green 'Code' button and copy
+   the URL (e.g., `git@github.com:ROCm/ROCm.git`).
+
+   * From your terminal, run:
+
+      ```bash
+      git clone git@github.com:ROCm/ROCm.git
+      ```
+
+   * Optionally add your fork to this local copy of the repository by running:
+
+      ```bash
+      git remote add <name-of-my-fork> <git@github.com:my-username/ROCm.git>
+      ```
+
+      To get the URL of your fork, go to your GitHub profile, select the fork and click the green 'Code'
+      button (the same process you followed to get the main GitHub repository URL).
+
+4. Change directory into your local copy of the repository, and run ``git pull`` (or ``git pull origin develop``) to ensure your local copy has the most recent content.
+
+5. Create and check out a new branch using the following command:
+
+    ```bash
+    git checkout -b <branch_name>
+    ```
+
+6. Change directory into the `./docs` folder and make any documentation changes locally using your preferred code editor. Follow the guidelines listed on the
+   [documentation structure](./doc-structure.md) page.
+
+7. Optionally run a local test build of the documentation to ensure the content builds and looks as expected. In your terminal, run the following commands from within the `./docs` folder of your cloned repository:
+
+     ```bash
+     pip3 install -r sphinx/requirements.txt  # You only need to run this command once
+     python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+     ```
+
+    The build output files are located in the `docs/_build` folder. To preview your build, open the index file
+    (`docs/_build/html/index.html`). For more information, see [Building documentation](building.md). To learn
+    more about our build tools, see [Documentation toolchain](toolchain.md).
+
+8. Commit your changes and push them to GitHub by running:
+
+    ```bash
+    git add <path-to-my-modified-file> # To add all modified files, you can use: git add .
+    git commit -m "my-updates"
+    git push <name-of-my-fork>
+    ```
+
+    After pushing, you will get a GitHub link in the terminal output. Copy this link and paste it into a
+    browser to create your PR.
+
+## Create an issue
+
+1. To create a new GitHub issue, select the 'Issues' tab in the appropriate repository
+  (e.g., https://github.com/ROCm/ROCm/issues).
+2. Use the search bar to make sure the issue doesn't already exist.
+3. If your issue is not already listed, select the green 'New issue' button to the right of the page. Select
+  the type of issue and fill in the resulting template.
+
+### General issue guidelines
+
+* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and
+  comment or post to provide additional details, such as how you reproduced this issue.
+* If you're not sure if your issue is the same, err on the side of caution and file your issue.
+  You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
+  your issue as being the same as the existing issue, we'll close the duplicate.
+* If your issue doesn't exist, use the issue template to file a new issue.
+  * When filing an issue, be sure to provide as much information as possible, including script output so
+    we can collect information about your configuration. This helps reduce the time required to
+    reproduce your issue.
+  * Check your issue regularly, as we may require additional information to successfully reproduce the
+    issue.
+
+## Suggest a new feature
+
+Use the [GitHub Discussion forum](https://github.com/ROCm/ROCm/discussions)
+(Ideas category) to propose new features. Our maintainers are happy to provide direction and
+feedback on feature development.
+
+## Future development workflow
+
+The current ROCm development workflow is GitHub-based. If, in the future, we change this platform,
+the tools and links may change. In this instance, we will update contribution guidelines accordingly.
--- a/docs/contribute/doc-structure.md
+++ b/docs/contribute/doc-structure.md
@@ -0,0 +1,219 @@
+# Documentation structure
+
+Our documentation follows the Pitchfork folder structure. Most documentation files are stored in the
+`/docs` folder. Some special files (such as release, contributing, and changelog) are stored in the root
+(`/`) folder.
+
+All images are stored in the `/docs/data` folder. An image's file path mirrors that of the documentation
+file where it is used.
+
+Our naming structure uses kebab case; for example, `my-file-name.rst`.
+
+## Supported formats and syntax
+
+Our documentation includes both Markdown and RST files. We are gradually transitioning existing
+Markdown to RST in order to more effectively meet our documentation needs. When contributing,
+RST is preferred; if you must use Markdown, use GitHub-flavored Markdown.
+
+We use [Sphinx Design](https://sphinx-design.readthedocs.io/en/latest/index.html) syntax and compile
+our API references using [Doxygen](https://www.doxygen.nl/).
+
+The following table shows some common documentation components and the syntax convention we
+use for each:
+
+<table>
+<tr>
+<th>Component</th>
+<th>RST syntax</th>
+</tr>
+<tr>
+<td>Code blocks</td>
+<td>
+
+```rst
+
+.. code-block:: language-name
+
+  My code block.
+
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Cross-referencing internal files</td>
+<td>
+
+```rst
+
+:doc:`Title <../path/to/file/filename>`
+
+```
+
+</td>
+</tr>
+<tr>
+<td>External links</td>
+<td>
+
+```rst
+
+`link name  <URL>`_
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Headings</td>
+<td>
+
+```rst
+
+******************
+Chapter title (H1)
+******************
+
+Section title (H2)
+==================
+
+Subsection title (H3)
+---------------------
+
+Sub-subsection title (H4)
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Images</td>
+<td>
+
+```rst
+
+.. image:: image1.png
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Internal links</td>
+<td>
+
+```rst
+
+1. Add a tag to the section you want to reference:
+
+.. _section-1:
+
+Section 1
+==========
+
+2. Link to your tag:
+
+As shown in :ref:`section-1`.
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Lists</td>
+<td>
+
+```rst
+
+#. Ordered (numbered) list item
+
+* Unordered (bulleted) list item
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Math (block)</td>
+<td>
+
+```rst
+
+.. math::
+
+  A = \begin{pmatrix}
+          0.0 & 1.0 & 1.0 & 3.0 \\
+          4.0 & 5.0 & 6.0 & 7.0 \\
+        \end{pmatrix}
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Math (inline)</td>
+<td>
+
+```rst
+
+:math:`2 \times 2`
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Notes</td>
+<td>
+
+```rst
+
+.. note::
+
+  My note here.
+
+```
+
+</td>
+</tr>
+<tr>
+<td>Tables</td>
+<td>
+
+```rst
+
+.. csv-table::  Optional title here
+  :widths: 30, 70  #optional column widths
+  :header: "entry1 header", "entry2 header"
+
+   "entry1", "entry2"
+
+```
+
+</td>
+</tr>
+</table>
+
+## Language and style
+
+We use the
+[Google developer documentation style guide](https://developers.google.com/style/highlights) to
+guide our content.
+
+Font size and type, page layout, white space control, and other formatting
+details are controlled via
+[rocm-docs-core](https://github.com/ROCm/rocm-docs-core). If you want to notify us
+of any formatting issues, create a pull request in our
+[rocm-docs-core](https://github.com/ROCm/rocm-docs-core) GitHub repository.
+
+## Building our documentation
+
+<!--  % TODO: Fix the link so that it works from every file  -->
+To learn how to build our documentation, refer to
+[Building documentation](./building.md).
--- a/docs/contribute/feedback.md
+++ b/docs/contribute/feedback.md
@@ -1,27 +1,31 @@
-# How to provide feedback for ROCm documentation
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="Providing feedback for ROCm documentation">
+  <meta name="keywords" content="documentation, pull request, GitHub, AMD, ROCm">
+</head>

-There are four standard ways to provide feedback for this repository.
+# Providing feedback
+
+There are three standard ways to provide feedback on this repository.

 ## Pull request

 All contributions to ROCm documentation should arrive via the
 [GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)
-targeting the develop branch of the repository. If you are unable to contribute
-via the GitHub Flow, feel free to email us at [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
+targeting the develop branch of the repository.
+
+For more in-depth information on creating a pull request (PR), see
+[Contributing](./contributing.md).

 ## GitHub discussions

 To ask questions or view answers to frequently asked questions, refer to
-[GitHub Discussions](https://github.com/RadeonOpenCompute/ROCm/discussions).
+[GitHub Discussions](https://github.com/ROCm/ROCm/discussions).
 On GitHub Discussions, in addition to asking and answering questions,
 members can share updates, have open-ended conversations,
 and follow along via public announcements.

 ## GitHub issue

-Issues on existing or absent docs can be filed as
-[GitHub Issues](https://github.com/RadeonOpenCompute/ROCm/issues).
-
-## Email
-
-Send other feedback or questions to [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
+Issues on existing or absent documentation can be filed in
+[GitHub Issues](https://github.com/ROCm/ROCm/issues).
--- a/docs/contribute/toolchain.md
+++ b/docs/contribute/toolchain.md
@@ -1,71 +1,65 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="ROCm documentation toolchain">
+  <meta name="keywords" content="documentation, toolchain, Sphinx, Doxygen, MyST, AMD, ROCm">
+</head>
+
 # ROCm documentation toolchain

 Our documentation relies on several open source toolchains and sites.

 ## `rocm-docs-core`

-[rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) is an AMD-maintained
-project that applies customization for our documentation. This
-project is the tool most ROCm repositories use as part of the documentation
-build. It is also available as a [pip package on PyPI](https://pypi.org/project/rocm-docs-core/).
+[rocm-docs-core](https://github.com/ROCm/rocm-docs-core) is an AMD-maintained
+project that applies customization for our documentation. This project is the tool most ROCm
+repositories use as part of the documentation build. It is also available as a
+[pip package on PyPI](https://pypi.org/project/rocm-docs-core/).

-See the user and developer guides for rocm-docs-core at {doc}`rocm-docs-core documentation<rocm-docs-core:index>`.
+See the user and developer guides for rocm-docs-core at
+{doc}`rocm-docs-core documentation<rocm-docs-core:index>`.

 ## Sphinx

-[Sphinx](https://www.sphinx-doc.org/en/master/) is a documentation generator
-originally used for Python. It is now widely used in the open source community.
-Originally, Sphinx supported reStructuredText (RST) based documentation, but
-Markdown support is now available.
-ROCm documentation plans to default to Markdown for new projects.
-Existing projects using RST are under no obligation to convert to Markdown. New
-projects that believe Markdown is not suitable should contact the documentation
-team prior to selecting RST.
-
-## Read the Docs
-
-[Read the Docs](https://docs.readthedocs.io/en/stable/) is the service that builds
-and hosts the HTML documentation generated using Sphinx to our end users.
-
-## Doxygen
-
-[Doxygen](https://www.doxygen.nl/) is a documentation generator that extracts
-information from inline code.
-ROCm projects typically use Doxygen for public API documentation unless the
-upstream project uses a different tool.
-
-### Breathe
-
-[Breathe](https://www.breathe-doc.org/) is a Sphinx plugin to integrate Doxygen
-content.
-
-### MyST
-
-[Markedly Structured Text (MyST)](https://myst-tools.org/docs/spec) is an extended
-flavor of Markdown ([CommonMark](https://commonmark.org/)) influenced by reStructuredText (RST) and Sphinx.
-It is integrated into ROCm documentation by the Sphinx extension [`myst-parser`](https://myst-parser.readthedocs.io/en/latest/).
-A cheat sheet that showcases how to use the MyST syntax is available over at
-the [Jupyter reference](https://jupyterbook.org/en/stable/reference/cheatsheet.html).
+[Sphinx](https://www.sphinx-doc.org/en/master/) is a documentation generator originally used for
+Python. It is now widely used in the open source community.

 ### Sphinx External ToC

-[Sphinx External ToC](https://sphinx-external-toc.readthedocs.io/en/latest/intro.html)
-is a Sphinx extension used for ROCm documentation navigation. This tool generates a navigation menu on the left
-based on a YAML file that specifies the table of contents.
-It was selected due to its flexibility that allows scripts to operate on the
-YAML file. Please transition to this file for the project's navigation. You can
-see the `_toc.yml.in` file in this repository in the `docs/sphinx` folder for an
-example.
+[Sphinx External ToC](https://sphinx-external-toc.readthedocs.io/en/latest/intro.html) is a Sphinx
+extension used for ROCm documentation navigation. This tool generates a navigation menu on the left
+based on a YAML file (`_toc.yml.in`) that contains the table of contents.

 ### Sphinx-book-theme

-[Sphinx-book-theme](https://sphinx-book-theme.readthedocs.io/en/latest/) is a Sphinx theme
-that defines the base appearance for ROCm documentation.
-ROCm documentation applies some customization,
-such as a custom header and footer on top of the Sphinx Book Theme.
+[Sphinx-book-theme](https://sphinx-book-theme.readthedocs.io/en/latest/) is a Sphinx theme that
+defines the base appearance for ROCm documentation. ROCm documentation applies some
+customization, such as a custom header and footer on top of the Sphinx Book Theme.

-### Sphinx design
+### Sphinx Design

-[Sphinx design](https://sphinx-design.readthedocs.io/en/latest/index.html) is a Sphinx extension that adds design
-functionality.
-ROCm documentation uses Sphinx Design for grids, cards, and synchronized tabs.
+[Sphinx Design](https://sphinx-design.readthedocs.io/en/latest/index.html) is a Sphinx extension that
+adds design functionality. ROCm documentation uses Sphinx Design for grids, cards, and synchronized
+tabs.
+
+## Doxygen
+
+[Doxygen](https://www.doxygen.nl/) is a documentation generator that extracts information from inline
+code. ROCm projects typically use Doxygen for public API documentation (unless the upstream project
+uses a different tool).
+
+## Breathe
+
+[Breathe](https://www.breathe-doc.org/) is a Sphinx plugin to integrate Doxygen content.
+
+## MyST
+
+[Markedly Structured Text (MyST)](https://myst-tools.org/docs/spec) is an extended flavor of
+Markdown ([CommonMark](https://commonmark.org/)) influenced by reStructuredText (RST) and
+Sphinx. It's integrated into ROCm documentation by the Sphinx extension
+[`myst-parser`](https://myst-parser.readthedocs.io/en/latest/).
+A MyST syntax cheat sheet is available on the [Jupyter reference](https://jupyterbook.org/en/stable/reference/cheatsheet.html) site.
+
+## Read the Docs
+
+[Read the Docs](https://docs.readthedocs.io/en/stable/) is the service that builds and hosts the HTML
+documentation generated using Sphinx for our end users.
--- a/docs/data/about/compatibility/floating-point-data-types.png
+++ b/docs/data/about/compatibility/floating-point-data-types.png
--- a/docs/data/banner-compatibility.jpg
+++ b/docs/data/banner-compatibility.jpg
--- a/docs/data/banner-conceptual.jpg
+++ b/docs/data/banner-conceptual.jpg
--- a/docs/data/banner-howto.jpg
+++ b/docs/data/banner-howto.jpg
--- a/docs/data/banner-installation.jpg
+++ b/docs/data/banner-installation.jpg
--- a/docs/data/banner-reference.jpg
+++ b/docs/data/banner-reference.jpg
--- a/docs/data/banner-text.xcf
+++ b/docs/data/banner-text.xcf
--- a/docs/data/banner.png
+++ b/docs/data/banner.png
--- a/docs/data/conceptual/gpu-arch/image007.png
+++ b/docs/data/conceptual/gpu-arch/image007.png
--- a/docs/data/conceptual/gpu-arch/image008.png
+++ b/docs/data/conceptual/gpu-arch/image008.png
--- a/docs/data/conceptual/gpu-arch/image009.png
+++ b/docs/data/conceptual/gpu-arch/image009.png
--- a/docs/data/contribute/clone-repo.png
+++ b/docs/data/contribute/clone-repo.png
--- a/docs/data/contribute/fork-repo.png
+++ b/docs/data/contribute/fork-repo.png
--- a/docs/data/how-to/framework_install_2024_05_23.png
+++ b/docs/data/how-to/framework_install_2024_05_23.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/attention-module.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/attention-module.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/ck-comparisons.jpg
+++ b/docs/data/how-to/llm-fine-tuning-optimization/ck-comparisons.jpg
--- a/docs/data/how-to/llm-fine-tuning-optimization/ck-compilation.jpg
+++ b/docs/data/how-to/llm-fine-tuning-optimization/ck-compilation.jpg
--- a/docs/data/how-to/llm-fine-tuning-optimization/ck-inference_flow.jpg
+++ b/docs/data/how-to/llm-fine-tuning-optimization/ck-inference_flow.jpg
--- a/docs/data/how-to/llm-fine-tuning-optimization/ck-kernel_launch.jpg
+++ b/docs/data/how-to/llm-fine-tuning-optimization/ck-kernel_launch.jpg
--- a/docs/data/how-to/llm-fine-tuning-optimization/ck-operation_flow.jpg
+++ b/docs/data/how-to/llm-fine-tuning-optimization/ck-operation_flow.jpg
--- a/docs/data/how-to/llm-fine-tuning-optimization/ck-root_instance.jpg
+++ b/docs/data/how-to/llm-fine-tuning-optimization/ck-root_instance.jpg
--- a/docs/data/how-to/llm-fine-tuning-optimization/ck-template_parameters.jpg
+++ b/docs/data/how-to/llm-fine-tuning-optimization/ck-template_parameters.jpg
--- a/docs/data/how-to/llm-fine-tuning-optimization/compute-unit.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/compute-unit.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/occupancy-vgpr.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/occupancy-vgpr.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/omniperf-analysis.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/omniperf-analysis.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/omnitrace-timeline.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/omnitrace-timeline.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/perfetto-trace.svg
+++ b/docs/data/how-to/llm-fine-tuning-optimization/perfetto-trace.svg
--- a/docs/data/how-to/llm-fine-tuning-optimization/profiling-perfetto-ui.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/profiling-perfetto-ui.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/tunableop.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/tunableop.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/vllm-single-gpu-log.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/vllm-single-gpu-log.png
--- a/docs/data/how-to/llm-fine-tuning-optimization/weight-update.png
+++ b/docs/data/how-to/llm-fine-tuning-optimization/weight-update.png
--- a/docs/data/install/linux/linux001.png
+++ b/docs/data/install/linux/linux001.png
--- a/docs/data/install/linux/linux002.png
+++ b/docs/data/install/linux/linux002.png
--- a/docs/data/install/linux/linux003.png
+++ b/docs/data/install/linux/linux003.png
--- a/docs/data/install/linux/linux004.png
+++ b/docs/data/install/linux/linux004.png
--- a/docs/data/install/magma-install/magma005.png
+++ b/docs/data/install/magma-install/magma005.png
--- a/docs/data/install/magma-install/magma006.png
+++ b/docs/data/install/magma-install/magma006.png
--- a/docs/data/install/windows/000-settings-dark.png
+++ b/docs/data/install/windows/000-settings-dark.png
--- a/docs/data/install/windows/000-settings-light.png
+++ b/docs/data/install/windows/000-settings-light.png
--- a/docs/data/install/windows/000-setup-icon.png
+++ b/docs/data/install/windows/000-setup-icon.png
--- a/docs/data/install/windows/001-about-dark.png
+++ b/docs/data/install/windows/001-about-dark.png
--- a/docs/data/install/windows/001-about-light.png
+++ b/docs/data/install/windows/001-about-light.png
--- a/docs/data/install/windows/001-uac-dark.png
+++ b/docs/data/install/windows/001-uac-dark.png
--- a/docs/data/install/windows/001-uac-light.png
+++ b/docs/data/install/windows/001-uac-light.png
--- a/docs/data/install/windows/002-initializing.png
+++ b/docs/data/install/windows/002-initializing.png
--- a/docs/data/install/windows/003-detecting-system-config.png
+++ b/docs/data/install/windows/003-detecting-system-config.png
--- a/docs/data/install/windows/004-installer-window.png
+++ b/docs/data/install/windows/004-installer-window.png
--- a/docs/data/install/windows/012-install-progress.png
+++ b/docs/data/install/windows/012-install-progress.png
--- a/Show More
+++ b/Show More