Compare commits


267 Commits

Author SHA1 Message Date
Sam Wu
f9da490d91 Update documentation requirements 2024-09-16 10:13:05 -08:00
Jeffrey Novotny
dcbc392be6 Merge pull request #3570 from amd-jnovotny/peak-tflops-typo-docs600
Fix typo for TFLOPs metric in MI250 architecture page: cherry pick to docs/6.0.0
2024-08-12 13:14:51 -04:00
Jeffrey Novotny
1645bc8c34 Fix typo for TFLOPs metric in MI250 architecture page 2024-08-12 10:28:15 -04:00
randyh62
67727ccc04 remove nvcc (#3313) (#3324)
* remove nvcc

* Update CHANGELOG to match 6.0.0 template

---------

Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com>
2024-06-18 14:15:35 -07:00
randyh62
f6658eabde update quarantine (#3284) (#3292)
update with latest content
2024-06-12 13:00:01 -07:00
Sam Wu
f6bc5eb332 Update documentation requirements 2024-06-06 16:58:53 -06:00
Sam Wu
dafff84ad7 Update documentation requirements 2024-05-02 09:23:30 -06:00
Sam Wu
3be264865d docs(versions.md): Add 5.6.1 to versions list (#2824) 2024-01-22 15:17:13 -07:00
Sam Wu
77acc99361 Merge doc content fixes from roc-6.0.x into docs/6.0.0 (#2813)
* Mi300 info update (#2780)

* docs(gpu-enabled-mpi.rst): Fix links to 3rd party support matrices (#2775)

* docs(gpu-enabled-mpi.rst): Fix links to 3rd party support matrices

* docs: Directly link for RST instead of using intersphinx

* Add hipBLASLt in manifest. (#2776)

* Merge develop into roc-6.0.x (#2810)

* Create issue_retrieval.yml

I am tasked with adding a GitHub action to process incoming GitHub issues. The AMD GitHub admin team asked me to try out one of their runners; to do so, I need to load in a workflow file.

* changed group to ROCM-Ubuntu

* Added a field to specify project number

This action receives an org name and project number, and uses this information to add issues to the project.

* Update issue_retrieval.yml

* Update issue_retrieval.yml

* Generate release notes for 6.0.1 from autotag script (#2790)

* Update CONTRIBUTING.md (#2791)

* Update CONTRIBUTING.md

* Fixed link to licensing document

Also, changed to use relative links for internal files.

* Revert "Update CONTRIBUTING.md" (#2795)

* Text change to direct PRs into default branch, since not all repos have develop branch

* add keywords (#2799)

* Update issue_retrieval.yml

* ci(default.xml): Add hipBLASLt to manifest (#2796)

* Deleting issue_report.yml in favor of a global issue template placed in ROCm/.github (#2803)

* Delete .github/ISSUE_TEMPLATE/issue_report.yml

* Delete .github/ISSUE_TEMPLATE/config.yml

* Delete .github/ISSUE_TEMPLATE directory (#2805)

* docs(conf.py): Update article info for release page (#2806)

* docs(conf.py): Update article info for release page

* Update conf.py

* Fix typo (#2809)

---------

Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>
Co-authored-by: David Galiffi <dgaliffi@amd.com>
Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Young Hui <young.hui@amd.com>
Co-authored-by: yhuiYH <145490163+yhuiYH@users.noreply.github.com>

* Include latest doc fixes from roc-6.0.x

Remove 6.0.1 content

---------

Co-authored-by: Istvan Kiss <neon60@gmail.com>
Co-authored-by: zhang2amd <38740228+zhang2amd@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>
Co-authored-by: David Galiffi <dgaliffi@amd.com>
Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Young Hui <young.hui@amd.com>
Co-authored-by: yhuiYH <145490163+yhuiYH@users.noreply.github.com>
2024-01-16 11:07:59 -07:00
Sam Wu
d531936276 Merge roc-6.0.x into docs/6.0.0 (#2784)
* Mi300 info update (#2780)

* docs(gpu-enabled-mpi.rst): Fix links to 3rd party support matrices (#2775)

* docs(gpu-enabled-mpi.rst): Fix links to 3rd party support matrices

* docs: Directly link for RST instead of using intersphinx

---------

Co-authored-by: Istvan Kiss <neon60@gmail.com>
2024-01-09 09:21:24 -07:00
Sam Wu
4808c615e6 Merge branch 'develop' into docs/6.0.0 2023-12-20 15:53:12 -07:00
Lisa
f94a8620eb Update CHANGELOG.md (#2762) 2023-12-20 13:40:35 -07:00
Lisa
5f9842db8f link fixes & consistency (#2761) 2023-12-20 12:42:15 -07:00
dependabot[bot]
6fae95aa02 Bump rocm-docs-core from 0.30.2 to 0.30.3 in /docs/sphinx (#2759)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.30.2 to 0.30.3.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.30.2...v0.30.3)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-12-19 17:13:46 -07:00
Sam Wu
b865ae7796 Merge branch 'roc-6.0.x' into docs/6.0.0 2023-12-19 15:56:57 -07:00
Sam Wu
74a5c1b580 Merge branch 'develop' into roc-6.0.x 2023-12-19 15:56:02 -07:00
Sam Wu
538a44f4d7 docs: Update GPU and OS support for Linux page (#2757) 2023-12-19 15:53:52 -07:00
Sam Wu
6c90336e67 Merge docs/6.0.0 into develop (#2756)
* Marking TransferBench as beta (#2727)

* Known issues (#2731) (#2732)

* rearranging

* edits

* update toc

* link update

* line break

* updates

* Update RELEASE.md

* edits

* Update conf.py

* file cleanup

* Update RELEASE.md

* Update conf.py

* addition

* verbiage

* Update CHANGELOG.md

* edits

* edits

* updates

* edits

* more edits

* Update RELEASE.md

Limited OS to start in 6.0

* Update RELEASE.md

* Update RELEASE.md

Table to reflect support.

* Update RELEASE.md

tweaked language

* Update RELEASE.md

Tweaking language

* edits

* edits

* link

* spelling

* add link

* new section

* Add files via upload (#2701)

* updates

---------

Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>

* docs(library-index.md): Add MIVisionX to library index (#2736)

* Delete docs/about/compatibility/linux-support.md (#2734)

* Delete docs/about/compatibility/linux-support.md

* Update _toc.yml.in

* Update _toc.yml.in

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>

* Corrected OS version (#2738)

* Corrected OS version 

Ubuntu 22.04.5 does not exist.
It's 22.04.3 that has been tested and is supported.

* Update CHANGELOG.md

* Update _toc.yml.in (#2750)

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>
Co-authored-by: pramenku <7664080+pramenku@users.noreply.github.com>
2023-12-19 15:43:04 -07:00
Sam Wu
859f3763c8 Merge branch 'develop' into docs/6.0.0 2023-12-19 15:41:06 -07:00
abhimeda
7f4922d2b2 Abhimeda updating issue template (#2749)
* added ROCm v6, MI300, and set a default component

* Delete .github/ISSUE_TEMPLATE/0_issue_report.yml
2023-12-18 15:06:35 -07:00
Lisa
c8c4b5a034 Update _toc.yml.in (#2750) 2023-12-18 12:27:06 -07:00
Mátyás Aradi
3e1a87a4f1 Remove virtualenv build from dependencies (#2699)
* Remove virtualenv build from dependencies

* Rename ROCM_BUILD_DOCS to BUILD_DOCS
2023-12-18 07:03:55 -07:00
pramenku
3522084990 Corrected OS version (#2738)
* Corrected OS version 

Ubuntu 22.04.5 does not exist.
It's 22.04.3 that has been tested and is supported.

* Update CHANGELOG.md
2023-12-18 07:03:24 -07:00
yhuiYH
eeb96ebb18 Move documentation contributing.md and add Governance.md and Contributing.md (#2690)
* moved contributing.md to new location as it describes contributing to documentation

* Adding Governance.md and high-level Contributing.md

* fix linting errors (asterisk, whitespace and unused links)

* More linting fixes

* merge conflicts

* verbiage

* License link moved out of codeblock, and text fix there. Changed to full name of AMD. Update links to ROCm Org path

* whitespace linting fix

* Reverted back to "ROCm is led and managed by AMD." Flows better to me.

---------

Co-authored-by: Lisa Delaney <lisa.delaney@amd.com>
2023-12-15 16:14:13 -07:00
Saad Rahim (AMD)
1c420b4b5c Delete docs/about/compatibility/linux-support.md (#2734)
* Delete docs/about/compatibility/linux-support.md

* Update _toc.yml.in

* Update _toc.yml.in

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-12-15 16:09:50 -07:00
Sam Wu
914befefcb docs(library-index.md): Add MIVisionX to library index (#2736) 2023-12-15 15:59:36 -07:00
Sam Wu
6099778813 Merge branch 'develop' into roc-6.0.x 2023-12-15 15:50:14 -07:00
Sam Wu
8a8504246a docs(library-index.md): Add MIVisionX to library index (#2735)
* Add files via upload (#2701)

* Merge Roc 6.0.x into develop (#2733)

* Marking TransferBench as beta (#2727)

* Known issues (#2731)

* rearranging

* edits

* update toc

* link update

* line break

* updates

* Update RELEASE.md

* edits

* Update conf.py

* file cleanup

* Update RELEASE.md

* Update conf.py

* addition

* verbiage

* Update CHANGELOG.md

* edits

* edits

* updates

* edits

* more edits

* Update RELEASE.md

Limited OS to start in 6.0

* Update RELEASE.md

* Update RELEASE.md

Table to reflect support.

* Update RELEASE.md

tweaked language

* Update RELEASE.md

Tweaking language

* edits

* edits

* link

* spelling

* add link

* new section

* Add files via upload (#2701)

* updates

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>

* docs(library-index.md): Add MIVisionX to library index

---------

Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>
Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
2023-12-15 15:47:15 -07:00
Sam Wu
82d871c907 Merge Roc 6.0.x into develop (#2733)
* Marking TransferBench as beta (#2727)

* Known issues (#2731)

* rearranging

* edits

* update toc

* link update

* line break

* updates

* Update RELEASE.md

* edits

* Update conf.py

* file cleanup

* Update RELEASE.md

* Update conf.py

* addition

* verbiage

* Update CHANGELOG.md

* edits

* edits

* updates

* edits

* more edits

* Update RELEASE.md

Limited OS to start in 6.0

* Update RELEASE.md

* Update RELEASE.md

Table to reflect support.

* Update RELEASE.md

tweaked language

* Update RELEASE.md

Tweaking language

* edits

* edits

* link

* spelling

* add link

* new section

* Add files via upload (#2701)

* updates

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>
2023-12-15 15:06:03 -07:00
Sam Wu
a9099dd36e Known issues (#2731) (#2732)
* rearranging

* edits

* update toc

* link update

* line break

* updates

* Update RELEASE.md

* edits

* Update conf.py

* file cleanup

* Update RELEASE.md

* Update conf.py

* addition

* verbiage

* Update CHANGELOG.md

* edits

* edits

* updates

* edits

* more edits

* Update RELEASE.md

Limited OS to start in 6.0

* Update RELEASE.md

* Update RELEASE.md

Table to reflect support.

* Update RELEASE.md

tweaked language

* Update RELEASE.md

Tweaking language

* edits

* edits

* link

* spelling

* add link

* new section

* Add files via upload (#2701)

* updates

---------

Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>
2023-12-15 15:05:35 -07:00
Lisa
6ba05d8ab0 Known issues (#2731)
* rearranging

* edits

* update toc

* link update

* line break

* updates

* Update RELEASE.md

* edits

* Update conf.py

* file cleanup

* Update RELEASE.md

* Update conf.py

* addition

* verbiage

* Update CHANGELOG.md

* edits

* edits

* updates

* edits

* more edits

* Update RELEASE.md

Limited OS to start in 6.0

* Update RELEASE.md

* Update RELEASE.md

Table to reflect support.

* Update RELEASE.md

tweaked language

* Update RELEASE.md

Tweaking language

* edits

* edits

* link

* spelling

* add link

* new section

* Add files via upload (#2701)

* updates

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Ronnie Chatterjee <111161280+ronniec91@users.noreply.github.com>
Co-authored-by: abhimeda <138710508+abhimeda@users.noreply.github.com>
2023-12-15 15:01:52 -07:00
Saad Rahim (AMD)
ba69933774 Marking TransferBench as beta (#2727) 2023-12-15 14:48:33 -07:00
abhimeda
5676b16fce Add files via upload (#2701) 2023-12-15 14:42:13 -07:00
Lisa
1828271505 Update library-index.md (#2723)
* Update library-index.md

* Update library-index.md
2023-12-15 14:33:22 -07:00
Sam Wu
5b672af67d build: Update rocm-docs-core to v0.30.2 (#2724)
* build: Update rocm-docs-core to v0.30.2

* docs: Fix doc links in index
2023-12-15 14:32:46 -07:00
Lisa
a121e35aa7 rearranging (#2718) 2023-12-15 14:03:14 -07:00
zhang2amd
2a71de6c93 Update default.xml for ROCm 6.0.0 (#2721) 2023-12-15 13:20:39 -07:00
Saad Rahim (AMD)
8588444a0d Updating release notes (#2712)
* Updating release notes

* Apply suggestions from code review

* Update RELEASE.md

Co-authored-by: Sam Wu <sjwu@ualberta.ca>

* Update RELEASE.md

Co-authored-by: Sam Wu <sjwu@ualberta.ca>

* Update into text

* Update RELEASE.md

* Update RELEASE.md

Co-authored-by: Sam Wu <sjwu@ualberta.ca>

---------

Co-authored-by: Lisa <lisajdelaney@gmail.com>
Co-authored-by: Sam Wu <sjwu@ualberta.ca>
2023-12-14 14:38:42 -07:00
Sam Wu
b8412e17f3 docs(versions.md): Add back docs versions page (#2716)
This is used by the Version List header for the rocm-docs-home theme flavor
2023-12-14 14:21:11 -07:00
Sam Wu
652f72dbdd docs: Manually add ROCgdb release notes (#2714) 2023-12-14 14:20:57 -07:00
Sam Wu
13da03473f Manual update to Release Notes (#2711)
* docs: Manually add rocprofiler release notes

* docs: Manually add HIP release notes

* Update CHANGELOG.md

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>

* docs: HIP 6.0.0

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-12-14 11:42:54 -07:00
Lisa
bcc8603454 update links, remove windows (#2706) 2023-12-14 09:21:50 -07:00
Lisa
5a53b95c7f release updates (#2707)
* release updates

* minor updates

* Update CHANGELOG.md
2023-12-14 09:20:53 -07:00
srawat
7889220f04 Mi200 counters (#2622) 2023-12-12 11:25:57 -07:00
Lisa
19eae6a8eb heading consistency (#2697)
* heading consistency

* update rocrand
2023-12-12 11:16:49 -07:00
srawat
131aa66591 Merge pull request #2700 from SwRaw/rocprofiler_index
Update library-index.md
2023-12-11 11:00:49 +05:30
Sam Wu
c648ca767b fix(tag_script.py): Update organization names for projects used in tagging script (#2698)
Most projects were moved to the ROCm organization
2023-12-08 10:44:26 -07:00
srawat
4922020441 Update library-index.md 2023-12-08 22:18:41 +05:30
srawat
07a778498c Update library-index.md 2023-12-08 22:11:54 +05:30
srawat
d75a05645f Update library-index.md 2023-12-08 17:37:53 +05:30
Sam Wu
00f7899b03 docs(conf.py): Use rocm-docs-core as extension (#2695)
* docs(conf.py): Use rocm-docs-core as extension

instead of calling and instantiating it as an object (the legacy method)

Also apply the rocm-docs-home theme flavor

* build: Update rocm-docs-core to 0.30.1
2023-12-07 09:39:45 -07:00
Sam Wu
412366ff61 Update Changelog and latest Release notes (#2648)
* docs: Remove extra newline from 5.7.1.md template

* docs: Update the changelog and latest release notes

* docs: Rebuild changelog with updated 6.0.0 edits
2023-12-06 16:27:04 -07:00
dependabot[bot]
be1fed8ca4 Bump rocm-docs-core from 0.29.0 to 0.30.0 in /docs/sphinx (#2684)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.29.0 to 0.30.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.29.0...v0.30.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-12-05 15:07:34 -07:00
Lisa
16a1d355c1 typo (#2687) 2023-12-04 10:03:02 -07:00
Lisa
3aa7072fc2 metadata test (#2656) 2023-11-30 14:37:12 -07:00
Saad Rahim (AMD)
7179884433 Left Navigation further compression for usability (#2677)
* Left Navigation further compression for usability

* Whitespace

* provide feedback
2023-11-30 13:11:17 -07:00
Lisa
3523e9e822 Open MPI updates (#2655) 2023-11-30 09:58:12 -07:00
Nagy-Egri Máté Ferenc
3b9cd77b93 Clarify mixing C++ and HIP sources via CMake (#2618)
* Clarify mixing C++ and HIP sources via CMake

* Designate code blocks

* Simplify lang around host-only use of the HIP API

* Remove superfluous wording.

* Note LINKER_LANGUAGE of mixed sources

* Space after code-block

* Single space in code-block
2023-11-29 07:03:44 -07:00
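
A minimal sketch of the "host-only use of the HIP API" the commit above documents: a translation unit with no kernels, so it can be compiled as plain C++ rather than as a HIP source. The hip::host CMake target named in the comment is an assumption based on common HIP packaging, not taken from this log.

```cpp
// Hedged illustration: host-only HIP API usage. With no __global__ kernels,
// only the runtime API header is needed, and an ordinary C++ compiler can
// build this file; in CMake terms, linking against something like hip::host
// (assumed target name) is enough.
#include <hip/hip_runtime_api.h>  // host-only header, no kernel language
#include <cstdio>

int main() {
    int count = 0;
    if (hipGetDeviceCount(&count) != hipSuccess) {
        std::fprintf(stderr, "no HIP devices visible\n");
        return 1;
    }
    std::printf("HIP devices: %d\n", count);
    return 0;
}
```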
Mátyás Aradi
ef1c21ccf7 Add CMake support (#2641)
* Add CMake support

* Update README and CHANGELOG

* Update CHANGELOG

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-11-28 09:40:25 -07:00
Istvan Kiss
35893c4df6 Remove disable spellchecks of cmake-packages.rst (#2678) 2023-11-28 07:03:13 -07:00
Saad Rahim (AMD)
c1ee7d32e0 Removing Linux installation related content (#2673)
* Removing Linux installation related content

* TOC updates

* Removing added files

* Line spacing on code block
2023-11-27 14:03:52 -07:00
Istvan Kiss
f8446befd2 Remove disable spellchecks of cmake-packages.rst (#2676) 2023-11-27 11:17:13 -07:00
dependabot[bot]
f51e1144df Bump rocm-docs-core from 0.28.0 to 0.29.0 in /docs/sphinx (#2674)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.28.0 to 0.29.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.28.0...v0.29.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-11-27 10:21:27 -07:00
Lisa
4adaff02a6 Left nav updates (#2647)
* update gpu-enabled-mpi

update the documentation to also include libfabric-based network interconnects,
not just UCX.

* add some technical terms to wordlist

* shorten left nav

* grid updates

---------

Co-authored-by: Edgar Gabriel <Edgar.Gabriel@amd.com>
Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-11-24 07:15:10 -07:00
dependabot[bot]
0d6fc80070 Bump rocm-docs-core from 0.27.0 to 0.28.0 in /docs/sphinx (#2651)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.27.0 to 0.28.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.27.0...v0.28.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-11-22 15:07:01 -07:00
Lisa
33f110e354 update ROCm name (#2660)
* update ROCm name

* update version history page
2023-11-22 10:30:10 -07:00
Saad Rahim (AMD)
9a9cf073b4 spelling check fix (#2649) 2023-11-20 10:12:39 -07:00
Lisa
1e6951dc55 add tensorflow support link (#2612)
* add tensorflow support link

* Update docs/install/tensorflow-install.md

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-11-15 15:41:36 -07:00
Jithun Nair
135e489e7a Update torchvision version to 0.15.2 for PyTorch2.0.1 (#2635)
The Ubuntu 20.04 entry contains the correct info. This corrects the info for the Ubuntu 22.04 entry.

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-11-15 15:37:57 -07:00
Lisa
c326a64381 Acronym update (#2637) 2023-11-14 08:54:13 -07:00
Lisa
37c48060f7 update release note files (#2617)
---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-11-10 15:14:59 -07:00
dependabot[bot]
3f855e386c Bump rocm-docs-core from 0.26.0 to 0.27.0 in /docs/sphinx (#2626)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.26.0 to 0.27.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.26.0...v0.27.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-11-03 07:08:50 -06:00
Sam Wu
aa5eff25fb docs: Update copyright and release history doc (#2624) 2023-11-02 10:10:34 -06:00
Istvan Kiss
ccdcfbd7e3 Fix warnings (#2623)
* Fix warnings

* Fix file conflict

* Remove duplication in 5.7.1 changelog
2023-11-02 10:00:01 -06:00
Saad Rahim (AMD)
c3eaa65705 Merge pull request #2609 from LisaDelaney/roc-5.7.x-into-develop
Merge 5.7 changes into Develop
2023-10-26 10:01:17 -06:00
Lisa Delaney
9d8a830851 linting fixes 2023-10-25 15:54:00 -06:00
Lisa Delaney
23d563eefb remove auto-generated files 2023-10-25 13:56:04 -06:00
Lisa Delaney
7585e9b165 merge conflict 2023-10-25 13:52:44 -06:00
Lisa Delaney
f0f4fa15b4 merge conflicts & remove linux install 2023-10-25 13:15:47 -06:00
Sam Wu
549b23b521 Add Roopa's changes to gpu sanitizer doc (#2607)
* Add Roopa's changes to gpu sanitizer doc

* Markdown linting fixes
2023-10-25 13:02:28 -06:00
Sam Wu
b0caf52156 Updates for consistency (#2604)
* Update RELEASE.md and 5.7.0.md to match CHANGELOG.md

* Update 5.2.0.md to match CHANGELOG.md

* Copy CHANGELOG into about folder to match RELEASE

To avoid having divergence in relative links between RELEASE and CHANGELOG
2023-10-24 12:57:39 -06:00
Lisa
201f626887 Structure cleanup (#2585)
* link fixes

* remove changelog

* remove auto-generated file
2023-10-24 10:11:41 -06:00
danpetreamd
37db70c914 fixed typo: correct path to direct rendering interface (DRI) devices is /dev/dri/renderD*. (#2593) 2023-10-24 10:11:00 -06:00
Jithun Nair
244c6a6823 Fix openmp documentation (#2598) 2023-10-23 13:03:54 -06:00
dsclear-amd
ce82a047bf Issue reporting templates roc 5.7.x (#2586)
* Adds GitHub issue templates for reporting problems, and feature requests.

* Adds issue reporting templates for logging bugs, and requesting features.

* Removed duplicate ISSUE_TEMPLATE directory.
2023-10-20 11:38:16 -06:00
Sam Wu
17a1cb8bbb docs: Remove duplicate CHANGELOG (#2591) 2023-10-20 11:07:39 -06:00
Sam Wu
afa14c518e Regenerate release notes with AMDMIGraphX (#2537)
* Regenerate changelog with AMDMIGraphX

* Add rccl 2.17.1-1 release notes

* Update 5.7.0 release notes to include lib changes
2023-10-18 08:58:02 -06:00
Sam Wu
b61a54e4f3 Update LLVM ASan documentation (#2529) 2023-10-17 16:51:51 -06:00
Saad Rahim (AMD)
227e135f5a Making GPU and OS support page titles consistent between Win and Linux (#2575) 2023-10-17 16:51:14 -06:00
Houssem MENHOUR
1e9a1ca55a Update GPU Support on Linux (#2572)
Update docs with information in the AMD blog post announcing support for some RDNA3 Radeon GPUs on Linux.

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-10-17 16:13:05 -06:00
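
For readers cross-checking the support matrices the commit above updates, a hedged sketch of querying what architecture each GPU actually reports; gcnArchName carries the LLVM target string that the support tables key on.

```cpp
// Hedged sketch: print each visible device's name and LLVM target string
// (e.g. gfx90a, or gfx1100-class values on RDNA3 parts).
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    int count = 0;
    if (hipGetDeviceCount(&count) != hipSuccess) return 1;
    for (int dev = 0; dev < count; ++dev) {
        hipDeviceProp_t prop;
        hipGetDeviceProperties(&prop, dev);
        std::printf("device %d: %s (%s)\n", dev, prop.name, prop.gcnArchName);
    }
    return 0;
}
```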
Saad Rahim (AMD)
20f3c28345 Fixing cut and paste for RDNA3 architecture of 7900 (#2574) 2023-10-17 11:34:49 -06:00
Saad Rahim (AMD)
ef93b5e176 Adding 7900 XTX and W7900 to compatibility matrix (#2573) 2023-10-17 11:16:41 -06:00
Istvan Kiss
2dd6923ab9 Fix warnings (#2548)
* Fixed most of the warnings

* Temporary fix of copied files links
2023-10-17 07:05:58 -06:00
Mészáros Gergely
59b53af074 Bump rocm-docs-core version and fix dependabot settings (#2571)
dependabot mis-detected the repository as a library
(instead of an application) and widened the rocm-docs-core version
instead of increasing it. This basically disabled pinning.

Explicitly specify that the version should be increased instead of widened,
to hopefully prevent this in the future.
2023-10-17 07:03:14 -06:00
Lisa
fd927e514d What-is and TOC clean-up (#2539) 2023-10-16 15:25:00 -06:00
Saad Rahim (AMD)
72d4da7da0 Typo in graphical workstation setting (#2569) 2023-10-16 09:56:02 -06:00
Sam Wu
aac49cef23 Regenerate changelog with AMDMIGraphX (#2544) 2023-10-16 09:48:10 -06:00
Saad Rahim (AMD)
69b8117726 Fixing links to Radeon Software for Linux install (#2568) 2023-10-16 09:35:17 -06:00
Sam Wu
9ac4a7b194 Fix typo (#2567) 2023-10-16 09:34:29 -06:00
Saad Rahim (AMD)
00163edd45 radeon software for linux announcement (#2566) 2023-10-16 09:13:28 -06:00
Nara
80fd791421 Add Radeon install instructions for Linux (#2565) 2023-10-16 09:12:17 -06:00
Saad Rahim (AMD)
f65ab4ce27 Adding UB 22.04 container to docker support matrix (#2564) 2023-10-16 07:09:08 -06:00
Sam Wu
365b31728d Update doc reqs for 5.7.1 (#2558)
* Update doc reqs

rocm-docs-core==0.26.0

* Update release notes
2023-10-13 17:12:49 -06:00
Sam Wu
b6c71018a6 Disable epub format in rtd yaml config (#2557)
Because rubric is not supported

ValueError: <container: <rubric...><container...>> is not in list
2023-10-13 16:51:16 -06:00
Sam Wu
54177e8b96 Update rtd conf.py for 5.7.1 (#2556) 2023-10-13 16:41:19 -06:00
Saad Rahim (AMD)
74f4f86c92 5.7.1 Release Notes (#2550)
* 5.7.1 Release Notes

* Run script for 5.7.1 release notes

* Update CHANGELOG header

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-10-13 16:11:48 -06:00
Nara
74d8f95afb ROCm 5.7.1 Linux install and compatibility updates (#2547) 2023-10-13 15:16:14 -06:00
Saad Rahim (AMD)
50ad3847e5 Docker Image Support table updates (#2545) 2023-10-12 14:00:30 -06:00
Lisa
c6e2856822 Update style guidelines (#2542) 2023-10-12 13:50:15 -06:00
Lisa
444efec642 Docker support updates (#2541) 2023-10-11 11:35:10 -06:00
Lisa
4b7775d264 move spack & update pytorch (#2532) 2023-10-10 14:51:55 -06:00
Nara
5700b8f9e8 fix: remove library name check since changelogs will not contain changes for different libraries (#2535) 2023-10-10 07:08:17 -06:00
Lisa
e87dba01c6 ROCm restructuring (#2521)
Flattened out page structure for improved navigability.
 * Change Table of Contents 
 * Update the install guides for Windows and Linux
 * Removed extraneous index pages
 * Removed duplicate entries from GPU architecture pages
 * spack page cleanup

---------

Co-authored-by: Sam Wu <samwu103@amd.com>
Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-10-06 15:42:11 -06:00
Lisa
7d22b96c5d remove image (#2505) 2023-10-06 15:39:53 -06:00
urtiwari
4496b2abc8 Merge pull request #2526 from urtiwari/develop
Added the table content in toc_yml file
2023-10-06 09:23:34 -07:00
urtiwari
2b788350e4 Updated the latest version in the document 2023-10-06 16:06:56 +00:00
urtiwari
e607ba6259 Merge branch 'develop' into develop 2023-10-06 08:20:10 -07:00
Sam Wu
0e7ae20a32 Docs: Update Spack prerequisite instructions (#2528)
* docs: Update Spack prerequisite instructions

* docs(Spack.md): Update phrasing for Spack prerequisite instructions

---------

Co-authored-by: Sam Wu <root@MKM-L2-SAMWU155.amd.com>
2023-10-06 09:16:29 -06:00
urtiwari
033b6d089e Removed the machine name from the document 2023-10-05 21:39:03 +00:00
urtiwari
4b62e9b90f Fixing table format 2023-10-05 20:41:38 +00:00
urtiwari
cf0798ec0d Merge branch 'develop' of https://github.com/urtiwari/ROCm into develop 2023-10-05 20:38:17 +00:00
urtiwari
75456466e7 Fixing table format 2023-10-05 20:37:00 +00:00
Sam Wu
3176676240 Fix _toc.yml.in
move spack to How To section in Table of Contents

remove duplicate entry in Table of Contents
2023-10-04 16:35:40 -06:00
urtiwari
24614972d3 Updated the table contents related to Spack 2023-10-04 22:22:33 +00:00
urtiwari
1e96665c34 Updated the table contents related to Spack 2023-10-04 22:05:27 +00:00
urtiwari
42a44e020f Merge branch 'RadeonOpenCompute:develop' into develop 2023-10-04 14:18:21 -07:00
urtiwari
99073fb9fc Updated the table contents related to Spack 2023-10-04 21:09:56 +00:00
urtiwari
9f2c53ef0a Adding Spack document (#2516)
* Adding Spack document

* Fixed the markdown errors

* Fixed the markdown errors

* Fixed the markdown errors

* Fixed the markdown errors

* Fixed the markdown errors

* Fixed the spelling errors

* Fixed the spelling errors

---------

Co-authored-by: urtiwari <you@example.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-10-02 14:00:59 -07:00
urtiwari
acd247bfe8 Fixed the spelling errors 2023-10-02 20:45:36 +00:00
Sam Wu
6e70c6026f Merge branch 'develop' into develop 2023-10-02 14:36:07 -06:00
Roopa Malavally
315b8770a4 Release Notes for 5.7.1 (#2520)
* Create 5.7.1.md

Creating release notes for 5.7.1

* Update .wordlist.txt

Added words for SPACK
2023-10-02 13:56:00 -06:00
urtiwari
060838bcc2 Fixed the spelling errors 2023-10-02 19:53:49 +00:00
Tasso
8d68b6618b Merge pull request #2514 from RadeonOpenCompute/amd/dev/azambela/path-name-change-branch
Fixed invalid path.
2023-10-02 10:36:54 -04:00
Tasso
b0d773d2a9 Merge branch 'develop' into amd/dev/azambela/path-name-change-branch 2023-10-02 10:35:02 -04:00
Tasso
aff08a5f42 Merge pull request #2518 from RadeonOpenCompute/amd/dev/azambela/rocm-opencl-branch
Removed reference to /opt/rocm/opencl/bin/clinfo
2023-10-02 10:34:42 -04:00
Saad Rahim (AMD)
39e0150f94 Merge branch 'develop' into amd/dev/azambela/path-name-change-branch 2023-10-02 08:26:55 -06:00
Saad Rahim (AMD)
d856e6fa3e Merge branch 'develop' into amd/dev/azambela/rocm-opencl-branch 2023-10-02 08:26:18 -06:00
Saad Rahim (AMD)
64496f2838 Merge pull request #2512 from saadrahim/cherry-pick-changelog
Fix Changelog Cherry Pick back to develop (#2501)
2023-09-29 16:37:17 -06:00
urtiwari
60491de85f Fixed the markdown errors 2023-09-29 18:54:49 +00:00
urtiwari
2065ff398f Fixed the markdown errors 2023-09-29 18:45:48 +00:00
urtiwari
64ad833c33 Fixed the markdown errors 2023-09-29 18:09:41 +00:00
urtiwari
d8d55a1717 Fixed the markdown errors 2023-09-29 17:44:16 +00:00
urtiwari
ee6c183aa9 Fixed the markdown errors 2023-09-29 17:32:24 +00:00
Saad Rahim (AMD)
948bb14cce Release notes fix (#2513) 2023-09-29 10:52:32 -06:00
Saad Rahim (AMD)
e29f654883 Fix Changelog (#2501) 2023-09-29 10:52:32 -06:00
Lisa
7b3e6364f9 Email link update (#2517) 2023-09-29 10:27:20 -06:00
Tasso Zambelakis
5c1b2a7a5f Removed reference to /opt/rocm/opencl/bin/clinfo
Since we are not installing the ROCm OpenCL packages, we are not able to
test ROCm with this command.

Signed-off-by: Tasso Zambelakis <Tasso.Zambelakis@amd.com>
2023-09-29 12:16:55 -04:00
YellowRoseCx
a45c51475e RX 6700* doc fixes in windows_support.md (#2497)
* RX 6700* doc fixes in windows_support.md

Correct RX 6700* LLVM target to gfx1031 in windows_support.md

Change name from "RX 6750" to "RX 6750 XT"

* Fix RX7600 LLVM to gfx1102 in windows-support.md

---------

Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-09-28 16:34:41 -06:00
urtiwari
0fa1796636 Adding Spack document 2023-09-28 20:55:47 +00:00
Sam Wu
84f2c86126 Remove extra line in package manager integration (#2511) 2023-09-28 10:13:39 -06:00
Saad Rahim (AMD)
35122729b8 Release notes fix (#2513) 2023-09-28 09:24:16 -06:00
Tasso Zambelakis
8252721a31 Fixed invalid path.
The rocm folder name in the export PATH line does not reflect the folder name actually used, /opt/rocm-5.7.0.

Signed-off-by: Tasso Zambelakis <Tasso.Zambelakis@amd.com>
2023-09-28 11:02:27 -04:00
Sam Wu
c98da4a11a Remove extra line in package_manager_integration.md (#2508) 2023-09-27 16:01:22 -06:00
Saad Rahim (AMD)
14e0fae0fe Fix Changelog (#2501) 2023-09-26 11:05:18 -06:00
dsclear-amd
f6f6bc7b24 Modifies Linux installation step organization to place newer OSes first. (#2498)
This should increase usability and prevent errors, since the most common
use case is a user running the latest version of their OS rather than the
oldest supported one.
2023-09-26 07:00:41 -06:00
Sam Wu
13bea6bf4e disable spellcheck for license 2023-09-21 13:24:01 -06:00
Sam Wu
7a5f2eb508 add alt licensing for footer link 2023-09-21 13:14:52 -06:00
Sam Wu
786b44d8eb Remove 404.md from ROCm (#2487)
* rm 404 img

* remove gitignore file

* remove 404 page on rocm
2023-09-20 11:51:31 -06:00
Sam Wu
fac4843569 Fixes for roc-5.7.x branch (#2486)
* Update Release Note Tables for 5.6.1 and 5.7.0 (#2478)

* add changelog table for 5.6.1

* update 5.7.0 changelog table

* specify svg size

* do not use xelatex

* set fontpkg

* fix typo in conf.py

* fix typo

* Update openmp.md

* rm 404 img
2023-09-20 11:49:47 -06:00
Lisa
940d2933ff Link and formatting fixes (#2482) 2023-09-20 09:55:21 -06:00
Nara
80d8eb84ef Fix incorrect LLVM target for RX 7600 in Windows Support page (#2483) 2023-09-20 07:04:05 -06:00
Sam Wu
acde6284a0 Update Release Note Tables for 5.6.1 and 5.7.0 (#2478)
* add changelog table for 5.6.1

* update 5.7.0 changelog table
2023-09-19 12:05:25 -06:00
Saad Rahim (AMD)
63a45a168e Merge pull request #2477 from RadeonOpenCompute/5.7.0-merge-to-develop
5.7.0 merge to develop
2023-09-18 15:46:56 -06:00
Saad Rahim
fe3c9ebf38 Linting fixes bullets 2023-09-18 15:34:52 -06:00
Saad Rahim
03f78be781 Merge remote-tracking branch 'origin/develop' into 5.7.0-merge-to-develop 2023-09-18 15:29:06 -06:00
Saad Rahim (AMD)
c2a4257103 Feedback 5.7 (#2476)
* update relative link to llvm asan guide

remove docs dir from path

* Minor typo and update on supported OSes

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-09-18 15:25:52 -06:00
Lisa
d0d4eed1a6 Update titles to sentence case (#2455) 2023-09-18 12:26:31 -06:00
Lisa
772b51a7d2 Add ROCm A-Z entries to TOC (#2454) 2023-09-18 12:13:56 -06:00
Nara
006546e9e6 GPU memory model (#2379) 2023-09-18 07:16:50 -06:00
zhang2amd
fdc2f51b25 Update default.xml for 5.7 (#2471)
Update version to 5.7
Added a few new projects.
2023-09-15 18:12:30 -06:00
Sam Wu
23aa1eec20 Adjust 5.7.0 highlights (#2473)
* adjust 5.7.0 highlights

* adjust important highlights phrasing
2023-09-15 17:31:47 -06:00
Sam Wu
0bcf8c03e1 Small update to wording for release note reference to ASan user guide (#2470) 2023-09-15 17:09:32 -06:00
Sam Wu
a3b2bc3395 add announcement (#2472) 2023-09-15 17:09:12 -06:00
zhang2amd
89dc44ea6c Update default.xml for 5.7 (#2471)
Update version to 5.7
Added a few new projects.
2023-09-15 16:53:41 -06:00
Saad Rahim (AMD)
5c07070e73 5.7 install instructions (#2467)
* Update install instructions to 5.7

* RTG additions to install instructions

* update install instructions for multi version

---------

Co-authored-by: Máté Ferenc Nagy-Egri <mate@streamhpc.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-09-15 11:56:23 -06:00
Sam Wu
c9630d82da HIP 5.7.0 Release Notes (#2468)
* add links to asan

* add HIP 5.7.0 release notes
2023-09-15 11:56:01 -06:00
Saad Rahim (AMD)
3974c5c1a1 Version bump in nav bar (#2465) 2023-09-15 10:32:47 -06:00
Saad Rahim (AMD)
3348de77d1 5.7 support tables (#2463) 2023-09-15 10:22:15 -06:00
Roopa Malavally
3825dbc2b3 Update Address Sanitizer docs (using-gpu-sanitizer.md) (#2460)
* Update using-gpu-sanitizer.md

Updated content

* fixes for markdown linting

use * instead of + for lists

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-09-15 10:06:48 -06:00
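
A minimal sketch of the class of defect the GPU sanitizer doc above covers: a device-side out-of-bounds write that address sanitizer instrumentation can report. The build line in the comment (instrumented compile plus an xnack-enabled offload target) reflects the general recipe in the LLVM ASan guide and is an assumption, not quoted from these commits.

```cpp
// Hedged example for the GPU sanitizer docs: 128 threads write into a
// 100-element buffer, so the last 28 writes land out of bounds. Assumed
// build, per the LLVM ASan guide (exact offload target varies by GPU):
//   hipcc -g -fsanitize=address -shared-libsan --offload-arch=gfx90a:xnack+ oob.cpp
#include <hip/hip_runtime.h>

__global__ void oob_write(int* buf) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    buf[i] = i;  // no bounds check
}

int main() {
    constexpr int n = 100;
    int* d = nullptr;
    hipMalloc(&d, n * sizeof(int));
    oob_write<<<1, 128>>>(d);  // more threads than elements
    hipDeviceSynchronize();
    hipFree(d);
    return 0;
}
```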
Sam Wu
1e92ef9a2d update using gpu sanitizer (#2462) 2023-09-15 09:03:41 -07:00
Roopa Malavally
1ae743b22a Create 5.7.0.md (#2452)
* site restructure phase 1 - file reorganization (#2428)

* Update README.md (#2440)

Fix link to CHANGELOG.md

* Create 5.7.0.md

Release notes for ROCm 5.7.0

* Update 5.7.0.md

* Update 5.7.0.md

Added release highlights for ROCm v5.7

* Update 5.7.0.md

* Update 5.7.0.md

* Update 5.7.0.md

* Update 5.7.0.md

* Update 5.7.0.md

* Update 5.7.0.md

* Update 5.7.0.md

* update markdown formatting 5.7.0.md and add links

* update RELEASE.md for 5.7.0

* add 5.7.0 release notes to CHANGELOG

* resolve rebase conflict

* Revert "site restructure phase 1 - file reorganization (#2428)"

This reverts commit d04797d1c8.

---------

Co-authored-by: Lisa <lisa.delaney@amd.com>
Co-authored-by: Vishal Rao <vishalrao@gmail.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-09-15 09:05:09 -06:00
Nara
e8c2065d7c Added notes for incompatibilities with certain TensorFlow versions. (#2435)
* Added notes for incompatibilities with certain TensorFlow versions.

* Small improvements
2023-09-13 15:55:33 -06:00
Sam Wu
14402ad410 Release notes for 5.7.0 (#2374) 2023-09-13 15:55:00 -06:00
Lisa
7c5976004f ROCm A-Z page & link cleanup (#2450) 2023-09-13 13:00:50 -06:00
Vishal Rao
dba06fe315 Update README.md (#2440)
Fix link to CHANGELOG.md
2023-09-08 10:21:16 -06:00
Lisa
890c735f53 site restructure phase 1 - file reorganization (#2428) 2023-09-08 10:02:17 -06:00
dependabot[bot]
3535c43d4e Bump rocm-docs-core from 0.23.0 to 0.24.0 in /docs/sphinx (#2438)
* Bump rocm-docs-core from 0.23.0 to 0.24.0 in /docs/sphinx

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.23.0 to 0.24.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.23.0...v0.24.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requirements.in

* Update requirements.txt

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-09-07 16:27:43 -06:00
Paul R. C. Kent
75eed2ee3e Fix RHEL9 installer links (#2426)
Co-authored-by: Saad Rahim (AMD) <44449863+saadrahim@users.noreply.github.com>
2023-09-06 11:23:01 -06:00
Saad Rahim (AMD)
0c3915923f Merge pull request #2434 from RadeonOpenCompute/merge-5.6.1
Merge 5.6.1 to develop
2023-09-06 11:16:52 -06:00
Saad Rahim (AMD)
d3049169de Merge branch 'develop' into merge-5.6.1 2023-09-05 16:19:10 -06:00
Sam Wu
6c0419fb0d Add hipSPARSELt and hipTensor to Projects and licenses (#2431)
* add hipsparselt

* add hiptensor to toc and licenses

* alphabetize licenses

* update rocm-docs-core to 0.23.0
2023-09-05 15:57:10 -06:00
srawat
996064950d OpenMP updates (#2404)
* Added deleted sections to openmp.md and other improvements

* Update CONTRIBUTING.md

* Update _toc.yml.in

* OpenMP updates for 5.7

* Update openmp.md

* Update openmp.md

* Update openmp.md

* Update openmp.md

* Update openmp.md

* Update openmp.md

* Update CONTRIBUTING.md

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-09-01 17:28:32 -06:00
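
Since the commit above reworks openmp.md, a short hedged sketch of the target-offload pattern that page documents; the amdclang++ invocation in the comment, and the gfx90a value, are assumptions that vary by GPU.

```cpp
// Hedged OpenMP offload sketch: SAXPY on the GPU via target directives.
// Assumed build: amdclang++ -fopenmp --offload-arch=gfx90a saxpy.cpp
#include <cstdio>
#include <vector>

int main() {
    const int n = 1 << 20;
    std::vector<float> x(n, 1.0f), y(n, 2.0f);
    const float a = 2.0f;
    float* xp = x.data();
    float* yp = y.data();

    // Map x in, y both ways; distribute the loop across GPU teams/threads.
    #pragma omp target teams distribute parallel for \
        map(to: xp[0:n]) map(tofrom: yp[0:n])
    for (int i = 0; i < n; ++i)
        yp[i] = a * xp[i] + yp[i];

    std::printf("y[0] = %f (expected 4.0)\n", yp[0]);
    return 0;
}
```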
dependabot[bot]
77e2424f36 Bump rocm-docs-core from 0.21.0 to 0.22.0 in /docs/sphinx (#2427)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.21.0 to 0.22.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/v0.22.0/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.21.0...v0.22.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-08-31 17:15:33 -06:00
Sam Wu
62c0afd5ba add hiptensor to list of libs (#2414) 2023-08-31 14:18:57 -06:00
Roopa Malavally
d0953efad0 Update rocmcc.md (#2424)
Fixed https://ontrack-internal.amd.com/browse/SWDEV-407505?src=confmacro
2023-08-31 10:10:11 -06:00
searlmc1
f73d941657 Update using_gpu_sanitizer.md (#2423)
Update AMD supplied libs section
2023-08-31 09:33:12 -06:00
Máté Ferenc Nagy-Egri
ddbe4cd38f Update Linux install instructions for 5.6.1 2023-08-30 07:08:50 -06:00
Sam Wu
7e097ce72a Update conf.py 2023-08-29 17:04:47 -06:00
Saad Rahim
f3d3929f11 Updating version number to 5.6.1 2023-08-29 16:56:11 -06:00
Nara
084ed7f4cb docs: fix missing '--append' flag in install instructions (#2411) 2023-08-29 16:53:28 -06:00
Saad Rahim (AMD)
7482a8b261 Bump rocm-docs-core from 0.20.0 to 0.21.0 in /docs/sphinx (#2419) (#2420)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.20.0 to 0.21.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.20.0...v0.21.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-08-29 16:08:48 -06:00
dependabot[bot]
f414c30836 Bump rocm-docs-core from 0.20.0 to 0.21.0 in /docs/sphinx (#2419)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.20.0 to 0.21.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.20.0...v0.21.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-08-29 15:58:59 -06:00
Saad Rahim (AMD)
bf8f0ccc65 Updating the manifest file (#2417) 2023-08-29 15:07:13 -06:00
Sam Wu
ed8251872f 5.6.1 Release notes (#2416)
* 5.6.1 rel notes

* update rtd config
2023-08-29 15:04:53 -06:00
Sam Wu
8c01bfbb6e Change OpenMP Image Syntax and Update RTD config (#2400)
* update rtd config

* use standard markdown syntax for openmp svg

* fix rtd config
2023-08-25 10:47:32 -06:00
Lisa
b963f7fa05 404 updates (#2406)
add 404 page image

---------

Co-authored-by: Saad Rahim <44449863+saadrahim@users.noreply.github.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-08-24 17:35:44 -06:00
Sam Wu
5b0d7bcebd fix RTD build failing on pdflatex and linting deadlock (#2398)
* docs(openmp.md): specify width and height for openmp toolchain svg

* fix linting
2023-08-23 10:54:28 -06:00
Saad Rahim
eef2937171 Merge pull request #2392 from RadeonOpenCompute/roc-5.6.x
Merging ROCm 5.6.x to develop
2023-08-21 16:27:40 -06:00
Sam Wu
52d59937d1 Update linting.yml 2023-08-21 16:17:59 -06:00
Sam Wu
ee72fbac97 Update linting.yml
remove roc**
to avoid triggering twice
2023-08-21 16:09:59 -06:00
Saad Rahim
5a33e54265 Removing duplicated concurency 2023-08-21 15:47:08 -06:00
Saad Rahim
ef248c087c Merge branch 'develop' into roc-5.6.x 2023-08-21 15:45:29 -06:00
Sam Wu
017d9717e0 build: concurrency for linting to prevent deadlock (#2394) 2023-08-21 15:44:51 -06:00
Saad Rahim
445432da13 Merge branch 'develop' into roc-5.6.x 2023-08-21 15:11:36 -06:00
Lisa
f6c439b56b Updating the What is ROCm page and related content (#2386) 2023-08-18 14:16:17 -06:00
Nara
c3e8e15e51 doc: Update version in install guide to 5.6 (#2387) 2023-08-18 13:57:45 -06:00
Nara
20ae555e61 doc: Update version in install guide to 5.6 (#2387) 2023-08-18 07:26:49 -06:00
Sam Wu
fa16caba4a Add License page (#2371)
* fix typo

* add license page

* move license in toc

* Update license.md

* improve phrasing for license

---------

Co-authored-by: Saad Rahim <44449863+saadrahim@users.noreply.github.com>
2023-08-17 08:44:51 -06:00
Saad Rahim
7c6dede59d Window updates (#2365)
* Changing SKU to Edition

* Installation phrasing

* Adding the app deployment guide

* Fixing links

* Update docs/understand/windows-app-deployment-guidelines.md

---------

Co-authored-by: Sam Wu <sjwu@ualberta.ca>
2023-08-16 16:32:54 -06:00
Lisa
4813f1f37d language cleanup of ROCm docs (#2380)
* remove 'the'

* fix linking for GitHub Known Issues in nav tree

---------

Co-authored-by: Lisa Delaney <lisa.delaney@amd.com>
2023-08-15 09:32:30 -06:00
Mátyás Aradi
261530f5f7 Fix caption typo for MI100 (#2375) 2023-08-10 08:44:45 -06:00
Roopa Malavally
d11c566fb2 Create using_gpu_sanitizer.md (#2338)
* Create using_gpu_sanitizer.md

* Created GPU Sanitizer File and Title

* add technical terms to wordlist and fix spelling

* spelling

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
Co-authored-by: b-sumner <brian.sumner@amd.com>
2023-08-09 14:53:28 -06:00
Sam Wu
14153b9540 fix typos and add links to rocm-docs-core user and developer guides in contributing section (#2372) 2023-08-09 14:02:05 -06:00
dependabot[bot]
43601a0755 Bump certifi from 2022.12.7 to 2023.7.22 in /docs/sphinx (#2369)
Bumps [certifi](https://github.com/certifi/python-certifi) from 2022.12.7 to 2023.7.22.
- [Commits](https://github.com/certifi/python-certifi/compare/2022.12.07...2023.07.22)

---
updated-dependencies:
- dependency-name: certifi
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-08-08 09:30:57 -06:00
dependabot[bot]
c3b2062c51 Bump pygments from 2.14.0 to 2.15.0 in /docs/sphinx (#2368)
Bumps [pygments](https://github.com/pygments/pygments) from 2.14.0 to 2.15.0.
- [Release notes](https://github.com/pygments/pygments/releases)
- [Changelog](https://github.com/pygments/pygments/blob/master/CHANGES)
- [Commits](https://github.com/pygments/pygments/compare/2.14.0...2.15.0)

---
updated-dependencies:
- dependency-name: pygments
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-08-04 17:31:27 -06:00
dependabot[bot]
cced9a7955 Bump cryptography from 41.0.0 to 41.0.3 in /docs/sphinx (#2367)
Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.0 to 41.0.3.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/41.0.0...41.0.3)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-08-04 17:27:40 -06:00
Sam Wu
df0ee5a0ae add version to html title 2023-08-04 17:18:41 -06:00
srawat
3bfce9c570 corrected typo in contributing.md (#2334)
* Added deleted sections to openmp.md and other improvements

* Update CONTRIBUTING.md

* add example of snake case

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-08-04 12:46:13 -06:00
Sam Wu
45505e4912 ROCm Version page (#2331)
* add ROCm versions page

* add release dates from github tags

* fix versions list table

* fix dates

* update version page title
2023-08-01 12:09:50 -06:00
Nagy-Egri Máté Ferenc
d9376ebfc7 Use linting from rocm-docs-core (#2207)
* Linting from rocm-docs-core

* Give name to doc linting CI job

* Shorter job name
2023-07-31 10:52:45 -06:00
dependabot[bot]
31fcc9aafb Bump rocm-docs-core from 0.19.0 to 0.20.0 in /docs/sphinx (#2351)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.19.0 to 0.20.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.19.0...v0.20.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-07-31 08:45:32 -06:00
Saad Rahim
6fb7b9f3b5 GPU support clarification (#2350) 2023-07-27 17:42:24 -06:00
Saad Rahim
bd553f263b GPU support clarification (#2350) 2023-07-27 17:41:41 -06:00
Saad Rahim
7f8eede7d1 linting fix 2023-07-27 16:30:18 -06:00
Saad Rahim
0741268fd5 Updating GPU support list 2023-07-27 16:30:18 -06:00
Saad Rahim
61dd65f29f Merge pull request #2349 from saadrahim/windows_additional_gpus
Windows additional GPUs
2023-07-27 16:26:30 -06:00
Saad Rahim
343693ed6f linting fix 2023-07-27 16:02:54 -06:00
Saad Rahim
3c27919a9c Updating GPU support list 2023-07-27 15:51:19 -06:00
Saad Rahim
ea1f2498f7 Merge remote-tracking branch 'origin/docs/5.6.0' into windows_additional_gpus 2023-07-27 15:38:43 -06:00
Sam Wu
4ab3787abe Merge pull request #2345 from RadeonOpenCompute/docs/5.5.1
Docs/5.5.1 Sync into 5.6
2023-07-27 13:32:02 -06:00
Saad Rahim
ebd44bb372 Merge pull request #2344 from RadeonOpenCompute/docs/5.6.0
Sync 5.6 branches
2023-07-27 13:20:39 -06:00
srawat
253f69b445 Adding openmp image (#2323)
Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-07-25 11:05:09 -06:00
Sam Wu
5f546d44b3 Update Toolchain and Contributing Guides (#2315)
* spell out HPC acronym in explanation doc

* update toolchain docs

order in importance descending

* update Contributing guide

add discussions

update formatting and grammar

* separate contributing section for readability

* fix formatting for mdl

* fix spelling
2023-07-25 10:29:45 -06:00
dependabot[bot]
a9ae111741 Bump rocm-docs-core from 0.18.3 to 0.19.0 in /docs/sphinx (#2320)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.18.3 to 0.19.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.18.3...v0.19.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-07-12 09:29:05 -06:00
Edgar Gabriel
2721042eac gpu-aware MPI changes (#2311)
- simplify the UCX configure arguments to provide only the flags that are
absolutely required

- add the UCC compatibility matrix to the docs
2023-07-06 09:17:56 -06:00
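
To make "gpu-aware MPI" from the commit above concrete: with a UCX-backed MPI built with ROCm support, device pointers can be handed straight to MPI calls with no host staging. A hedged two-rank sketch, assuming mpicxx with the HIP headers on the include path:

```cpp
// Hedged GPU-aware MPI sketch: rank 0 sends a device buffer directly to
// rank 1; the GPU-aware transport moves it without copying through host memory.
#include <hip/hip_runtime.h>
#include <mpi.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int n = 1024;
    double* dbuf = nullptr;
    hipMalloc(&dbuf, n * sizeof(double));  // device memory, not host

    if (rank == 0)
        MPI_Send(dbuf, n, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
    else if (rank == 1)
        MPI_Recv(dbuf, n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    hipFree(dbuf);
    MPI_Finalize();
    return 0;
}
```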
Sam Wu
26935408e0 Add configurations for PDF output on Read the Docs (#2305)
* add configurations for pdf output on rtd

* set date for wip release notes

* add copyright to pdf
2023-07-04 21:29:31 -06:00
Sam Wu
372a257eed Changelog updates for 5.6.0 (#2306)
* remove typos in changelog

* add 5.6 release notes

* add amd smi changes for 5.6.0
2023-06-30 09:27:39 -06:00
Sam Wu
12bc633320 Links for Reference pages (#2307)
* reorg toc to match all ref material page

* add links to docs, github, and changelogs
2023-06-29 16:55:48 -06:00
Rahul Garg
c71d83207e Update backward incompatible planned changes in 5.5 (#2279)
* Update backward incompatible planned changes

* add planned changes to changelog

* update rocm-docs-core to v0.18.3

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-06-29 10:36:31 -06:00
Sam Wu
cd1ec676f0 fix or remove broken links (#2281) 2023-06-28 16:34:38 -06:00
dependabot[bot]
d2884f482a Bump rocm-docs-core from 0.18.1 to 0.18.2 in /docs/sphinx (#2293)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.18.1 to 0.18.2.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.18.1...v0.18.2)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-28 16:16:33 -06:00
dependabot[bot]
dce4d58348 Bump rocm-docs-core from 0.18.0 to 0.18.1 in /docs/sphinx (#2280)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.18.0 to 0.18.1.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.18.0...v0.18.1)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-27 17:33:02 -06:00
dependabot[bot]
9eb46f8230 Bump rocm-docs-core from 0.17.2 to 0.18.0 in /docs/sphinx (#2278)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.17.2 to 0.18.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.17.2...v0.18.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-27 16:32:12 -06:00
srawat
73986668bb MI200 performance counters and OpenMP fixes 2023-06-27 08:17:35 -06:00
dependabot[bot]
6c179479f1 Bump rocm-docs-core from 0.17.1 to 0.17.2 in /docs/sphinx (#2276)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.17.1 to 0.17.2.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.17.1...v0.17.2)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-26 19:54:06 -06:00
dependabot[bot]
5b726ec96c Bump rocm-docs-core from 0.17.0 to 0.17.1 in /docs/sphinx (#2275)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.17.0 to 0.17.1.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.17.0...v0.17.1)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-26 16:37:42 -06:00
dependabot[bot]
e72f0dedde Bump rocm-docs-core from 0.16.0 to 0.17.0 in /docs/sphinx (#2273)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.16.0 to 0.17.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.16.0...v0.17.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-26 15:35:54 -06:00
Ehud Sharlin
57e2253828 ROCm FHS Reorganization, Backward Compatibility, and Versioning - rev (#2255) 2023-06-26 14:07:02 -06:00
dependabot[bot]
233d3632b8 Bump rocm-docs-core from 0.15.0 to 0.16.0 in /docs/sphinx (#2262)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.15.0 to 0.16.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.15.0...v0.16.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-21 21:37:05 -06:00
Sam Wu
bbfb18b5de fix rocm_smi_lib link in toc (#2260) 2023-06-21 20:22:48 -06:00
dependabot[bot]
66dd6c9467 Bump requests from 2.28.1 to 2.31.0 in /docs/sphinx (#2217)
Bumps [requests](https://github.com/psf/requests) from 2.28.1 to 2.31.0.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.28.1...v2.31.0)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-21 12:38:35 -06:00
dependabot[bot]
503809b74a Bump rocm-docs-core from 0.14.0 to 0.15.0 in /docs/sphinx (#2257)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.14.0 to 0.15.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.14.0...v0.15.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-21 11:40:37 -06:00
srawat
9bc32154d8 Swati develop (#2245)
* Added deleted sections to openmp.md and other improvements

* Update openmp.md

Tagged `ICV`

* Solve discrepancies in openmp.md

There were differences between the published document and the information conveyed by the developers; these are now fixed.

* add new words to wordlist

---------

Co-authored-by: Sam Wu <sam.wu2@amd.com>
2023-06-20 10:52:55 -06:00
dependabot[bot]
0da29b73cb Bump rocm-docs-core from 0.13.4 to 0.14.0 in /docs/sphinx (#2249)
Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.13.4 to 0.14.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.13.4...v0.14.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-16 07:17:53 -06:00
dependabot[bot]
69580ef397 Bump cryptography from 40.0.2 to 41.0.0 in /docs/sphinx (#2218)
Bumps [cryptography](https://github.com/pyca/cryptography) from 40.0.2 to 41.0.0.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/40.0.2...41.0.0)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-14 16:46:26 -06:00
Saad Rahim
7762a8d874 Fixing HIP link (#2236) 2023-06-14 16:45:08 -06:00
Sam Wu
2ec3e537a4 Update Links (#2240)
* update link to PCIe Gen 4 pdf

* fix broken links

* remove references to broken links

* fix spelling of data center
2023-06-14 07:05:06 -06:00
230 changed files with 10825 additions and 9864 deletions

2
.github/CODEOWNERS vendored

@@ -1 +1 @@
* @saadrahim @Rmalavally @amd-aakash @zhang2amd @jlgreathouse @samjwu @MathiasMagnus
* @saadrahim @Rmalavally @amd-aakash @zhang2amd @jlgreathouse @samjwu @MathiasMagnus @LisaDelaney


@@ -10,3 +10,4 @@ updates:
open-pull-requests-limit: 10
schedule:
interval: "daily"
versioning-strategy: increase

22
.github/workflows/issue_retrieval.yml vendored Normal file

@@ -0,0 +1,22 @@
name: Issue retrieval
on:
  issues:
    types: [opened]
jobs:
  auto-retrieve:
    runs-on: ubuntu-latest
    steps:
      - name: Generate a token
        id: generate_token
        uses: actions/create-github-app-token@v1
        with:
          app_id: ${{ secrets.ACTION_APP_ID }}
          private_key: ${{ secrets.ACTION_PEM }}
      - name: 'Retrieve Issue'
        uses: abhimeda/rocm_issue_management@main
        with:
          authentication-token: ${{ steps.generate_token.outputs.token }}
          github-organization: 'ROCm'
          project-num: '6'


@@ -6,7 +6,7 @@ on:
- develop
- main
- 'docs/*'
- 'roc**'
- 'roc**'
pull_request:
branches:
- develop
@@ -14,47 +14,7 @@ on:
- 'docs/*'
- 'roc**'
concurrency:
group: ${{ github.ref }}-${{ github.workflow }}
cancel-in-progress: true
jobs:
lint-rest:
name: "RestructuredText"
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Install rst-lint
run: pip install restructuredtext-lint
- name: Lint ResT files
run: rst-lint ${{ join(github.workspace, '/docs') }}
lint-md:
name: "Markdown"
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Use markdownlint-cli2
uses: DavidAnson/markdownlint-cli2-action@v10.0.1
with:
globs: '**/*.md'
spelling:
name: "Spelling"
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Fetch config
shell: sh
run: |
curl --silent --show-error --fail --location https://raw.github.com/RadeonOpenCompute/rocm-docs-core/develop/.spellcheck.yaml -O
curl --silent --show-error --fail --location https://raw.github.com/RadeonOpenCompute/rocm-docs-core/develop/.wordlist.txt >> .wordlist.txt
- name: Run spellcheck
uses: rojopolis/spellcheck-github-actions@0.30.0
- name: On fail
if: failure()
run: |
echo "Please check for spelling mistakes or add them to '.wordlist.txt' in either the root of this project or in rocm-docs-core."
call-workflow-passing-data:
name: Documentation
uses: RadeonOpenCompute/rocm-docs-core/.github/workflows/linting.yml@develop

5
.gitignore vendored

@@ -13,6 +13,7 @@ _doxygen/
_readthedocs/
# avoid duplicating contributing.md due to conf.py
docs/contributing.md
docs/release.md
docs/CHANGELOG.md
docs/contribute/index.md
docs/about/release-notes.md
docs/about/CHANGELOG.md


@@ -1,5 +1,7 @@
config:
default: true
MD004:
style: asterisk
MD013: false
MD026:
punctuation: '.,;:!'
@@ -8,7 +10,9 @@ config:
MD033: false
MD034: false
MD041: false
MD051: false
ignores:
- CHANGELOG.md
- docs/CHANGELOG.md
- "{,docs/}{RELEASE,release}.md"
- tools/autotag/templates/**/*.md


@@ -3,12 +3,19 @@
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.10"
apt_packages:
- "doxygen"
- "graphviz" # For dot graphs in doxygen
python:
install:
- requirements: docs/sphinx/requirements.txt
sphinx:
configuration: docs/conf.py
formats: [htmlzip, pdf, epub]
python:
version: "3.8"
install:
- requirements: docs/sphinx/requirements.txt
formats: []


@@ -1,49 +1,584 @@
# file_reorg
FHS
Filesystem
filesystem
incrementing
rocm
# gpu_aware_mpi
DMA
GDR
HCA
MPI
MVAPICH
Mellanox's
NIC
OFED
OSU
OpenFabrics
PeerDirect
RDMA
UCX
ib_core
# isv_deployment_win
ABI
# linear algebra
LAPACK
MMA
activations
addr
AddressSanitizer
AlexNet
alloc
allocator
allocators
ALU
AMD
AMDGPU
amdgpu
AMDGPUs
AMDMIGraphX
AMI
AOCC
AOMP
api
APIC
APIs
Arb
ASan
ASIC
ASICs
ASm
ATI
atmi
atomics
autogenerated
avx
awk
backend
backends
benchmarking
bilinear
BitCode
BLAS
Blit
blit
BMC
buildable
bursty
bzip
cacheable
CCD
cd
CDNA
CentOS
centric
changelog
chiplet
CIFAR
CLI
CLion
CMake
cmake
CMakeLists
CMakePackage
cmd
coalescable
codename
Codespaces
comgr
Commitizen
CommonMark
completers
composable
concretization
Concretized
Conda
config
conformant
convolutional
convolves
CoRR
CP
CPC
CPF
CPP
CPU
CPUs
CSC
CSE
CSn
csn
CSV
CTests
CU
cuBLAS
CUDA
cuFFT
cuLIB
cuRAND
CUs
cuSOLVER
cuSPARSE
# openmp
ICV
Multithreaded
# tuning_guides
BMC
CXX
dataset
datasets
dataspace
datatype
datatypes
dbgapi
de
deallocation
denormalize
Dependabot
deserializers
detections
dev
DevCap
devicelibs
devsel
DGEMM
disambiguates
distro
DL
DMA
DNN
DNNL
Dockerfile
Doxygen
DPM
DRI
DW
DWORD
el
enablement
endpgm
env
epilog
EPYC
ESXi
ethernet
exascale
executables
ffmpeg
FFT
FFTs
FHS
filesystem
Filesystem
Flang
FMA
Fortran
fortran
FP
galb
gcc
GCD
GCDs
GCN
GDB
gdb
GDDR
GDR
GDS
GEMM
GEMMs
GenZ
gfortran
gfx
GIM
github
Gitpod
GL
GLXT
GMI
gnupg
GPG
GPR
GPU
GPUs
grayscale
GRBM
gzip
Haswell
HBM
HCA
heterogenous
hipamd
hipBLAS
hipblas
hipBLASLt
HIPCC
hipCUB
hipcub
HIPExtension
hipFFT
hipfft
hipfort
HIPIFY
hipify
hipLIB
hipRAND
hipSOLVER
hipsolver
hipSPARSE
hipsparse
hipSPARSELt
hipTensor
HPC
HPCG
HPE
HPL
HSA
hsa
hsakmt
HWE
ib_core
ICV
IDE
IDEs
ImageNet
IMDB
inband
incrementing
inferencing
InfiniBand
inflight
init
Inlines
inlining
installable
IntelliSense
interprocedural
Intersphinx
intra
invariants
invocating
Ioffe
IOMMU
IOP
IOPM
# windows
IOV
ipo
IRQ
ISA
ISV
ISVs
JSON
Jupyter
kdb
KFD
Khronos
KVM
LAPACK
LCLK
LDS
libfabric
libjpeg
libs
linearized
linter
linux
llvm
LLVM
localscratch
logits
lossy
LSAN
LTS
Makefile
Makefiles
matchers
Matplotlib
Mellanox's
MEM
MERCHANTABILITY
MFMA
microarchitecture
MIGraphX
migraphx
MIOpen
miopen
MIOpenGEMM
miopengemm
MIVisionX
mivisionx
mkdir
mlirmiopen
MMA
MMIO
MMIOH
MNIST
MPI
MSVC
mtypes
Multicore
Multithreaded
MVAPICH
mvffr
MyEnvironment
MyST
namespace
namespaces
Nano
Navi
NBIO
NBIOs
NIC
NICs
Noncoherently
NPS
NUMA
NumPy
numref
NVCC
NVPTX
OAM
OAMs
ocl
OCP
OEM
OFED
OMP
OMPT
OMPX
ONNX
OpenCL
opencl
opencv
OpenFabrics
OpenGL
OpenMP
openmp
openssl
OpenVX
optimizers
os
OSS
OSU
Pageable
pageable
passthrough
PCI
PCIe
PeerDirect
perfcounter
Perfetto
performant
perl
PIL
PILImage
PowerShell
PnP
pragma
pre
prebuilt
precompiled
prefetch
prefetchable
preprocess
preprocessing
preq
prequantized
prerequisites
PRNG
profiler
protobuf
PRs
pseudorandom
py
PyPi
PyTorch
Qcycles
quasirandom
queueing
Radeon
RadeonOpenCompute
RCCL
rccl
RDC
rdc
RDMA
RDNA
reformats
RelWithDebInfo
repos
Req
req
resampling
RST
reStructuredText
RHEL
Rickle
roadmap
roc
ROC
RoCE
rocAL
rocALUTION
rocalution
rocBLAS
rocblas
rocclr
ROCdbgapi
rocFFT
rocfft
ROCgdb
ROCk
rocLIB
rocm
ROCm
ROCmCC
rocminfo
rocMLIR
ROCmSoftwarePlatform
ROCmValidationSuite
rocPRIM
rocprim
rocprof
ROCProfiler
rocprofiler
ROCr
rocr
rocRAND
rocrand
rocSOLVER
rocsolver
rocSPARSE
rocsparse
roct
rocThrust
rocthrust
ROCTracer
roctracer
rocWMMA
RST
runtime
runtimes
RW
Ryzen
SALU
SBIOS
SCA
scalability
SDK
SDMA
SDRAM
SENDMSG
sendmsg
SENDMSG
sendmsg
SerDes
serializers
SGPR
SGPRs
SHA
shader
Shlens
sigmoid
SIGQUIT
SIMD
SIMDs
SKU
SKUs
PowerShell
skylake
sL
SLES
sm
SMEM
SMI
smi
SMT
softmax
Spack
spack
SPI
SQs
SRAM
SRAMECC
src
stochastically
strided
subdirectory
subexpression
subfolder
subfolders
supercomputing
Supermicro
SWE
Szegedy
tagram
TCA
TCC
TCI
TCIU
TCP
TCR
TensorBoard
TensorFlow
TFLOPS
tg
th
tmp
ToC
tokenize
toolchain
toolchains
toolset
toolsets
TorchAudio
TorchMIGraphX
TorchScript
TorchServe
TorchVision
torchvision
tracebacks
TransferBench
TrapStatus
txt
UAC
# pytorch_install
kdb
precompiled
# gpu_os_support
HWE
el
uarch
ubuntu
UC
UCC
UCX
UIF
Uncached
uncached
uncorrectable
Unhandled
uninstallation
unsqueeze
unstacking
unswitching
untrusted
untuned
USM
UTCL
UTIL
utils
VALU
Vanhoucke
VBIOS
vdi
vectorizable
vectorization
vectorize
vectorized
vectorizer
vectorizes
VGPR
VGPRs
vjxb
vL
VM
VMEM
VMWare
VRAM
VSIX
VSkipped
Vulkan
walkthrough
walkthroughs
wavefront
wavefronts
WGP
whitespaces
Wojna
workgroup
Workgroups
workgroups
writeback
Writebacks
writebacks
wrreq
WX
wzo
Xeon
XGMI
Xnack
XT
Xteam
XTX
xz
YAML
yaml
YML
YModel
ysvmadyb
ZenDNN
zypper

File diff suppressed because it is too large

40
CMakeLists.txt Normal file

@@ -0,0 +1,40 @@
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
cmake_minimum_required(VERSION 3.18.0)
project(ROCm VERSION 5.7.1 LANGUAGES NONE)
option(BUILD_DOCS "Build ROCm documentation" ON)
include(GNUInstallDirs)
# Adding default path cmake modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
# Handle dependencies
include(Dependencies)
# Build docs
if(BUILD_DOCS)
  add_subdirectory(docs)
endif()


@@ -1,246 +1,94 @@
# Contributing to ROCm Docs
<head>
<meta charset="UTF-8">
<meta name="description" content="Contributing to ROCm">
<meta name="keywords" content="ROCm, contributing, contribute, maintainer, contributor">
</head>
AMD values and encourages the ROCm community to contribute to our code and
documentation. This repository is focused on ROCm documentation and this
contribution guide describes the recommended method for creating and modifying our
documentation.
# Contribute to ROCm
While interacting with ROCm Documentation, we encourage you to be polite and
respectful in your contributions, content or otherwise. Authors and maintainers of
these docs act in good faith and to the best of their knowledge.
Keep that in mind while you engage. Should you have issues with contributing
itself, refer to
[discussions](https://github.com/RadeonOpenCompute/ROCm/discussions) on the
GitHub repository.
AMD values and encourages contributions to our code and documentation. If you want to contribute
to our ROCm repositories, first review the following guidance. For documentation-specific information,
see [Contributing to ROCm docs](https://rocm.docs.amd.com/en/latest/contribute/contribute-docs.html).
## Supported Formats
ROCm is a software stack made up of a collection of drivers, development tools, and APIs that enable
GPU programming from low-level kernel to end-user applications. Because some of our components
are inherited from external projects (such as
[LLVM](https://github.com/ROCm/llvm-project) and
[Kernel driver](https://github.com/ROCm/ROCK-Kernel-Driver)), these use
project-specific contribution guidelines and workflow. Refer to their repositories for more information.
All other ROCm components follow the workflow described in the following sections.
Our documentation includes both markdown and rst files. Markdown is encouraged
over rst due to the lower barrier to participation. GitHub flavored markdown is preferred
for all submissions as it will render accurately on our GitHub repositories. For existing documentation,
[MyST](https://myst-parser.readthedocs.io/en/latest/intro.html) markdown
is used to implement certain features unsupported in GitHub markdown. This is
not encouraged for new documentation. AMD will transition
to stricter use of GitHub flavored markdown with a few caveats. ROCm documentation
also uses [sphinx-design](https://sphinx-design.readthedocs.io/en/latest/index.html)
in our markdown and rst files. We will also use Breathe syntax for Doxygen documentation
in our markdown files. Other design elements for effective HTML rendering of the documents
may be added to our markdown files. Please see
[GitHub](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github)'s
guide on writing and formatting on GitHub as a starting point.
## Development workflow
ROCm documentation adds additional requirements to markdown and rst based files
as follows:
ROCm uses GitHub to host code, collaborate, and manage version control. We use pull requests (PRs)
for all changes within our repositories. We use
[GitHub issues](https://github.com/ROCm/ROCm/issues) to track known issues, such as
bugs.
- Level one headers are only used for page titles. There must be only one level
1 header per file for both Markdown and reStructuredText.
- Pass [markdownlint](https://github.com/markdownlint/markdownlint) check via
our automated GitHub Action on a pull request (PR).
### Issue tracking
## Filenames and folder structure
Before filing a new issue, search the
[existing issues](https://github.com/ROCm/ROCm/issues) to make sure your issue isn't
already listed.
Please use snake case for file names. Our documentation follows the Pitchfork layout for
folder structure. All documentation is in /docs except for special files like
the contributing guide in the / folder. All images used in the documentation are
placed in the /docs/data folder.
General issue guidelines:
## How to provide feedback for ROCm documentation
* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and
comment or post to provide additional details, such as how you reproduced this issue.
* If you're not sure if your issue is the same, err on the side of caution and file your issue.
You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
your issue as being the same as the existing issue, we'll close the duplicate.
* If your issue doesn't exist, use the issue template to file a new issue.
* When filing an issue, be sure to provide as much information as possible, including script output so
we can collect information about your configuration. This helps reduce the time required to
reproduce your issue.
* Check your issue regularly, as we may require additional information to successfully reproduce the
issue.
There are three standard ways to provide feedback for this repository.
### Pull requests
### Pull Request
When you create a pull request, you should target the default branch. Our repositories typically use the **develop** branch as the default integration branch.
All contributions to ROCm documentation should arrive via the
[GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)
targeting the develop branch of the repository. If you are unable to contribute
via the GitHub Flow, feel free to email us. TODO, confirm email address.
When creating a PR, use the following process. Note that each repository may include additional,
project-specific steps. Refer to each repository's PR process for any additional steps.
### GitHub Issue
* Identify the issue you want to fix
* Target the default branch (usually the **develop** branch) for integration
* Ensure your code builds successfully
* Each component has a suite of test cases to run; include the log of the successful test run in your PR
* Do not break existing test cases
* New functionality is only merged with new unit tests
* If your PR includes a new feature, you must provide an application or test so we can ensure that the
feature works and continues to be valid in the future
* Tests must have good code coverage
* Submit your PR and work with the reviewer or maintainer to get your PR approved
* Once approved, the PR is brought onto internal CI systems and may be merged into the component
during our release cycle, as coordinated by the maintainer
* We'll inform you once your change is committed
Issues on existing or absent docs can be filed as
[GitHub issues](https://github.com/RadeonOpenCompute/ROCm/issues).
:::{important}
By creating a PR, you agree to allow your contribution to be licensed under the
terms of the LICENSE.txt file in the corresponding repository. Different repositories may use different
licenses.
:::
### Email Feedback
You can look up each license on the [ROCm licensing](https://rocm.docs.amd.com/en/latest/about/license.html) page.
## Language and Style
### New feature development
We adopt the Microsoft CPP-Docs guidelines for
[Voice and Tone](https://github.com/MicrosoftDocs/cpp-docs/blob/main/styleguide/voice-tone.md).
Use the [GitHub Discussion forum](https://github.com/ROCm/ROCm/discussions)
(Ideas category) to propose new features. Our maintainers are happy to provide direction and
feedback on feature development.
ROCm documentation templates will be made public shortly. ROCm templates dictate
the recommended structure and flow of the documentation. Guidelines on how to
integrate figures, equations, and tables are all based on
[MyST](https://myst-parser.readthedocs.io/en/latest/intro.html).
### Documentation
Font size and selection, page layout, white space control, and other formatting
details are controlled via rocm-docs-core, a Sphinx extension. Please raise issues
in rocm-docs-core for any formatting concerns or requested changes.
Submit ROCm documentation changes to our
[documentation repository](https://github.com/ROCm/ROCm). You must update
documentation related to any new feature or API contribution.
## Building Documentation
Note that each ROCm project uses its own repository for documentation.
While contributing, one may build the documentation locally on the command-line
or rely on Continuous Integration for previewing the resulting HTML pages in a
browser.
## Future development workflow
### Command line documentation builds
Python versions known to build documentation:
- 3.8
To build the docs locally using Python Virtual Environment (`venv`), execute the
following commands from the project root:
```sh
python3 -m venv .venv
# Windows
.venv/Scripts/python -m pip install -r docs/sphinx/requirements.txt
.venv/Scripts/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
# Linux
.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
```
Then open up `_build/html/index.html` in your favorite browser.
### Pull Requests documentation builds
When opening a PR to the `develop` branch on GitHub, the page corresponding to
the PR (`https://github.com/RadeonOpenCompute/ROCm/pull/<pr_number>`) will have
a summary at the bottom. This requires the user be logged in to GitHub.
- There, click `Show all checks` and `Details` of the Read the Docs pipeline. It
will take you to
`https://readthedocs.com/projects/advanced-micro-devices-rocm/builds/<some_build_num>/`
- The commands shown are the exact ones used by CI to produce a render
of the documentation.
- There, click on the small blue link `View docs` (which is not the same as the
bigger button with the same text). It will take you to the built HTML site with
a URL of the form
`https://advanced-micro-devices-demo--<pr_number>.com.readthedocs.build/projects/alpha/en/<pr_number>/`.
### Build the docs using VS Code
One can put together a productive environment to author documentation and also
test it locally using VS Code with only a handful of extensions. Even though the
extension landscape of VS Code is ever changing, here is one example setup that
proved useful at the time of writing. In it, one can change/add content, build a
new version of the docs using a single VS Code Task (or hotkey), see all errors/
warnings emitted by Sphinx in the Problems pane and immediately see the
resulting website show up on a locally serving web server.
#### Configuring VS Code
1. Install the following extensions:
- Python (ms-python.python)
- Live Server (ritwickdey.LiveServer)
2. Add the following entries in `.vscode/settings.json`
```json
{
"liveServer.settings.root": "/.vscode/build/html",
"liveServer.settings.wait": 1000,
"python.terminal.activateEnvInCurrentTerminal": true
}
```
The settings in order are set for the following reasons:
- Sets the root of the output website for live previews. Must be changed
alongside the `tasks.json` command.
- Tells Live Server to delay the update, giving Sphinx time to
regenerate the site contents rather than refreshing before all is done (empirical value).
- Automatic virtual env activation is a nice touch, should you want to build
the site from the integrated terminal.
3. Add the following tasks in `.vscode/tasks.json`
```json
{
  "version": "2.0.0",
  "tasks": [
    {
      "label": "Build Docs",
      "type": "process",
      "windows": {
        "command": "${workspaceFolder}/.venv/Scripts/python.exe"
      },
      "command": "${workspaceFolder}/.venv/bin/python3",
      "args": [
        "-m",
        "sphinx",
        "-j",
        "auto",
        "-T",
        "-b",
        "html",
        "-d",
        "${workspaceFolder}/.vscode/build/doctrees",
        "-D",
        "language=en",
        "${workspaceFolder}/docs",
        "${workspaceFolder}/.vscode/build/html"
      ],
      "problemMatcher": [
        {
          "owner": "sphinx",
          "fileLocation": "absolute",
          "pattern": {
            "regexp": "^(?:.*\\.{3}\\s+)?(\\/[^:]*|[a-zA-Z]:\\\\[^:]*):(\\d+):\\s+(WARNING|ERROR):\\s+(.*)$",
            "file": 1,
            "line": 2,
            "severity": 3,
            "message": 4
          }
        },
        {
          "owner": "sphinx",
          "fileLocation": "absolute",
          "pattern": {
            "regexp": "^(?:.*\\.{3}\\s+)?(\\/[^:]*|[a-zA-Z]:\\\\[^:]*):{1,2}\\s+(WARNING|ERROR):\\s+(.*)$",
            "file": 1,
            "severity": 2,
            "message": 3
          }
        }
      ],
      "group": {
        "kind": "build",
        "isDefault": true
      }
    }
  ]
}
```
> (Implementation detail: two problem matchers needed to be defined,
> because VS Code doesn't tolerate some problem information being potentially
> absent. While a single regex could match all types of errors, if a capture
> group remains empty (the line number doesn't show up in all warning/error
> messages) but the `pattern` references said empty capture group, VS Code
> discards the message completely.)
4. Configure Python virtual environment (venv)
- From the Command Palette, run `Python: Create Environment`
- Select `venv` environment and the `docs/sphinx/requirements.txt` file.
_(Simply pressing enter while hovering over the file from the dropdown is
insufficient, one has to select the radio button with the 'Space' key if
using the keyboard.)_
5. Build the docs
- Launch the default build Task using either:
- a hotkey _(default is 'Ctrl+Shift+B')_ or
- by issuing the `Tasks: Run Build Task` from the Command Palette.
6. Open the live preview
- Navigate to the output of the site within VS Code, right-click on
`.vscode/build/html/index.html` and select `Open with Live Server`. The
contents should update on every rebuild without having to refresh the
browser.
<!-- markdownlint-restore -->
The current ROCm development workflow is GitHub-based. If, in the future, we change this platform,
the tools and links may change. In that case, we will update the contribution guidelines accordingly.

60
GOVERNANCE.md Normal file

@@ -0,0 +1,60 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="ROCm governance model">
<meta name="keywords" content="ROCm, governance">
</head>
# Governance model
ROCm is a software stack made up of a collection of drivers, development tools, and APIs that enable
GPU programming from the low-level kernel to end-user applications.
Components of ROCm that are inherited from external projects (such as
[LLVM](https://github.com/ROCm/llvm-project) and
[Kernel driver](https://github.com/ROCm/ROCK-Kernel-Driver)) follow their own
governance model and code of conduct. All other components of ROCm are governed by this
document.
## Governance
ROCm is led and managed by AMD.
We welcome contributions from the community. Our maintainers review all proposed changes to
ROCm.
## Roles
* **Maintainers** are responsible for their designated component and repositories.
* **Contributors** provide input and suggest changes to existing components.
### Maintainers
Maintainers are appointed by AMD. They are able to approve changes and can commit to our
repositories. They must use pull requests (PRs) for all changes.
You can find the list of maintainers in the CODEOWNERS file of each repository. Code owners differ
between repositories.
### Contributors
If you're not a maintainer, you're a contributor. We encourage the ROCm community to contribute in
several ways:
* Help other community members by posting questions or solutions on our
[GitHub discussion forums](https://github.com/ROCm/ROCm/discussions)
* Notify us of bugs by filing an issue report on
[GitHub Issues](https://github.com/ROCm/ROCm/issues)
* Improve our documentation by submitting a PR to our
[repository](https://github.com/ROCm/ROCm/)
* Improve the code base (for smaller or contained changes) by submitting a PR to the component
* Suggest larger features by adding to the *Ideas* category in the
[GitHub discussion forum](https://github.com/ROCm/ROCm/discussions)
For more information, refer to our [contribution guidelines](CONTRIBUTING.md).
## Code of conduct
To engage with any AMD ROCm component that is hosted on GitHub, you must abide by the
[GitHub community guidelines](https://docs.github.com/en/site-policy/github-terms/github-community-guidelines)
and the
[GitHub community code of conduct](https://docs.github.com/en/site-policy/github-terms/github-community-code-of-conduct).


@@ -1,44 +1,40 @@
# AMD ROCm™ Platform
# AMD ROCm Software
ROCm is an open-source stack for GPU computation. ROCm is primarily Open-Source
Software (OSS) that allows developers the freedom to customize and tailor their
GPU software for their own needs while collaborating with a community of other
developers, and helping each other find solutions in an agile, flexible, rapid
and secure manner.
ROCm is an open-source stack, composed primarily of open-source software, designed for graphics
processing unit (GPU) computation. ROCm consists of a collection of drivers, development tools, and
APIs that enable GPU programming from low-level kernel to end-user applications.
ROCm is a collection of drivers, development tools and APIs enabling GPU
programming from the low-level kernel to end-user applications. ROCm is powered
by AMD's Heterogeneous-computing Interface for Portability (HIP), an OSS C++ GPU
programming environment and its corresponding runtime. HIP allows ROCm
developers to create portable applications on different platforms by deploying
code on a range of platforms, from dedicated gaming GPUs to exascale HPC
clusters. ROCm supports programming models such as OpenMP and OpenCL, and
includes all the necessary OSS compilers, debuggers and libraries. ROCm is fully
integrated into ML frameworks such as PyTorch and TensorFlow. ROCm can be
deployed in many ways, including through the use of containers such as Docker,
Spack, and your own build from source.
With ROCm, you can customize your GPU software to meet your specific needs. You can develop,
collaborate, test, and deploy your applications in a free, open source, integrated, and secure software
ecosystem. ROCm is particularly well-suited to GPU-accelerated high-performance computing (HPC),
artificial intelligence (AI), scientific computing, and computer aided design (CAD).
ROCm's goal is to allow our users to maximize their GPU hardware investment.
ROCm is designed to help develop, test and deploy GPU accelerated HPC, AI,
scientific computing, CAD, and other applications in a free, open-source,
integrated and secure software ecosystem.
ROCm is powered by AMD's
[Heterogeneous-computing Interface for Portability (HIP)](https://github.com/ROCm-Developer-Tools/HIP),
an open-source software C++ GPU programming environment and its corresponding runtime. HIP
allows ROCm developers to create portable applications on different platforms by deploying code on a
range of platforms, from dedicated gaming GPUs to exascale HPC clusters.
This repository contains the manifest file for ROCm™ releases, changelogs, and
release information. The file default.xml contains information for all
repositories and the associated commit used to build the current ROCm release.
ROCm supports programming models, such as OpenMP and OpenCL, and includes all necessary open
source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
(ML) frameworks, such as PyTorch and TensorFlow.
The default.xml file uses the repo Manifest format.
## ROCm documentation
The develop branch of this repository contains content for the next
ROCm release.
This repository contains the manifest file for ROCm releases, changelogs, and release information.
## ROCm Documentation
The `default.xml` file contains information for all repositories and the associated commit used to build
the current ROCm release; `default.xml` uses the repo Manifest format.
ROCm Documentation is available online at
[rocm.docs.amd.com](https://rocm.docs.amd.com). Source code for the documentation
is located in the docs folder of most repositories that are part of ROCm.
Source code for our documentation is located in the `/docs` folder of most ROCm repositories. The
`develop` branch of our repositories contains content for the next ROCm release.
### How to build documentation via Sphinx
The ROCm documentation homepage is [rocm.docs.amd.com](https://rocm.docs.amd.com).
### Building our documentation
For a quick-start build, use the following code. For more options and detail, refer to
[Building documentation](./contribute/building.md).
```bash
cd docs
@@ -48,7 +44,15 @@ pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Older ROCm™ Releases
Alternatively, a CMake build is supported.
For release information for older ROCm™ releases, refer to
```bash
cmake -B build
cmake --build build --target=doc
```
## Older ROCm releases
For release information for older ROCm releases, refer to the
[CHANGELOG](./CHANGELOG.md).


@@ -1,582 +1,248 @@
# Release Notes
<!-- Do not edit this file! This file is autogenerated with -->
<!-- tools/autotag/tag_script.py -->
# Release notes for AMD ROCm™ 6.0
<!-- Disable lints since this is an auto-generated file. -->
<!-- markdownlint-disable blanks-around-headers -->
<!-- markdownlint-disable no-duplicate-header -->
<!-- markdownlint-disable no-blanks-blockquote -->
<!-- markdownlint-disable ul-indent -->
<!-- markdownlint-disable no-trailing-spaces -->
ROCm 6.0 is a major release with new performance optimizations, expanded frameworks and library
support, and improved developer experience. This includes initial enablement of the AMD Instinct™
MI300 series. Future releases will further enable and optimize this new platform. Key features include:
<!-- spellcheck-disable -->
* Improved performance in areas like lower precision math and attention layers.
* New hipSPARSELt library accelerates AI workloads via AMD's sparse matrix core technique.
* Upstream support is now available for popular AI frameworks like TensorFlow, JAX, and PyTorch.
* New support for libraries, such as DeepSpeed, ONNX-RT, and CuPy.
* Prepackaged HPC and AI containers on AMD Infinity Hub, with improved documentation and
tutorials on the [AMD ROCm Docs](https://rocm.docs.amd.com) site.
* Consolidated developer resources and training on the new
[AMD ROCm Developer Hub](https://www.amd.com/en/developer/resources/rocm-hub.html).
The release notes for the ROCm platform.
The following sections provide a release overview for ROCm 6.0. For additional details, you can refer to
the [Changelog](https://rocm.docs.amd.com/en/develop/about/CHANGELOG.html). We list known
issues on [GitHub](https://github.com/ROCm/ROCm/issues).
-------------------
## OS and GPU support changes
## ROCm 5.6.0
<!-- markdownlint-disable first-line-h1 -->
<!-- markdownlint-disable no-duplicate-header -->
<!-- markdownlint-disable header-increment -->
#### Release Highlights
ROCm 6.0 enables the use of MI300A and MI300X accelerators with limited operating system
support. Future releases will add support for additional operating systems to match our general offering.
ROCm 5.6 consists of several AI software ecosystem improvements for our fast-growing user base. A few examples include:
| Operating Systems | MI300A | MI300X |
|:---:|:---:|:---:|
| Ubuntu 22.04.3 | Supported | Supported |
| RHEL 8.9 | Supported | |
| SLES15 SP5 | Supported | |
- New documentation portal at https://rocm.docs.amd.com
- Ongoing software enhancements for LLMs, ensuring full compliance with the HuggingFace unit test suite
- OpenAI Triton, CuPy, HIP Graph support, and many other library performance enhancements
- Improved ROCm deployment and development tools, including the CPU-GPU (rocGDB) debugger, profiler, and Docker containers
- New pseudorandom generators are available in rocRAND. Added support for half-precision transforms in hipFFT/rocFFT. Added LU refactorization and linear system solver for sparse matrices in rocSOLVER.
For older generations of supported Instinct products, we've added the following operating systems:
#### OS and GPU Support Changes
* RHEL 9.3
* RHEL 8.9
- SLES15 SP5 support was added this release. SLES15 SP3 support was dropped.
- AMD Instinct MI50, Radeon Pro VII, and Radeon VII products (collectively referred to as gfx906 GPUs) will be entering maintenance mode starting Q3 2023. This will be aligned with the ROCm 5.7 GA release date.
- No new features and performance optimizations will be supported for the gfx906 GPUs beyond ROCm 5.7
- Bug fixes / critical security patches will continue to be supported for the gfx906 GPUs till Q2 2024 (End of Maintenance [EOM])(will be aligned with the closest ROCm release)
- Bug fixes during the maintenance will be made to the next ROCm point release
- Bug fixes will not be back ported to older ROCm releases for this SKU
- Distro / Operating system updates will continue as per the ROCm release cadence for gfx906 GPUs till EOM.
Note: For ROCm 6.2 and beyond, we've planned end-of-support (EoS) for the following operating
systems:
#### AMDSMI CLI 23.0.0.4
* Ubuntu 20.04.5
* SLES 15 SP4
* RHEL/CentOS 7.9
##### Added
## New ROCm meta package
- AMDSMI CLI tool enabled for Linux Bare Metal & Guest
We've added a new ROCm meta package for easy installation of all ROCm core packages, tools, and
libraries. For example, the following command will install the full ROCm package: `apt-get install rocm`
(Ubuntu), or `yum install rocm` (RHEL).
- Package: amd-smi-lib
##### Known Issues
## Filesystem Hierarchy Standard
- Not all Error Correction Code (ECC) fields are currently supported
ROCm 6.0 fully adopts the Filesystem Hierarchy Standard (FHS) reorganization goals. We've removed
the backward compatibility support for old file locations.
- RHEL 8 & SLES 15 have extra install steps
## Compiler location change
#### Kernel Modules (DKMS)
* The installation path of LLVM has been changed from `/opt/rocm-<rel>/llvm` to
`/opt/rocm-<rel>/lib/llvm`. For backward compatibility, a symbolic link is provided to the old
location and will be removed in a future release.
* The installation path of the device library bitcode has changed from `/opt/rocm-<rel>/amdgcn` to
`/opt/rocm-<rel>/lib/llvm/lib/clang/<ver>/lib/amdgcn`. For backward compatibility, a symbolic link
is provided and will be removed in a future release.
##### Fixes
## Documentation
- Stability fix for multi-GPU systems, reproducible via ROCm_Bandwidth_Test, as reported in [Issue 2198](https://github.com/RadeonOpenCompute/ROCm/issues/2198).
CMake support has been added for documentation in the
[ROCm repository](https://github.com/RadeonOpenCompute/ROCm).
#### HIP 5.6 (For ROCm 5.6)
## AMD Instinct™ MI50 end-of-support notice
##### Optimizations
AMD Instinct MI50, Radeon Pro VII, and Radeon VII products (collectively gfx906 GPUs) enter
maintenance mode in ROCm 6.0.
- Consolidation of hipamd, rocclr and OpenCL projects in clr
- Optimized lock for graph global capture mode
As outlined in [5.6.0](https://rocm.docs.amd.com/en/docs-5.6.0/release.html), ROCm 5.7 was the
final release for gfx906 GPUs in a fully supported state.
##### Added
* Henceforth, no new features or performance optimizations will be supported for the gfx906 GPUs.
* Bug fixes and critical security patches will continue to be supported for the gfx906 GPUs until Q2
2024 (end of maintenance \[EOM] will be aligned with the closest ROCm release).
* Bug fixes will be made up to the next ROCm point release.
* Bug fixes will not be backported to older ROCm releases for gfx906.
* Distribution and operating system updates will continue per the ROCm release cadence for gfx906
GPUs until EOM.
- Added hipRTC support for amd_hip_fp16
- Added hipStreamGetDevice implementation to get the device associated with the stream (see the sketch after this list)
- Added HIP_AD_FORMAT_SIGNED_INT16 in hipArray formats
- hipArrayGetInfo for getting information about the specified array
- hipArrayGetDescriptor for getting 1D or 2D array descriptor
- hipArray3DGetDescriptor to get 3D array descriptor
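
As a minimal, hedged illustration of the new stream-device query (the file name and build line here are hypothetical), a short HIP program can ask which device a stream is associated with:

```C
// stream_device.c -- hypothetical example; build with: hipcc stream_device.c
#include <hip/hip_runtime.h>
#include <stdio.h>

int main(void) {
    hipStream_t stream;
    hipDevice_t device;
    if (hipStreamCreate(&stream) != hipSuccess) return 1;
    // hipStreamGetDevice (added in HIP 5.6) reports the device
    // associated with the given stream.
    if (hipStreamGetDevice(stream, &device) == hipSuccess)
        printf("stream is associated with device %d\n", (int)device);
    hipStreamDestroy(stream);
    return 0;
}
```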
## ROCm projects
##### Changed
The following sections contain project-specific release notes for ROCm 6.0. For additional details, you
can refer to the [Changelog](https://rocm.docs.amd.com/en/develop/about/CHANGELOG.html).
- hipMallocAsync now returns success for zero-size allocations, to match hipMalloc (see the sketch after this list)
- Separated the hipcc Perl binaries from the HIP project into the hipcc project. The hip-devel package depends on the newly added hipcc package
- Consolidation of hipamd, ROCclr, and OpenCL repositories into a single repository called clr. Instructions are updated to build HIP from sources in the HIP Installation guide
- Removed hipBusBandwidth and hipCommander samples from hip-tests
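
The zero-size allocation change above can be checked with a minimal, hedged sketch (the file name and build line are hypothetical):

```C
// zero_alloc.c -- hypothetical example; build with: hipcc zero_alloc.c
#include <hip/hip_runtime.h>
#include <stdio.h>

int main(void) {
    hipStream_t stream;
    void* ptr = NULL;
    if (hipStreamCreate(&stream) != hipSuccess) return 1;
    // A zero-byte async allocation now returns hipSuccess, matching hipMalloc.
    hipError_t err = hipMallocAsync(&ptr, 0, stream);
    printf("hipMallocAsync(0 bytes): %s\n", hipGetErrorString(err));
    if (ptr != NULL) hipFreeAsync(ptr, stream);
    hipStreamSynchronize(stream);
    hipStreamDestroy(stream);
    return 0;
}
```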
### AMD SMI
##### Fixed
* **Integrated the E-SMI (EPYC-SMI) library**.
You can now query CPU-related information directly through AMD SMI. Metrics include power,
energy, performance, and other system details.
- Fixed regression in hipMemCpyParam3D when offset is applied
* **Added support for gfx942 metrics**.
You can now query MI300 device metrics to get real-time information. Metrics include power,
temperature, energy, and performance.
##### Known Issues
### HIP
- Limited testing on xnack+ configuration
- Multiple HIP test failures (gpuvm fault or hangs)
- hipSetDevice and hipSetDeviceFlags APIs return hipErrorInvalidDevice instead of hipErrorNoDevice on a system without a GPU
- Known memory leak when code object files are loaded/unloaded via the hipModuleLoad/hipModuleUnload APIs. This issue will be fixed in a future ROCm release
* **New features to improve resource interoperability**.
* For external resource interoperability, we've added new structs and enums.
* We've added new members to HIP struct `hipDeviceProp_t` for surfaces, textures, and device
identifiers.
##### Upcoming changes in future release
* **Changes impacting backward compatibility**.
There are several changes impacting backward compatibility: we changed some struct members and
some enum values, and removed some deprecated flags. For additional information, please refer to
the Changelog.
- Removal of gcnarch from hipDeviceProp_t structure
- Addition of new fields in hipDeviceProp_t structure
- maxTexture1D
- maxTexture2D
- maxTexture1DLayered
- maxTexture2DLayered
- sharedMemPerMultiprocessor
- deviceOverlap
- asyncEngineCount
- surfaceAlignment
- unifiedAddressing
- computePreemptionSupported
- uuid
- Removal of deprecated code
- hip-hcc codes from hip code tree
- Correct hipArray usage in HIP APIs such as hipMemcpyAtoH and hipMemcpyHtoA
- HIPMEMCPY_3D fields correction (unsigned int -> size_t)
- Renaming of 'memoryType' in hipPointerAttribute_t structure to 'type'
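
To make the `hipDeviceProp_t` additions concrete, here is a hedged sketch that queries device 0 and prints a few of the fields listed above (assuming a ROCm release in which these fields are present; the file name is hypothetical):

```C
// device_props.c -- hypothetical example; build with: hipcc device_props.c
#include <hip/hip_runtime.h>
#include <stdio.h>

int main(void) {
    hipDeviceProp_t prop;
    if (hipGetDeviceProperties(&prop, 0) != hipSuccess) return 1;
    printf("device 0:          %s\n", prop.name);
    printf("maxTexture1D:      %d\n", prop.maxTexture1D);
    printf("asyncEngineCount:  %d\n", prop.asyncEngineCount);
    printf("unifiedAddressing: %d\n", prop.unifiedAddressing);
    return 0;
}
```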
### hipCUB
#### ROCgdb-13 (For ROCm 5.6.0)
* **Additional CUB API support**.
The hipCUB backend is updated to CUB and Thrust 2.1.
##### Optimized
### HIPIFY
- Improved performance when handling the end of a process with a large number of threads.
* **Enhanced CUDA2HIP document generation**.
API versions are now listed in the CUDA2HIP documentation. To see if the application binary
interface (ABI) has changed, refer to the
[*C* column](https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Runtime_API_functions_supported_by_HIP.html)
in our API documentation.
Known Issues
* **Hipified rocSPARSE**.
We've implemented support for the direct hipification of additional cuSPARSE APIs into rocSPARSE
APIs under the `--roc` option. This marks a major milestone in the roadmap towards complete
cuSPARSE-to-rocSPARSE hipification.
- On certain configurations, ROCgdb can show the following warning message:
### hipRAND
`warning: Probes-based dynamic linker interface failed. Reverting to original interface.`
* **Official release**.
hipRAND is now a *standalone project*; it's no longer available as a submodule for rocRAND.
This does not affect ROCgdb's functionality.
### hipTensor
#### ROCprofiler (For ROCm 5.6.0)
* **Added architecture support**.
We've added contraction support for gfx942 architectures, and f32 and f64 data
types.
In ROCm 5.6 the `rocprofilerv1` and `rocprofilerv2` include and library files of
ROCm 5.5 are split into separate files. The `rocmtools` files that were
deprecated in ROCm 5.5 have been removed.
* **Upgraded testing infrastructure**.
hipTensor will now support dynamic parameter configuration with input YAML config.
| ROCm 5.6 | rocprofilerv1 | rocprofilerv2 |
|-----------------|-------------------------------------|----------------------------------------|
| **Tool script** | `bin/rocprof` | `bin/rocprofv2` |
| **API include** | `include/rocprofiler/rocprofiler.h` | `include/rocprofiler/v2/rocprofiler.h` |
| **API library** | `lib/librocprofiler.so.1` | `lib/librocprofiler.so.2` |
### MIGraphX
The ROCm Profiler Tool that uses `rocprofilerV1` can be invoked using the
following command:
* **Added TorchMIGraphX**.
We introduced a Dynamo backend for Torch, which allows PyTorch to use MIGraphX directly
without first requiring a model to be converted to the ONNX model format. With a single line of
code, PyTorch users can utilize the performance and quantization benefits provided by MIGraphX.
```sh
$ rocprof …
```
* **Boosted overall performance with rocMLIR**.
We've integrated the rocMLIR library for ROCm-supported RDNA and CDNA GPUs. This
technology provides MLIR-based convolution and GEMM kernel generation.
To write a custom tool based on the `rocprofilerV1` API do the following:
* **Added INT8 support across the MIGraphX portfolio**.
We now support the INT8 data type. MIGraphX can perform the quantization or ingest
prequantized models. INT8 support extends to the MIGraphX execution provider for ONNX Runtime.
```C
// main.c
#include <rocprofiler/rocprofiler.h> // Use the rocprofilerV1 API
int main() {
  // Use the rocprofilerV1 API
  return 0;
}
```
### ROCgdb
This can be built in the following manner:
* **Added support for additional GPU architectures**.
* Navi 3 series: gfx1100, gfx1101, and gfx1102.
* MI300 series: gfx942.
```sh
$ gcc main.c -I/opt/rocm-5.6.0/include -L/opt/rocm-5.6.0/lib -lrocprofiler64
```
### rocm-smi-lib
The resulting `a.out` will depend on
`/opt/rocm-5.6.0/lib/librocprofiler64.so.1`.
* **Improved accessibility to GPU partition nodes**.
You can now view, set, and reset the compute and memory partitions. You'll also get notifications of
a GPU busy state, which helps you avoid partition set or reset failure.
The ROCm Profiler that uses `rocprofilerV2` API can be invoked using the
following command:
* **Upgraded GPU metrics to version 1.4**.
The upgraded GPU metrics binary has an improved metric version format with a content version
appended to it. You can read each metric within the binary without the full `rsmi_gpu_metric_t` data
structure.
```sh
$ rocprofv2 …
```
* **Updated GPU index sorting**.
We made GPU index sorting consistent with other ROCm software tools by optimizing it to use
`Bus:Device.Function` (BDF) instead of the card number.
To write a custom tool based on the `rocprofilerV2` API do the following:
### ROCm Compiler
```C
// main.c
#include <rocprofiler/v2/rocprofiler.h> // Use the rocprofilerV2 API
int main() {
  // Use the rocprofilerV2 API
  return 0;
}
```
* **Added kernel argument optimization on gfx942**.
With the new feature, you can preload kernel arguments into Scalar General-Purpose Registers
(SGPRs) rather than pass them in memory. This feature is enabled with a compiler option, which also
controls the number of arguments to pass in SGPRs. For more information, see:
[https://llvm.org/docs/AMDGPUUsage.html#preloaded-kernel-arguments](https://llvm.org/docs/AMDGPUUsage.html#preloaded-kernel-arguments)
This can be built in the following manner:
* **Improved register allocation at -O0**.
We've improved the register allocator used at -O0 to avoid compiler crashes (where the error
signature is 'ran out of registers during register allocation').
```sh
$ gcc main.c -I/opt/rocm-5.6.0/include -L/opt/rocm-5.6.0/lib -lrocprofiler64-v2
```
* **Improved generation of debug information**.
We've improved compile time when generating debug information for certain corner cases. We've
also improved the compiler to eliminate compiler crashes when generating debug information.
The resulting `a.out` will depend on
`/opt/rocm-5.6.0/lib/librocprofiler64.so.2`.
### ROCmValidationSuite
##### Optimized
* **Added GPU and operating system support**.
We added support for MI300X GPU in GPU Stress Test (GST).
- Improved Test Suite
### Roc Profiler
##### Added
* **Added option to specify desired Roc Profiler version**.
You can now use rocProfV1 or rocProfV2 by specifying your desired version: the legacy rocprof tool
(`rocprofv1`) now provides an option to invoke the latest version (`rocprofv2`).
- 'end_time' needs to be disabled in roctx_trace.txt
* **Automated the ISA dumping process in Advanced Thread Tracer**.
Advanced Thread Tracer (ATT) no longer depends on a user-supplied Instruction Set Architecture (ISA)
and compilation process (using ``hipcc --save-temps``) to dump the ISA from running kernels.
##### Fixed
* **Added ATT support for parallel kernels**.
The automatic ISA dumping process also helps ATT successfully parse multiple kernels running in
parallel, and provide cycle-accurate occupancy information for multiple kernels at the same time.
- rocprof in ROCm/5.4.0 GPU selector broken.
- rocprof in ROCm/5.4.1 fails to generate kernel info.
- rocprof clobbers LD_PRELOAD.
### ROCr
### Library Changes in ROCm 5.6.0
* **Support for SDMA link aggregation**.
If multiple XGMI links are available when making SDMA copies between GPUs, the copy is
distributed over multiple links to increase peak bandwidth.
| Library | Version |
|---------|---------|
| hipBLAS | ⇒ [1.0.0](https://github.com/ROCmSoftwarePlatform/hipBLAS/releases/tag/rocm-5.6.0) |
| hipCUB | ⇒ [2.13.1](https://github.com/ROCmSoftwarePlatform/hipCUB/releases/tag/rocm-5.6.0) |
| hipFFT | ⇒ [1.0.12](https://github.com/ROCmSoftwarePlatform/hipFFT/releases/tag/rocm-5.6.0) |
| hipSOLVER | ⇒ [1.8.0](https://github.com/ROCmSoftwarePlatform/hipSOLVER/releases/tag/rocm-5.6.0) |
| hipSPARSE | ⇒ [2.3.6](https://github.com/ROCmSoftwarePlatform/hipSPARSE/releases/tag/rocm-5.6.0) |
| MIOpen | ⇒ [2.19.0](https://github.com/ROCmSoftwarePlatform/MIOpen/releases/tag/rocm-5.6.0) |
| rccl | ⇒ [2.15.5](https://github.com/ROCmSoftwarePlatform/rccl/releases/tag/rocm-5.6.0) |
| rocALUTION | ⇒ [2.1.9](https://github.com/ROCmSoftwarePlatform/rocALUTION/releases/tag/rocm-5.6.0) |
| rocBLAS | ⇒ [3.0.0](https://github.com/ROCmSoftwarePlatform/rocBLAS/releases/tag/rocm-5.6.0) |
| rocFFT | ⇒ [1.0.23](https://github.com/ROCmSoftwarePlatform/rocFFT/releases/tag/rocm-5.6.0) |
| rocm-cmake | ⇒ [0.9.0](https://github.com/RadeonOpenCompute/rocm-cmake/releases/tag/rocm-5.6.0) |
| rocPRIM | ⇒ [2.13.0](https://github.com/ROCmSoftwarePlatform/rocPRIM/releases/tag/rocm-5.6.0) |
| rocRAND | ⇒ [2.10.17](https://github.com/ROCmSoftwarePlatform/rocRAND/releases/tag/rocm-5.6.0) |
| rocSOLVER | ⇒ [3.22.0](https://github.com/ROCmSoftwarePlatform/rocSOLVER/releases/tag/rocm-5.6.0) |
| rocSPARSE | ⇒ [2.5.2](https://github.com/ROCmSoftwarePlatform/rocSPARSE/releases/tag/rocm-5.6.0) |
| rocThrust | ⇒ [2.18.0](https://github.com/ROCmSoftwarePlatform/rocThrust/releases/tag/rocm-5.6.0) |
| rocWMMA | ⇒ [1.1.0](https://github.com/ROCmSoftwarePlatform/rocWMMA/releases/tag/rocm-5.6.0) |
| Tensile | ⇒ [4.37.0](https://github.com/ROCmSoftwarePlatform/Tensile/releases/tag/rocm-5.6.0) |
### rocThrust
#### hipBLAS 1.0.0
* **Added Thrust 2.1 API support**.
rocThrust backend is updated to Thrust and CUB 2.1.
hipBLAS 1.0.0 for ROCm 5.6.0
### rocWMMA
##### Changed
* **Added new architecture support**.
We added support for gfx942 architectures.
- added const qualifier to hipBLAS functions (swap, sbmv, spmv, symv, trsm) where missing
* **Added data type support**.
We added support for f8, bf8, xf32 data types on supporting architectures, and for bf16 in the HIP RTC
environment.
##### Removed
* **Added support for the PyTorch kernel plugin**.
We added awareness of `__HIP_NO_HALF_CONVERSIONS__` to support PyTorch users.
- removed support for deprecated hipblasInt8Datatype_t enum
- removed support for deprecated hipblasSetInt8Datatype and hipblasGetInt8Datatype functions
### TransferBench (beta)
##### Deprecated
* **Improved ordering control**.
You can now set the thread block size (`BLOCK_SIZE`) and the thread block order (`BLOCK_ORDER`)
in which thread blocks from different transfers are run when using a single stream.
- in-place trmm is deprecated. It will be replaced by trmm which includes both in-place and
out-of-place functionality
* **Added comprehensive reports**.
We modified individual transfers to report X Compute Clusters (XCC) ID when `SHOW_ITERATIONS`
is set to 1.
#### hipCUB 2.13.1
hipCUB 2.13.1 for ROCm 5.6.0
##### Added
- Benchmarks for `BlockShuffle`, `BlockLoad`, and `BlockStore`.
##### Changed
- CUB backend references CUB and Thrust version 1.17.2.
- Improved benchmark coverage of `BlockScan` by adding `ExclusiveScan`, benchmark coverage of `BlockRadixSort` by adding `SortBlockedToStriped`, and benchmark coverage of `WarpScan` by adding `Broadcast`.
- Updated `docs` directory structure to match the standard of [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core).
##### Known Issues
- `BlockRadixRankMatch` is currently broken under the rocPRIM backend.
- `BlockRadixRankMatch` with a warp size that does not exactly divide the block size is broken under the CUB backend.
#### hipFFT 1.0.12
hipFFT 1.0.12 for ROCm 5.6.0
##### Added
- Implemented the hipfftXtMakePlanMany, hipfftXtGetSizeMany, hipfftXtExec APIs, to allow requesting half-precision transforms.
##### Changed
- Added --precision argument to benchmark/test clients. --double is still accepted but is deprecated as a method to request a double-precision transform.
#### hipSOLVER 1.8.0
hipSOLVER 1.8.0 for ROCm 5.6.0
##### Added
- Added compatibility API with hipsolverRf prefix
#### hipSPARSE 2.3.6
hipSPARSE 2.3.6 for ROCm 5.6.0
##### Added
- Added SpGEMM algorithms
##### Changed
- For hipsparseXbsr2csr and hipsparseXcsr2bsr, blockDim == 0 now returns HIPSPARSE_STATUS_INVALID_SIZE
#### MIOpen 2.19.0
MIOpen 2.19.0 for ROCm 5.6.0
##### Added
- ROCm 5.5 support for gfx1101 (Navi32)
##### Changed
- Tuning results for MLIR on ROCm 5.5
- Bumping MLIR commit to 5.5.0 release tag
##### Fixed
- Fixed a 3D convolution host API bug
- [HOTFIX][MI200][FP16] Disabled ConvHipImplicitGemmBwdXdlops when FP16_ALT is required.
#### rccl 2.15.5
RCCL 2.15.5 for ROCm 5.6.0
##### Changed
- Compatibility with NCCL 2.15.5
- Unit test executable renamed to rccl-UnitTests
##### Added
- HW-topology aware binary tree implementation
- Experimental support for MSCCL
- New unit tests for hipGraph support
- NPKit integration
##### Fixed
- rocm-smi ID conversion
- Support for HIP_VISIBLE_DEVICES for unit tests
- Support for p2p transfers to non (HIP) visible devices
##### Removed
- Removed TransferBench from tools; it now lives in a standalone repository: https://github.com/ROCmSoftwarePlatform/TransferBench
#### rocALUTION 2.1.9
rocALUTION 2.1.9 for ROCm 5.6.0
##### Improved
- Fixed synchronization issues in level 1 routines
#### rocBLAS 3.0.0
rocBLAS 3.0.0 for ROCm 5.6.0
##### Optimizations
- Improved performance of Level 2 rocBLAS GEMV on gfx90a GPU for non-transposed problems having small matrices and larger batch counts. Performance enhanced for problem sizes when m and n <= 32 and batch_count >= 256.
- Improved performance of rocBLAS syr2k for single, double, and double-complex precision, and her2k for double-complex precision. Slightly improved performance for general sizes on gfx90a.
##### Added
- Added bf16 inputs and f32 compute support to Level 1 rocBLAS Extension functions axpy_ex, scal_ex and nrm2_ex.
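For example, a bf16 axpy with f32 compute might look like this minimal sketch (argument order per the public `rocblas_axpy_ex` documentation; error checking omitted):
```cpp
#include <rocblas/rocblas.h>  // header path may be <rocblas.h> on older ROCm
// y = alpha * x + y, with bf16 storage for x and y, f32 alpha and compute.
void bf16_axpy(rocblas_handle handle, rocblas_int n, const float* alpha,
               const rocblas_bfloat16* dx, rocblas_bfloat16* dy)
{
    rocblas_axpy_ex(handle, n,
                    alpha, rocblas_datatype_f32_r,   // alpha type: f32
                    dx, rocblas_datatype_bf16_r, 1,  // x type and incx
                    dy, rocblas_datatype_bf16_r, 1,  // y type and incy
                    rocblas_datatype_f32_r);         // execution (compute) type
}
```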
##### Deprecated
- trmm inplace is deprecated. It will be replaced by trmm that has both inplace and out-of-place functionality
- rocblas_query_int8_layout_flag() is deprecated and will be removed in a future release
- rocblas_gemm_flags_pack_int8x4 enum is deprecated and will be removed in a future release
- rocblas_set_device_memory_size() is deprecated and will be replaced by a future function rocblas_increase_device_memory_size()
- rocblas_is_user_managing_device_memory() is deprecated and will be removed in a future release
##### Removed
- The is_complex helper was deprecated and has now been removed. Use rocblas_is_complex instead.
- The enum truncate_t and the value truncate were deprecated and have now been removed. They were replaced by rocblas_truncate_t and rocblas_truncate, respectively.
- rocblas_set_int8_type_for_hipblas was deprecated and is now removed.
- rocblas_get_int8_type_for_hipblas was deprecated and is now removed.
##### Dependencies
- Added a build-only dependency on the Python joblib package, which is used by the Tensile build
- Fixed CMake installation on some operating systems when performed via install.sh -d --cmake_install
##### Fixed
- Made trsm offset calculations 64-bit safe
##### Changed
- Refactored the rotg test code
#### rocFFT 1.0.23
rocFFT 1.0.23 for ROCm 5.6.0
##### Added
- Implemented half-precision transforms, which can be requested by passing rocfft_precision_half to rocfft_plan_create (see the sketch below).
- Implemented a hierarchical solution map which saves how to decompose a problem and the kernels to be used.
- Implemented a first version of offline-tuner to support tuning kernels for C2C/Z2Z problems.
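A minimal sketch of creating and running such a plan (1D, in-place, complex forward; names per the public rocFFT API, error checking omitted):
```cpp
#include <rocfft/rocfft.h>  // header path may be <rocfft.h> on older ROCm
// Execute a 1D in-place complex forward FFT in half precision.
// d_data is a device buffer of `length` complex-half values.
void half_precision_fft(void* d_data)
{
    rocfft_setup();
    rocfft_plan plan = nullptr;
    size_t length = 4096;
    rocfft_plan_create(&plan, rocfft_placement_inplace,
                       rocfft_transform_type_complex_forward,
                       rocfft_precision_half,    // new in rocFFT 1.0.23
                       1, &length, 1, nullptr);  // 1D, one transform, no description
    void* buffers[1] = {d_data};
    rocfft_execute(plan, buffers, nullptr, nullptr);  // in-place, default exec info
    rocfft_plan_destroy(plan);
    rocfft_cleanup();
}
```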
##### Changed
- Replaced std::complex with hipComplex data types for data generator.
- FFT plan dimensions are now sorted to be row-major internally where possible, which produces better plans if the dimensions were accidentally specified in a different order (column-major, for example).
- Added --precision argument to benchmark/test clients. --double is still accepted but is deprecated as a method to request a double-precision transform.
##### Fixed
- Fixed over-allocation of LDS in some real-complex kernels, which was resulting in kernel launch failure.
#### rocm-cmake 0.9.0
rocm-cmake 0.9.0 for ROCm 5.6.0
##### Added
- Added the option ROCM_HEADER_WRAPPER_WERROR
  - A compile-time C macro in the wrapper headers causes errors to be emitted instead of warnings.
  - A configure-time CMake option sets the default for the C macro.
#### rocPRIM 2.13.0
rocPRIM 2.13.0 for ROCm 5.6.0
##### Added
- New block level `radix_rank` primitive.
- New block level `radix_rank_match` primitive.
- Added a stable block sorting implementation. This can be used with `block_sort` by using the `block_sort_algorithm::stable_merge_sort` algorithm (see the sketch below).
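A hedged kernel sketch of the stable variant, assuming the `block_sort<Key, BlockSize, ItemsPerThread, Value, Algorithm>` template parameter order from recent rocPRIM headers:
```cpp
#include <hip/hip_runtime.h>
#include <rocprim/rocprim.hpp>
// Each thread contributes one key; keys are sorted stably across the block.
template<unsigned int BlockSize>
__global__ void sort_block_stable(int* keys)
{
    using sorter = rocprim::block_sort<
        int, BlockSize, 1, rocprim::empty_type,
        rocprim::block_sort_algorithm::stable_merge_sort>;
    __shared__ typename sorter::storage_type storage;
    const unsigned int idx = blockIdx.x * BlockSize + threadIdx.x;
    int key = keys[idx];
    sorter().sort(key, storage);  // stable merge sort within the block
    keys[idx] = key;
}
```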
##### Changed
- Improved the performance of `block_radix_sort` and `device_radix_sort`.
- Improved the performance of `device_merge_sort`.
- Updated `docs` directory structure to match the standard of [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core). Contributed by: [v01dXYZ](https://github.com/v01dXYZ).
##### Known Issues
- Disabled GPU error messages relating to incorrect warp operation usage with Navi GPUs on Windows, due to GPU printf performance issues on Windows.
- When `ROCPRIM_DISABLE_LOOKBACK_SCAN` is set, `device_scan` fails for input sizes bigger than `scan_config::size_limit`, which defaults to `std::numeric_limits<unsigned int>::max()`.
#### rocRAND 2.10.17
rocRAND 2.10.17 for ROCm 5.6.0
##### Added
- MT19937 pseudorandom number generator based on M. Matsumoto and T. Nishimura, 1998, "Mersenne Twister: A 623-dimensionally equidistributed uniform pseudorandom number generator" (see the sketch below).
- New benchmark for the device API using Google Benchmark, `benchmark_rocrand_device_api`, replacing `benchmark_rocrand_kernel`. `benchmark_rocrand_kernel` is deprecated and will be removed in a future version. Likewise, `benchmark_curand_host_api` is added to replace `benchmark_curand_generate` and `benchmark_curand_device_api` is added to replace `benchmark_curand_kernel`.
- Experimental HIP-CPU feature
- ThreeFry pseudorandom number generator based on Salmon et al., 2011, "Parallel random numbers: as easy as 1, 2, 3".
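Using the new generator from the host API is a one-enum change; a minimal sketch (error checking omitted):
```cpp
#include <rocrand/rocrand.h>  // header path may be <rocrand.h> on older ROCm
// Fill a device buffer with n uniform floats using the MT19937 generator.
void fill_uniform_mt19937(float* d_out, size_t n)
{
    rocrand_generator gen;
    rocrand_create_generator(&gen, ROCRAND_RNG_PSEUDO_MT19937);
    rocrand_set_seed(gen, 1234ULL);
    rocrand_generate_uniform(gen, d_out, n);
    rocrand_destroy_generator(gen);
}
```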
##### Changed
- Python 2.7 is no longer officially supported.
#### rocSOLVER 3.22.0
rocSOLVER 3.22.0 for ROCm 5.6.0
##### Added
- LU refactorization for sparse matrices
  - CSRRF_ANALYSIS
  - CSRRF_SUMLU
  - CSRRF_SPLITLU
  - CSRRF_REFACTLU
- Linear system solver for sparse matrices
  - CSRRF_SOLVE
- Added type `rocsolver_rfinfo` for use with sparse matrix routines
##### Optimized
- Improved the performance of BDSQR and GESVD when singular vectors are requested
##### Fixed
- BDSQR and GESVD should no longer hang when the input contains `NaN` or `Inf`
#### rocSPARSE 2.5.2
rocSPARSE 2.5.2 for ROCm 5.6.0
##### Improved
- Fixed a memory leak in csritsv
- Fixed a bug in csrsm and bsrsm
#### rocThrust 2.18.0
rocThrust 2.18.0 for ROCm 5.6.0
##### Fixed
- `lower_bound`, `upper_bound`, and `binary_search` failed to compile for certain types.
##### Changed
- Updated `docs` directory structure to match the standard of [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core).
#### rocWMMA 1.1.0
rocWMMA 1.1.0 for ROCm 5.6.0
##### Added
- Added cross-lane operation backends (Blend, Permute, Swizzle and Dpp)
- Added GPU kernels for rocWMMA unit test pre-process and post-process operations (fill, validation)
- Added performance gemm samples for half, single and double precision
- Added rocWMMA cmake versioning
- Added vectorized support in coordinate transforms
- Included ROCm SMI for runtime clock rate detection
- Added fragment transforms for transpose and change data layout
##### Changed
- Default to GPU rocBLAS validation against rocWMMA
- Re-enabled int8 gemm tests on gfx9
- Upgraded to C++17
- Restructured unit test folder for consistency
- Consolidated rocWMMA samples common code
#### Tensile 4.37.0
Tensile 4.37.0 for ROCm 5.6.0
##### Added
- Added user driven tuning API
- Added decision tree fallback feature
- Added SingleBuffer + AtomicAdd option for GlobalSplitU
- DirectToVgpr support for fp16 and Int8 with TN orientation
- Added new test cases for various functions
- Added SingleBuffer algorithm for ZGEMM/CGEMM
- Added joblib for parallel map calls
- Added support for MFMA + LocalSplitU + DirectToVgprA+B
- Added asmcap check for MIArchVgpr
- Added support for MFMA + LocalSplitU
- Added frequency, power, and temperature data to the output
##### Optimizations
- Improved the performance of GlobalSplitU with SingleBuffer algorithm
- Reduced the running time of the extended and pre_checkin tests
- Optimized the Tailloop section of the assembly kernel
- Optimized complex GEMM (fixed vgpr allocation, unified CGEMM and ZGEMM code in MulMIoutAlphaToArch)
- Improved the performance of the second kernel of MultipleBuffer algorithm
##### Changed
- Updated custom kernels with 64-bit offsets
- Adapted 64-bit offset arguments for assembly kernels
- Improved temporary register re-use to reduce max sgpr usage
- Removed some restrictions on VectorWidth and DirectToVgpr
- Updated the dependency requirements for Tensile
- Changed the range of AssertSummationElementMultiple
- Modified the error messages for more clarity
- Changed DivideAndReminder to vectorStaticRemainder in case quotient is not used
- Removed dummy vgpr for vectorStaticRemainder
- Removed tmpVgpr parameter from vectorStaticRemainder/Divide/DivideAndReminder
- Removed qReg parameter from vectorStaticRemainder
##### Fixed
- Fixed tmp sgpr allocation to avoid over-writing values (alpha)
- 64-bit offset parameters for post kernels
- Fixed gfx908 CI test failures
- Fixed offset calculation to prevent overflow for large offsets
- Fixed issues when BufferLoad and BufferStore are equal to zero
- Fixed StoreCInUnroll + DirectToVgpr + no useInitAccVgprOpt mismatch
- Fixed DirectToVgpr + LocalSplitU + FractionalLoad mismatch
- Fixed the memory access error related to StaggerU + large stride
- Fixed ZGEMM 4x4 MatrixInst mismatch
- Fixed DGEMM 4x4 MatrixInst mismatch
- Fixed ASEM + GSU + NoTailLoop opt mismatch
- Fixed AssertSummationElementMultiple + GlobalSplitU issues
- Fixed ASEM + GSU + TailLoop inner unroll

View File

@@ -0,0 +1,47 @@
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# ###########################
# ROCm dependencies
# ###########################
include(FetchContent)
if(BUILD_DOCS)
  find_package(ROCM 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}") # First version with Sphinx doc gen improvement
  if(NOT ROCM_FOUND)
    message(STATUS "ROCm CMake not found. Fetching...")
    set(rocm_cmake_tag
        "c044bb52ba85058d28afe2313be98d9fed02e293" # develop@2023.09.12. (move to 6.0 tag when released)
        CACHE STRING "rocm-cmake tag to download")
    FetchContent_Declare(
      rocm-cmake
      GIT_REPOSITORY https://github.com/RadeonOpenCompute/rocm-cmake.git
      GIT_TAG ${rocm_cmake_tag}
      SOURCE_SUBDIR "DISABLE ADDING TO BUILD" # We don't really want to consume the build and test targets of ROCm CMake.
    )
    FetchContent_MakeAvailable(rocm-cmake)
    find_package(ROCM CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
  else()
    find_package(ROCM 0.11.0 CONFIG REQUIRED PATHS "${ROCM_PATH}")
  endif()
endif()

View File

@@ -1,75 +1,77 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="roc-github"
fetch="https://github.com/RadeonOpenCompute/" />
<remote name="rocm-devtools"
fetch="https://github.com/ROCm-Developer-Tools/" />
<remote name="rocm-swplat"
fetch="https://github.com/ROCmSoftwarePlatform/" />
<remote name="gpuopen-libs"
fetch="https://github.com/GPUOpen-ProfessionalCompute-Libraries/" />
<remote name="gpuopen-tools"
fetch="https://github.com/GPUOpen-Tools/" />
<remote name="KhronosGroup"
fetch="https://github.com/KhronosGroup/" />
<default revision="refs/tags/rocm-5.6.0"
remote="roc-github"
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<remote name="roc-github" fetch="https://github.com/RadeonOpenCompute/" />
<remote name="rocm-devtools" fetch="https://github.com/ROCm-Developer-Tools/" />
<remote name="rocm-swplat" fetch="https://github.com/ROCmSoftwarePlatform/" />
<remote name="gpuopen-libs" fetch="https://github.com/GPUOpen-ProfessionalCompute-Libraries/" />
<remote name="gpuopen-tools" fetch="https://github.com/GPUOpen-Tools/" />
<remote name="KhronosGroup" fetch="https://github.com/KhronosGroup/" />
<default revision="refs/tags/rocm-6.0.0"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCM-->
<project name="ROCK-Kernel-Driver" remote="roc-github" />
<project name="ROCT-Thunk-Interface" remote="roc-github" />
<project name="ROCR-Runtime" remote="roc-github" />
<project name="rocm_smi_lib" remote="roc-github" />
<project name="rocm-core" remote="roc-github" />
<project name="rocm-cmake" remote="roc-github" />
<project name="rocminfo" remote="roc-github" />
<project name="rocprofiler" remote="rocm-devtools" />
<project name="roctracer" remote="rocm-devtools" />
<!--list of projects for ROCm-->
<project name="ROCK-Kernel-Driver" />
<project name="ROCT-Thunk-Interface" />
<project name="ROCR-Runtime" />
<project name="amdsmi" />
<project name="rocm_smi_lib" />
<project name="rocm-core" />
<project name="rocm-cmake" />
<project name="rocminfo" />
<project name="rocm_bandwidth_test" />
<project name="rocprofiler" />
<project name="roctracer" />
<project path="ROCm-OpenCL-Runtime/api/opencl/khronos/icd" name="OpenCL-ICD-Loader" remote="KhronosGroup" revision="6c03f8b58fafd9dd693eaac826749a5cfad515f8" />
<project name="clang-ocl" remote="roc-github" />
<project name="clang-ocl" />
<project name="rdc" />
<!--HIP Projects-->
<project name="HIP" remote="rocm-devtools" />
<project name="clr" remote="rocm-devtools" />
<project name="HIP-Examples" remote="rocm-devtools" />
<project name="HIPIFY" remote="rocm-devtools" />
<project name="HIPCC" remote="rocm-devtools" />
<project name="HIP" />
<project name="HIP-Examples" />
<project name="clr" />
<project name="hipother" />
<project name="HIPIFY" />
<project name="HIPCC" />
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
<project name="llvm-project" remote="roc-github" />
<project name="ROCm-Device-Libs" remote="roc-github" />
<project name="ROCm-CompilerSupport" remote="roc-github" />
<project name="rocr_debug_agent" remote="rocm-devtools" />
<project name="rocm_bandwidth_test" remote="roc-github" />
<project name="half" remote="rocm-swplat" revision="37742ce15b76b44e4b271c1e66d13d2fa7bd003e" />
<project name="RCP" remote="gpuopen-tools" revision="3a49405a1500067c49d181844ec90aea606055bb" />
<project name="llvm-project" />
<project name="ROCm-Device-Libs" />
<project name="ROCm-CompilerSupport" />
<project name="half" revision="37742ce15b76b44e4b271c1e66d13d2fa7bd003e" />
<!-- gdb projects -->
<project name="ROCgdb" remote="rocm-devtools" />
<project name="ROCdbgapi" remote="rocm-devtools" />
<project name="ROCgdb" />
<project name="ROCdbgapi" />
<project name="rocr_debug_agent" />
<!-- ROCm Libraries -->
<project name="rdc" remote="roc-github" />
<project groups="mathlibs" name="rocBLAS" remote="rocm-swplat" />
<project groups="mathlibs" name="Tensile" remote="rocm-swplat" />
<project groups="mathlibs" name="hipBLAS" remote="rocm-swplat" />
<project groups="mathlibs" name="rocFFT" remote="rocm-swplat" />
<project groups="mathlibs" name="hipFFT" remote="rocm-swplat" />
<project groups="mathlibs" name="rocRAND" remote="rocm-swplat" />
<project groups="mathlibs" name="rocSPARSE" remote="rocm-swplat" />
<project groups="mathlibs" name="rocSOLVER" remote="rocm-swplat" />
<project groups="mathlibs" name="hipSOLVER" remote="rocm-swplat" />
<project groups="mathlibs" name="hipSPARSE" remote="rocm-swplat" />
<project groups="mathlibs" name="rocALUTION" remote="rocm-swplat" />
<project name="MIOpen" remote="rocm-swplat" />
<project groups="mathlibs" name="rccl" remote="rocm-swplat" />
<project name="MIVisionX" remote="gpuopen-libs" />
<project groups="mathlibs" name="rocThrust" remote="rocm-swplat" />
<project groups="mathlibs" name="hipCUB" remote="rocm-swplat" />
<project groups="mathlibs" name="rocPRIM" remote="rocm-swplat" />
<project groups="mathlibs" name="rocWMMA" remote="rocm-swplat" />
<project name="hipfort" remote="rocm-swplat" />
<project name="AMDMIGraphX" remote="rocm-swplat" />
<project name="ROCmValidationSuite" remote="rocm-devtools" />
<project groups="mathlibs" name="rocBLAS" />
<project groups="mathlibs" name="Tensile" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipBLAS" />
<project groups="mathlibs" name="hipBLASLt" />
<project groups="mathlibs" name="rocFFT" />
<project groups="mathlibs" name="hipFFT" />
<project groups="mathlibs" name="rocRAND" />
<project groups="mathlibs" name="hipRAND" />
<project groups="mathlibs" name="rocSPARSE" />
<project groups="mathlibs" name="hipSPARSELt" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipSPARSE" />
<project groups="mathlibs" name="rocALUTION" />
<project groups="mathlibs" name="rocThrust" />
<project groups="mathlibs" name="hipCUB" />
<project groups="mathlibs" name="rocPRIM" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rccl" />
<project name="MIOpen" />
<project name="composable_kernel" />
<project name="MIVisionX" />
<project name="rpp" />
<project name="hipfort" />
<project name="AMDMIGraphX" />
<project name="ROCmValidationSuite" />
<!-- Projects for OpenMP-Extras -->
<project name="aomp" path="openmp-extras/aomp" remote="rocm-devtools" />
<project name="aomp-extras" path="openmp-extras/aomp-extras" remote="rocm-devtools" />
<project name="flang" path="openmp-extras/flang" remote="rocm-devtools" />
<project name="aomp" path="openmp-extras/aomp" />
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
<project name="flang" path="openmp-extras/flang" />
</manifest>

View File

@@ -1,6 +0,0 @@
# 404 Page Not Found
Page could not be found.
Return to [home](./index) or please use the links from the sidebar to find what
you are looking for.

docs/CMakeLists.txt Normal file
View File

@@ -0,0 +1,33 @@
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
include(ROCMSphinxDoc)
rocm_add_sphinx_doc(
  "${CMAKE_CURRENT_SOURCE_DIR}"
  OUTPUT_DIR html
  BUILDER html
)
install(
  DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html"
  DESTINATION "${CMAKE_INSTALL_DOCDIR}")

View File

@@ -1,74 +0,0 @@
# About ROCm Documentation
ROCm documentation is made available under open source [licenses](licensing.md).
Documentation is built using open source toolchains. Contributions to our
documentation are encouraged and welcome. As a contributor, please familiarize
yourself with our documentation toolchain.
## ReadTheDocs
[ReadTheDocs](https://docs.readthedocs.io/en/stable/) is the front end for our
documentation; that is, the tool that serves our HTML-based documentation to
end users.
## Doxygen
[Doxygen](https://www.doxygen.nl/) is the most common inline code documentation
standard. ROCm projects use Doxygen for public API documentation (unless the
upstream project is using a different tool).
## Sphinx
[Sphinx](https://www.sphinx-doc.org/en/master/) is a documentation generator
originally used for Python. It is now widely used in the open source community.
Originally, Sphinx supported RST-based documentation. Markdown support is now
available. ROCm documentation plans to default to markdown for new projects.
Existing projects using RST are under no obligation to convert to markdown. New
projects that believe markdown is not suitable should contact the documentation
team prior to selecting RST.
### MyST
[Markedly Structured Text (MyST)](https://myst-tools.org/docs/spec) is an extended
flavor of Markdown ([CommonMark](https://commonmark.org/)) influenced by reStructuredText (RST) and Sphinx.
It is integrated via [`myst-parser`](https://myst-parser.readthedocs.io/en/latest/).
A cheat sheet that showcases how to use the MyST syntax is available over at [the Jupyter
reference](https://jupyterbook.org/en/stable/reference/cheatsheet.html).
### Sphinx Theme
ROCm is using the
[Sphinx Book Theme](https://sphinx-book-theme.readthedocs.io/en/latest/). This
theme is used by Jupyter books. ROCm documentation applies some customizations,
including a header and footer, on top of the Sphinx Book Theme. A custom
ROCm theme is a future documentation goal.
### Sphinx Design
Sphinx Design is an extension for Sphinx-based websites that adds design
functionality. Please see the documentation
[here](https://sphinx-design.readthedocs.io/en/latest/index.html). ROCm
documentation uses Sphinx Design for grids, cards, and synchronized tabs.
Other features may be used in the future.
### Sphinx External TOC
ROCm uses the
[sphinx-external-toc](https://sphinx-external-toc.readthedocs.io/en/latest/intro.html)
for our navigation. This tool allows a YAML file based left navigation menu. This
tool was selected due to its flexibility that allows scripts to operate on the
YAML file. Please transition to this file for the project's navigation. You can
see the `_toc.yml.in` file in this repository in the docs/sphinx folder for an
example.
### Breathe
Sphinx uses [Breathe](https://www.breathe-doc.org/) to integrate Doxygen
content.
## `rocm-docs-core` pip package
[rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) is an AMD
maintained project that applies customization for our documentation. This
project is the tool most ROCm repositories will use as part of the documentation
build.

View File

@@ -1,4 +1,10 @@
# OpenMP Support in ROCm
<head>
<meta charset="UTF-8">
<meta name="description" content="OpenMP support in ROCm">
<meta name="keywords" content="OpenMP, LLVM, OpenMP toolchain">
</head>
# OpenMP support in ROCm
## Introduction
@@ -9,7 +15,13 @@ Along with host APIs, the OpenMP compilers support offloading code and data onto
GPU devices. This document briefly describes the installation location of the
OpenMP toolchain, example usage of device offloading, and usage of `rocprof`
with OpenMP applications. The GPUs supported are the same as those supported by
this ROCm release. See the list of supported GPUs in {doc}`/release/gpu_os_support`.
this ROCm release. See the list of supported GPUs for {doc}`Linux<rocm-install-on-linux:reference/system-requirements>` and
{doc}`Windows<rocm-install-on-windows:reference/system-requirements>`.
The ROCm OpenMP compiler is implemented using LLVM compiler technology.
The following image illustrates the internal steps taken to translate a user's application into an executable that can offload computation to the AMDGPU. The compilation is a two-pass process: Pass 1 compiles the application to generate the CPU code, and Pass 2 links the CPU code to the AMDGPU device code.
![OpenMP toolchain](../../data/reference/openmp/openmp-toolchain.svg "OpenMP toolchain")
### Installation
@@ -17,17 +29,13 @@ The OpenMP toolchain is automatically installed as part of the standard ROCm
installation and is available under `/opt/rocm-{version}/llvm`. The
sub-directories are:
bin: Compilers (`flang` and `clang`) and other binaries.
* bin: Compilers (`flang` and `clang`) and other binaries.
* examples: The usage section below shows how to compile and run these programs.
* include: Header files.
* lib: Libraries including those required for target offload.
* lib-debug: Debug versions of the above libraries.
- examples: The usage section below shows how to compile and run these programs.
- include: Header files.
- lib: Libraries including those required for target offload.
- lib-debug: Debug versions of the above libraries.
## OpenMP: Usage
## OpenMP: usage
The example programs can be compiled and run by pointing the environment
variable `ROCM_PATH` to the ROCm install directory.
@@ -56,7 +64,7 @@ that are required for target offload from an OpenMP program:
The compiler also accepts the alternative offloading notation:
```bash
-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=<gpu-arch>
-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=<gpu-arch>
```
:::
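To make the workflow concrete, the following is a hedged end-to-end sketch of a small offloaded program together with the kind of compile line described above (the file name and `--offload-arch` value are illustrative):
```cpp
// vecadd.cpp -- illustrative only; build with the ROCm clang, for example:
//   $ROCM_PATH/llvm/bin/clang++ -fopenmp --offload-arch=gfx90a vecadd.cpp -o vecadd
#include <cstdio>
int main()
{
    const int n = 1 << 20;
    double *a = new double[n], *b = new double[n];
    for (int i = 0; i < n; ++i) { a[i] = 1.0; b[i] = 2.0; }
    // The map clauses move data to and from the device around the target region.
    #pragma omp target teams distribute parallel for \
        map(to: b[0:n]) map(tofrom: a[0:n])
    for (int i = 0; i < n; ++i)
        a[i] += b[i];
    printf("a[0] = %f\n", a[0]);  // expect 3.0
    delete[] a;
    delete[] b;
    return 0;
}
```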
@@ -107,10 +115,9 @@ code compiled with AOMP:
options --list-basic and --list-derived. `rocprof` accepts either a text or
an XML file as an input.
For more details on `rocprof`, refer to the ROCm Profiling Tools document on
{doc}`rocprofiler:rocprof`.
For more details on `rocprof`, refer to the {doc}`ROCProfilerV1 User Manual <rocprofiler:rocprofv1>`.
### Using Tracing Options
### Using tracing options
**Prerequisite:** When using the `--sys-trace` option, compile the OpenMP
program with:
@@ -121,10 +128,10 @@ program with:
The following tracing options are widely used to generate useful information:
- **`--hsa-trace`**: This option is used to get a JSON output file with the HSA
* **`--hsa-trace`**: This option is used to get a JSON output file with the HSA
API execution traces and a flat profile in a CSV file.
- **`--sys-trace`**: This allows programmers to trace both HIP and HSA calls.
* **`--sys-trace`**: This allows programmers to trace both HIP and HSA calls.
Since this option results in loading ``libamdhip64.so``, follow the
prerequisite as mentioned above.
@@ -134,38 +141,46 @@ Google Chrome at chrome://tracing/ or [Perfetto](https://perfetto.dev/).
Navigate to Chrome or Perfetto and load the JSON file to see the timeline of the
HSA calls.
For more details on tracing, refer to the ROCm Profiling Tools document on
{doc}`rocprofiler:rocprof`.
For more details on tracing, refer to the {doc}`ROCProfilerV1 User Manual <rocprofiler:rocprofv1>`.
### Environment Variables
### Environment variables
:::{table}
:widths: auto
| Environment Variable | Description |
| Environment Variable | Purpose |
| --------------------------- | ---------------------------- |
| `OMP_NUM_TEAMS` | The implementation chooses the number of teams for kernel launch. The user can change this number for performance tuning using this environment variable, subject to implementation limits. |
| `LIBOMPTARGET_KERNEL_TRACE` | This environment variable is used to print useful statistics for device operations. Setting it to 1 and running the program emits the name of every kernel launched, the number of teams and threads used, and the corresponding register usage. Setting it to 2 additionally emits timing information for kernel launches and data transfer operations between the host and the device. |
| `LIBOMPTARGET_INFO` | This environment variable is used to print informational messages from the device runtime as the program executes. Users can request fine-grain information by setting it to the value of 1 or higher and can set the value of -1 for complete information. |
| `LIBOMPTARGET_DEBUG` | If a debug version of the device library is present, setting this environment variable to 1 and using that library emits further detailed debugging information about data transfer operations and kernel launch. |
| `GPU_MAX_HW_QUEUES` | This environment variable is used to set the number of HSA queues in the OpenMP runtime. |
| `OMP_NUM_TEAMS` | To set the number of teams for kernel launch, which is otherwise chosen by the implementation by default. You can set this number (subject to implementation limits) for performance tuning. |
| `LIBOMPTARGET_KERNEL_TRACE` | To print useful statistics for device operations. Setting it to 1 and running the program emits the name of every kernel launched, the number of teams and threads used, and the corresponding register usage. Setting it to 2 additionally emits timing information for kernel launches and data transfer operations between the host and the device. |
| `LIBOMPTARGET_INFO` | To print informational messages from the device runtime as the program executes. Setting it to a value of 1 or higher, prints fine-grain information and setting it to -1 prints complete information. |
| `LIBOMPTARGET_DEBUG` | To get detailed debugging information about data transfer operations and kernel launch when using a debug version of the device library. Set this environment variable to 1 to get the detailed information from the library. |
| `GPU_MAX_HW_QUEUES` | To set the number of HSA queues in the OpenMP runtime. The HSA queues are created on demand up to the maximum value as supplied here. The queue creation starts with a single initialized queue to avoid unnecessary allocation of resources. The provided value is capped if it exceeds the recommended, device-specific value. |
| `LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES` | To set the threshold size up to which data transfers are initiated asynchronously. The default threshold size is 1*1024*1024 bytes (1MB). |
| `OMPX_FORCE_SYNC_REGIONS` | To force the runtime to execute all operations synchronously, i.e., wait for an operation to complete immediately. This affects data transfers and kernel execution. While it is mainly designed for debugging, it may have a minor positive effect on performance in certain situations. |
:::
## OpenMP: Features
## OpenMP: features
The OpenMP programming model is greatly enhanced with the following new features
implemented in the past releases.
(openmp_usm)=
### Asynchronous Behavior in OpenMP Target Regions
### Asynchronous behavior in OpenMP target regions
* Controlling Asynchronous Behavior
The OpenMP offloading runtime executes in an asynchronous fashion by default, allowing multiple data transfers to start concurrently. However, if the data to be transferred becomes larger than the default threshold of 1MB, the runtime falls back to a synchronous data transfer. The buffers that have been locked already are always executed asynchronously.
You can overrule this default behavior by setting `LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES` and `OMPX_FORCE_SYNC_REGIONS`. See the [Environment Variables](#environment-variables) table for details.
* Multithreaded Offloading on the Same Device
- Multithreaded offloading on the same device
The `libomptarget` plugin for GPU offloading allows creation of separate configurable HSA queues per chiplet, which enables two or more threads to concurrently offload to the same device.
- Parallel memory copy invocations
* Parallel Memory Copy Invocations
Implicit asynchronous execution of single target region enables parallel memory copy invocations.
### Unified Shared Memory
### Unified shared memory
Unified Shared Memory (USM) provides a pointer-based approach to memory
management. To implement USM, fulfill the following system requirements along
@@ -173,14 +188,12 @@ with Xnack capability.
#### Prerequisites
- Linux Kernel versions above 5.14
- Latest KFD driver packaged in ROCm stack
- Xnack, as USM support can only be tested with applications compiled with Xnack
* Linux Kernel versions above 5.14
* Latest KFD driver packaged in ROCm stack
* Xnack, as USM support can only be tested with applications compiled with Xnack
capability
#### Xnack Capability
#### Xnack capability
When enabled, Xnack capability allows GPU threads to access CPU (system) memory,
allocated with OS-allocators, such as `malloc`, `new`, and `mmap`. Xnack must be
@@ -206,15 +219,15 @@ HSA_XNACK=1
When Xnack support is not needed:
- Build the applications to maximize resource utilization using:
* Build the applications to maximize resource utilization using:
```bash
--offload-arch=gfx908:xnack-
```
- At runtime, set the `HSA_XNACK` environment variable to 0.
* At runtime, set the `HSA_XNACK` environment variable to 0.
#### Unified Shared Memory Pragma
#### Unified shared memory pragma
This OpenMP pragma is available on MI200 through `xnack+` support.
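A minimal sketch of a program relying on this pragma (assuming a `xnack+` build and `HSA_XNACK=1` at run time, per the prerequisites above):
```cpp
#include <cstdlib>
// Declare that this translation unit relies on unified shared memory.
#pragma omp requires unified_shared_memory
int main()
{
    const int n = 1024;
    double* b = (double*)malloc(n * sizeof(double));  // plain OS allocation
    for (int i = 0; i < n; ++i) b[i] = 1.0;
    // With USM the target region can dereference the host pointer directly;
    // no map clause is required.
    #pragma omp target teams distribute parallel for
    for (int i = 0; i < n; ++i) b[i] *= 2.0;
    free(b);
    return 0;
}
```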
@@ -268,7 +281,7 @@ to by “b” are in coarse-grain memory during and after the execution of the
target region. This is accomplished in the OpenMP runtime library with calls to
the ROCr runtime to set the pages pointed by “b” as coarse grain.
### OMPT Target Support
### OMPT target support
The OpenMP runtime in ROCm implements a subset of the OMPT device APIs, as
described in the OpenMP specification document. These APIs allow first-party
@@ -293,7 +306,7 @@ The file `veccopy-ompt-target-tracing.c` simulates how a tool initiates device
activity tracing. The file `callbacks.h` shows the callbacks registered and
implemented by the tool.
### Floating Point Atomic Operations
### Floating point atomic operations
The MI200-series GPUs support the generation of hardware floating-point atomics
using the OpenMP atomic pragma. The support includes single- and
@@ -317,8 +330,10 @@ double a = 0.0;
a = a + 1.0;
```
NOTE `AMD_unsafe_fp_atomics` is an alias for `AMD_fast_fp_atomics`, and
:::{note}
`AMD_unsafe_fp_atomics` is an alias for `AMD_fast_fp_atomics`, and
`AMD_safe_fp_atomics` is implemented with a compare-and-swap loop.
:::
To disable the generation of fast floating-point atomic instructions at the file
level, build using the option `-msafe-fp-atomics` or use a hint clause on a
@@ -351,44 +366,36 @@ double b = 0.0;
b = b + 1.0;
```
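A hedged sketch of the hint clause usage implied here, assuming the `AMD_fast_fp_atomics` and `AMD_safe_fp_atomics` constants are exposed through the AOMP `omp.h`:
```cpp
#include <omp.h>  // assumed to declare the AMD_*_fp_atomics hint constants
int main()
{
    double a = 0.0, b = 0.0;
    #pragma omp target map(tofrom: a, b)
    {
        #pragma omp atomic hint(AMD_fast_fp_atomics)  // request the HW fp atomic
        a = a + 1.0;
        #pragma omp atomic hint(AMD_safe_fp_atomics)  // force the CAS-loop form
        b = b + 1.0;
    }
    return 0;
}
```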
### Address Sanitizer (ASan) Tool
### AddressSanitizer tool
Address Sanitizer is a memory error detector tool utilized by applications to
AddressSanitizer (ASan) is a memory error detector tool utilized by applications to
detect various errors ranging from spatial issues such as out-of-bound access to
temporal issues such as use-after-free. The AOMP compiler supports ASan for AMD
GPUs with applications written in both HIP and OpenMP.
**Features Supported on Host Platform (Target x86_64):**
**Features supported on host platform (Target x86_64):**
- Use-after-free
* Use-after-free
* Buffer overflows
* Heap buffer overflow
* Stack buffer overflow
* Global buffer overflow
* Use-after-return
* Use-after-scope
* Initialization order bugs
- Buffer overflows
**Features supported on AMDGPU platform (`amdgcn-amd-amdhsa`):**
- Heap buffer overflow
* Heap buffer overflow
* Global buffer overflow
- Stack buffer overflow
- Global buffer overflow
- Use-after-return
- Use-after-scope
- Initialization order bugs
**Features Supported on AMDGPU Platform (`amdgcn-amd-amdhsa`):**
- Heap buffer overflow
- Global buffer overflow
**Software (Kernel/OS) Requirements:** Unified Shared Memory support with Xnack
**Software (kernel/OS) requirements:** Unified Shared Memory support with Xnack
capability. See the section on [Unified Shared Memory](#unified-shared-memory)
for prerequisites and details on Xnack.
**Example:**
- Heap buffer overflow
* Heap buffer overflow
```bash
void main() {
@@ -408,7 +415,7 @@ void main() {
See the complete sample code for heap buffer overflow
[here](https://github.com/ROCm-Developer-Tools/aomp/blob/aomp-dev/examples/tools/asan/heap_buffer_overflow/openmp/vecadd-HBO.cpp).
- Global buffer overflow
* Global buffer overflow
```bash
#pragma omp declare target
@@ -433,46 +440,44 @@ for(int i=0; i<N; i++){
See the complete sample code for global buffer overflow
[here](https://github.com/ROCm-Developer-Tools/aomp/blob/aomp-dev/examples/tools/asan/global_buffer_overflow/openmp/vecadd-GBO.cpp).
### Clang Compiler Option for Kernel Optimization
### Clang compiler option for kernel optimization
You can use the clang compiler option `-fopenmp-target-fast` for kernel optimization if certain constraints implied by its component options are satisfied. `-fopenmp-target-fast` enables the following options:
- `-fopenmp-target-ignore-env-vars`: It enables code generation of specialized kernels including No-loop and Cross-team reductions.
* `-fopenmp-target-ignore-env-vars`: It enables code generation of specialized kernels including no-loop and Cross-team reductions.
- `-fopenmp-assume-no-thread-state`: It enables the compiler to assume that no thread in a parallel region modifies an Internal Control Variable (`ICV`), thus potentially reducing the device runtime code execution.
* `-fopenmp-assume-no-thread-state`: It enables the compiler to assume that no thread in a parallel region modifies an Internal Control Variable (`ICV`), thus potentially reducing the device runtime code execution.
- `-fopenmp-assume-no-nested-parallelism`: It enables the compiler to assume that no thread in a parallel region encounters a parallel region, thus potentially reducing the device runtime code execution.
* `-fopenmp-assume-no-nested-parallelism`: It enables the compiler to assume that no thread in a parallel region encounters a parallel region, thus potentially reducing the device runtime code execution.
- `-O3` if no `-O*` is specified by the user.
* `-O3` if no `-O*` is specified by the user.
### Specialized Kernels
### Specialized kernels
Clang will attempt to generate specialized kernels based on compiler options and OpenMP constructs. The following specialized kernels are supported:
- No-Loop
- Big-Jump-Loop
- Cross-Team (Xteam) Reductions
* No-loop
* Big-jump-loop
* Cross-team reductions
To enable the generation of specialized kernels, follow these guidelines:
- Do not specify teams, threads, and schedule-related environment variables. The `num_teams` clause in an OpenMP target construct acts as an override and prevents the generation of the No-Loop kernel. If the specification of `num_teams` clause is a user requirement then clang tries to generate the Big-Jump-Loop kernel instead of the No-Loop kernel.
* Do not specify teams, threads, and schedule-related environment variables. The `num_teams` clause in an OpenMP target construct acts as an override and prevents the generation of the no-loop kernel. If the specification of `num_teams` clause is a user requirement then clang tries to generate the big-jump-loop kernel instead of the no-loop kernel.
- Assert the absence of the teams, threads, and schedule-related environment variables by adding the command-line option `-fopenmp-target-ignore-env-vars`.
* Assert the absence of the teams, threads, and schedule-related environment variables by adding the command-line option `-fopenmp-target-ignore-env-vars`.
- To automatically enable the specialized kernel generation, use `-Ofast` or `-fopenmp-target-fast` for compilation.
* To automatically enable the specialized kernel generation, use `-Ofast` or `-fopenmp-target-fast` for compilation.
- To disable specialized kernel generation, use `-fno-openmp-target-ignore-env-vars`.
* To disable specialized kernel generation, use `-fno-openmp-target-ignore-env-vars`.
#### No-Loop Kernel Generation
#### No-loop kernel generation
The No-loop kernel generation feature optimizes the compiler performance by generating a specialized kernel for certain OpenMP target constructs such as target teams distribute parallel for. The specialized kernel generation feature assumes every thread executes a single iteration of the user loop, which leads the runtime to launch a total number of GPU threads equal to or greater than the iteration space size of the target region loop. This allows the compiler to generate code for the loop body without an enclosing loop, resulting in reduced control-flow complexity and potentially better performance.
The no-loop kernel generation feature optimizes the compiler performance by generating a specialized kernel for certain OpenMP target constructs such as target teams distribute parallel for. The specialized kernel generation feature assumes every thread executes a single iteration of the user loop, which leads the runtime to launch a total number of GPU threads equal to or greater than the iteration space size of the target region loop. This allows the compiler to generate code for the loop body without an enclosing loop, resulting in reduced control-flow complexity and potentially better performance.
#### Big-Jump-Loop Kernel Generation
#### Big-jump-loop kernel generation
A No-Loop kernel is not generated if the OpenMP teams construct uses a `num_teams` clause. Instead, the compiler attempts to generate a different specialized kernel called the Big-Jump-Loop kernel. The compiler launches the kernel with a grid size determined by the number of teams specified by the OpenMP `num_teams` clause and the `blocksize` chosen either by the compiler or specified by the corresponding OpenMP clause.
A no-loop kernel is not generated if the OpenMP teams construct uses a `num_teams` clause. Instead, the compiler attempts to generate a different specialized kernel called the big-jump-loop kernel. The compiler launches the kernel with a grid size determined by the number of teams specified by the OpenMP `num_teams` clause and the `blocksize` chosen either by the compiler or specified by the corresponding OpenMP clause.
#### Xteam Optimized Reduction Kernel Generation
#### Cross-team optimized reduction kernel generation
If the OpenMP construct has a reduction clause, the compiler attempts to generate optimized code by utilizing efficient Xteam communication. New APIs for Xteam reduction are implemented in the device runtime and are automatically generated by clang.
If the OpenMP construct has a reduction clause, the compiler attempts to generate optimized code by utilizing efficient cross-team communication. New APIs for cross-team reduction are implemented in the device runtime and are automatically generated by clang.
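As an illustration, the following reduction is a candidate for cross-team optimized code generation when compiled with `-fopenmp-target-fast` (the file name and architecture are illustrative):
```cpp
// dot.cpp -- candidate for cross-team (Xteam) reduction codegen, for example:
//   $ROCM_PATH/llvm/bin/clang++ -fopenmp --offload-arch=gfx90a \
//       -fopenmp-target-fast dot.cpp -o dot
#include <cstdio>
int main()
{
    const int n = 1 << 22;
    double sum = 0.0;
    #pragma omp target teams distribute parallel for reduction(+ : sum)
    for (int i = 0; i < n; ++i)
        sum += 1.0;
    printf("sum = %.0f\n", sum);  // expect 4194304
    return 0;
}
```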

docs/about/license.md Normal file
View File

@@ -0,0 +1,13 @@
# License
:::{note}
This license applies to the [ROCm repository](https://github.com/RadeonOpenCompute/ROCm) that
primarily contains documentation. For other licensing information, refer to the
[Licensing Terms page](./licensing).
:::
```{include} ../../LICENSE
```
```{include} ./licensing.md
```

docs/about/licensing.md Normal file
View File

@@ -0,0 +1,133 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="ROCm licensing terms">
<meta name="keywords" content="license, licensing terms">
</head>
# ROCm licensing terms
ROCm™ is released by Advanced Micro Devices, Inc. and is licensed per component separately.
The following table is a list of ROCm components with links to their respective license
terms. These components may include third party components subject to
additional licenses. Please review individual repositories for more information.
The table shows the ROCm components, the name of each license, and a link to the license terms.
The table is ordered to follow the ROCm manifest file.
<!-- spellcheck-disable -->
| Component | License |
|:---------------------|:-------------------------|
| [AMDMIGraphX](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/) | [MIT](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/blob/develop/LICENSE) |
| [HIPCC](https://github.com/ROCm-Developer-Tools/HIPCC/blob/develop/LICENSE.txt) | [MIT](https://github.com/ROCm-Developer-Tools/HIPCC/blob/develop/LICENSE.txt) |
| [HIPIFY](https://github.com/ROCm-Developer-Tools/HIPIFY/) | [MIT](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/amd-staging/LICENSE.txt) |
| [HIP](https://github.com/ROCm-Developer-Tools/HIP/) | [MIT](https://github.com/ROCm-Developer-Tools/HIP/blob/develop/LICENSE.txt) |
| [MIOpenGEMM](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/) | [MIT](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/blob/master/LICENSE.txt) |
| [MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen/) | [MIT](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/master/LICENSE.txt) |
| [MIVisionX](https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/) | [MIT](https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/blob/master/LICENSE.txt) |
| [RCP](https://github.com/GPUOpen-Tools/radeon_compute_profiler/) | [MIT](https://github.com/GPUOpen-Tools/radeon_compute_profiler/blob/master/LICENSE) |
| [ROCK-Kernel-Driver](https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/master/COPYING) |
| [ROCR-Runtime](https://github.com/RadeonOpenCompute/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/master/LICENSE.txt) |
| [ROCT-Thunk-Interface](https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/) | [MIT](https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
| [ROCclr](https://github.com/ROCm-Developer-Tools/ROCclr/) | [MIT](https://github.com/ROCm-Developer-Tools/ROCclr/blob/develop/LICENSE.txt) |
| [ROCdbgapi](https://github.com/ROCm-Developer-Tools/ROCdbgapi/) | [MIT](https://github.com/ROCm-Developer-Tools/ROCdbgapi/blob/amd-master/LICENSE.txt) |
| [ROCgdb](https://github.com/ROCm-Developer-Tools/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm-Developer-Tools/ROCgdb/blob/amd-master/COPYING) |
| [ROCm-CompilerSupport](https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/blob/amd-stg-open/LICENSE.txt) |
| [ROCm-Device-Libs](https://github.com/RadeonOpenCompute/ROCm-Device-Libs/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/amd-stg-open/LICENSE.TXT) |
| [ROCm-OpenCL-Runtime/api/opencl/khronos/icd](https://github.com/KhronosGroup/OpenCL-ICD-Loader/) | [Apache 2.0](https://github.com/KhronosGroup/OpenCL-ICD-Loader/blob/main/LICENSE) |
| [ROCm-OpenCL-Runtime](https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/) | [MIT](https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/blob/develop/LICENSE.txt) |
| [ROCmValidationSuite](https://github.com/ROCm-Developer-Tools/ROCmValidationSuite/) | [MIT](https://github.com/ROCm-Developer-Tools/ROCmValidationSuite/blob/master/LICENSE) |
| [Tensile](https://github.com/ROCmSoftwarePlatform/Tensile/) | [MIT](https://github.com/ROCmSoftwarePlatform/Tensile/blob/develop/LICENSE.md) |
| [aomp-extras](https://github.com/ROCm-Developer-Tools/aomp-extras/) | [MIT](https://github.com/ROCm-Developer-Tools/aomp-extras/blob/aomp-dev/LICENSE) |
| [aomp](https://github.com/ROCm-Developer-Tools/aomp/) | [Apache 2.0](https://github.com/ROCm-Developer-Tools/aomp/blob/aomp-dev/LICENSE) |
| [atmi](https://github.com/RadeonOpenCompute/atmi/) | [MIT](https://github.com/RadeonOpenCompute/atmi/blob/master/LICENSE.txt) |
| [clang-ocl](https://github.com/RadeonOpenCompute/clang-ocl/) | [MIT](https://github.com/RadeonOpenCompute/clang-ocl/blob/master/LICENSE) |
| [flang](https://github.com/ROCm-Developer-Tools/flang/) | [Apache 2.0](https://github.com/ROCm-Developer-Tools/flang/blob/master/LICENSE.txt) |
| [half](https://github.com/ROCmSoftwarePlatform/half/) | [MIT](https://github.com/ROCmSoftwarePlatform/half/blob/master/LICENSE.txt) |
| [hipBLAS](https://github.com/ROCmSoftwarePlatform/hipBLAS/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipBLAS/blob/develop/LICENSE.md) |
| [hipCUB](https://github.com/ROCmSoftwarePlatform/hipCUB/) | [Custom](https://github.com/ROCmSoftwarePlatform/hipCUB/blob/develop/LICENSE.txt) |
| [hipFFT](https://github.com/ROCmSoftwarePlatform/hipFFT/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipFFT/blob/develop/LICENSE.md) |
| [hipSOLVER](https://github.com/ROCmSoftwarePlatform/hipSOLVER/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipSOLVER/blob/develop/LICENSE.md) |
| [hipSPARSELt](https://github.com/ROCmSoftwarePlatform/hipSPARSELt/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipSPARSELt/blob/develop/LICENSE.md) |
| [hipSPARSE](https://github.com/ROCmSoftwarePlatform/hipSPARSE/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/LICENSE.md) |
| [hipTensor](https://github.com/ROCmSoftwarePlatform/hipTensor) | [MIT](https://github.com/ROCmSoftwarePlatform/hipTensor/blob/develop/LICENSE) |
| [hipamd](https://github.com/ROCm-Developer-Tools/hipamd/) | [MIT](https://github.com/ROCm-Developer-Tools/hipamd/blob/develop/LICENSE.txt) |
| [hipfort](https://github.com/ROCmSoftwarePlatform/hipfort/) | [MIT](https://github.com/ROCmSoftwarePlatform/hipfort/blob/master/LICENSE) |
| [llvm-project](https://github.com/ROCm-Developer-Tools/llvm-project/) | [Apache](https://github.com/ROCm-Developer-Tools/llvm-project/blob/main/LICENSE.TXT) |
| [rccl](https://github.com/ROCmSoftwarePlatform/rccl/) | [Custom](https://github.com/ROCmSoftwarePlatform/rccl/blob/develop/LICENSE.txt) |
| [rdc](https://github.com/RadeonOpenCompute/rdc/) | [MIT](https://github.com/RadeonOpenCompute/rdc/blob/master/LICENSE) |
| [rocALUTION](https://github.com/ROCmSoftwarePlatform/rocALUTION/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocALUTION/blob/develop/LICENSE.md) |
| [rocBLAS](https://github.com/ROCmSoftwarePlatform/rocBLAS/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/LICENSE.md) |
| [rocFFT](https://github.com/ROCmSoftwarePlatform/rocFFT/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocFFT/blob/develop/LICENSE.md) |
| [rocPRIM](https://github.com/ROCmSoftwarePlatform/rocPRIM/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocPRIM/blob/develop/LICENSE.txt) |
| [rocRAND](https://github.com/ROCmSoftwarePlatform/rocRAND/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocRAND/blob/develop/LICENSE.txt) |
| [rocSOLVER](https://github.com/ROCmSoftwarePlatform/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCmSoftwarePlatform/rocSOLVER/blob/develop/LICENSE.md) |
| [rocSPARSE](https://github.com/ROCmSoftwarePlatform/rocSPARSE/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocSPARSE/blob/develop/LICENSE.md) |
| [rocThrust](https://github.com/ROCmSoftwarePlatform/rocThrust/) | [Apache 2.0](https://github.com/ROCmSoftwarePlatform/rocThrust/blob/develop/LICENSE) |
| [rocWMMA](https://github.com/ROCmSoftwarePlatform/rocWMMA/) | [MIT](https://github.com/ROCmSoftwarePlatform/rocWMMA/blob/develop/LICENSE.md) |
| [rocm-cmake](https://github.com/RadeonOpenCompute/rocm-cmake/) | [MIT](https://github.com/RadeonOpenCompute/rocm-cmake/blob/develop/LICENSE) |
| [rocm_bandwidth_test](https://github.com/RadeonOpenCompute/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/rocm_bandwidth_test/blob/master/LICENSE.txt) |
| [rocm_smi_lib](https://github.com/RadeonOpenCompute/rocm_smi_lib/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/License.txt) |
| [rocminfo](https://github.com/RadeonOpenCompute/rocminfo/) | [The University of Illinois/NCSA](https://github.com/RadeonOpenCompute/rocminfo/blob/master/License.txt) |
| [rocprofiler](https://github.com/ROCm-Developer-Tools/rocprofiler/) | [MIT](https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/LICENSE) |
| [rocr_debug_agent](https://github.com/ROCm-Developer-Tools/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm-Developer-Tools/rocr_debug_agent/blob/master/LICENSE.txt) |
| [roctracer](https://github.com/ROCm-Developer-Tools/roctracer/) | [MIT](https://github.com/ROCm-Developer-Tools/roctracer/blob/amd-master/LICENSE) |
| rocm-llvm-alt | [AMD Proprietary License](https://www.amd.com/en/support/amd-software-eula) |
Open sourced ROCm components are released via public GitHub
repositories, packages on https://repo.radeon.com and other distribution channels.
Proprietary products are only available on https://repo.radeon.com. Currently, only
one ROCm component, rocm-llvm-alt, is governed by a proprietary license.
Proprietary components are organized in a proprietary subdirectory in the package
repositories to distinguish from open sourced packages.
The additional terms and conditions below apply to your use of ROCm technical
documentation.
©2023 Advanced Micro Devices, Inc. All rights reserved.
The information presented in this document is for informational purposes only
and may contain technical inaccuracies, omissions, and typographical errors. The
information contained herein is subject to change and may be rendered inaccurate
for many reasons, including but not limited to product and roadmap changes,
component and motherboard version changes, new model and/or product releases,
product differences between differing manufacturers, software changes, BIOS
flashes, firmware upgrades, or the like. Any computer system has risks of
security vulnerabilities that cannot be completely prevented or mitigated. AMD
assumes no obligation to update or otherwise correct or revise this information.
However, AMD reserves the right to revise this information and to make changes
from time to time to the content hereof without obligation of AMD to notify any
person of such revisions or changes.
THIS INFORMATION IS PROVIDED “AS IS.” AMD MAKES NO REPRESENTATIONS OR WARRANTIES
WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY
INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD
SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT,
MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE
LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER
CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN,
EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
AMD, the AMD Arrow logo, ROCm, and combinations thereof are trademarks of
Advanced Micro Devices, Inc. Other product names used in this publication are
for identification purposes only and may be trademarks of their respective
companies.
## Package licensing
:::{attention}
AQL Profiler and AOCC CPU optimization are both provided in binary form, each
subject to the license agreement enclosed in the directory for the binary and is
available here: `/opt/rocm/share/doc/rocm-llvm-alt/EULA`. By using, installing,
copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
the terms and conditions of this license agreement. If you do not agree to the
terms of this agreement, do not install, copy or use the AQL Profiler and/or the
AOCC CPU Optimizations.
:::
For the rest of the ROCm packages, you can find the licensing information at the
following location: `/opt/rocm/share/doc/<component-name>/`
For example, you can fetch the licensing information of the `amd_comgr`
component (Code Object Manager) from the `amd_comgr` folder. A file named
`LICENSE.txt` contains the license details at:
`/opt/rocm-5.4.3/share/doc/amd_comgr/LICENSE.txt`

View File

@@ -0,0 +1,157 @@
.. meta::
:description: How ROCm uses PCIe atomics
:keywords: PCIe, PCIe atomics, atomics, BAR memory, AMD, ROCm
*****************************************************************************
How ROCm uses PCIe atomics
*****************************************************************************
ROCm PCIe feature and overview of BAR memory
================================================================
ROCm is an extension of the HSA platform architecture, so it shares the queuing model, memory model,
signaling, and synchronization protocols. Platform atomics are integral to performing queuing and
signaling memory operations where there may be multiple writers across CPU and GPU agents.
The full list of HSA system architecture platform requirements is available here:
`HSA Sys Arch Features <http://hsafoundation.com/wp-content/uploads/2021/02/HSA-SysArch-1.2.pdf>`_.
AMD ROCm software uses the Peripheral Component Interconnect Express 3.0 (PCIe 3.0)
features for atomic read-modify-write transactions, which extend inter-processor
synchronization mechanisms to I/O and support the defined set of HSA capabilities needed
for queuing and signaling memory operations.
The new PCIe atomic operations operate as completers for ``CAS`` (Compare and Swap),
``FetchADD``, and ``SWAP`` atomics. The atomic operations are initiated by an I/O device,
support 32-bit, 64-bit, and 128-bit operands, and require the target address to be
naturally aligned to the operation size.
ROCm uses platform atomics in the following ways:
* Update the HSA queue's read_dispatch_id: a 64-bit atomic add used by the command
processor on the GPU agent to update the packet ID it processed.
* Update the HSA queue's write_dispatch_id: a 64-bit atomic add used by the CPU and GPU
agents to support multi-writer queue insertions.
* Update HSA signals: 64-bit atomic operations are used for CPU and GPU synchronization.
The PCIe 3.0 atomic operations feature allows atomic transactions to be requested by,
routed through, and completed by PCIe components. Routing and completion do not require
software support. Component support for each is detectable via the Device Capabilities 2
(DevCap2) register. Upstream bridges need to have atomic operations routing enabled, or
the atomic operations will fail even though the PCIe endpoint and PCIe I/O devices have
the capability to perform atomic operations.
To route atomic operations between two or more Root Ports, each associated Root Port
must indicate that capability via the atomic operations routing supported bit in the
DevCap2 register. If your system has a PCI Express switch, it needs to support atomic
operations routing. Atomic operations requests are permitted only if a component's
``DEVCTL2.ATOMICOP_REQUESTER_ENABLE`` field is set. These requests can only be serviced
if the upstream components support atomic operation completion and/or routing to a
component which does. If the atomic operations routing supported bit is 1, routing is
supported; if it is 0, routing is not supported.
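If you want to check these bits on a running Linux system, the following is a minimal
sketch (not part of the specification) that scans ``lspci -vvv`` output for the
``AtomicOpsCap`` and ``AtomicOpsCtl`` decode lines. It assumes the ``pciutils`` package
is installed and typically needs root privileges to see the full capability dump:

.. code-block:: python

    # Hypothetical helper: report per-device PCIe AtomicOps capability and
    # control bits by scanning `lspci -vvv` output. Assumes pciutils is
    # installed; run as root to see the full extended capability decode.
    import subprocess

    out = subprocess.run(["lspci", "-vvv"], capture_output=True, text=True).stdout

    device = None
    for line in out.splitlines():
        if line and not line[0].isspace():
            device = line  # device header, e.g. "11:00.0 Display controller: ..."
        elif "AtomicOpsCap" in line or "AtomicOpsCtl" in line:
            print(device)
            print("   ", line.strip())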
An atomic operation is a non-posted transaction supporting 32-bit and 64-bit address
formats; there must be a response Completion containing the result of the operation.
Errors associated with the operation (an uncorrectable error accessing the target
location or carrying out the atomic operation) are signaled to the requester by setting
the Completion Status field in the completion descriptor to Completer Abort (CA) or
Unsupported Request (UR).
To understand more about how PCIe atomic operations work, see:

* `PCIe atomics <https://pcisig.com/specifications/pciexpress/specifications/ECN_Atomic_Ops_080417.pdf>`_
* `Linux Kernel Patch to pci_enable_atomic_request <https://patchwork.kernel.org/project/linux-pci/patch/1443110390-4080-1-git-send-email-jay@jcornwall.me/>`_
There are also a number of papers which talk about these new capabilities:
* `Atomic Read Modify Write Primitives by Intel <https://www.intel.es/content/dam/doc/white-paper/atomic-read-modify-write-primitives-i-o-devices-paper.pdf>`_
* `PCI express 3 Accelerator White paper by Intel <https://www.intel.sg/content/dam/doc/white-paper/pci-express3-accelerator-white-paper.pdf>`_
* `Intel PCIe Generation 3 Hotchips Paper <https://www.hotchips.org/wp-content/uploads/hc_archives/hc21/1_sun/HC21.23.1.SystemInterconnectTutorial-Epub/HC21.23.131.Ajanovic-Intel-PCIeGen3.pdf>`_
* `PCIe Generation 4 Base Specification includes atomic operations <https://astralvx.com/storage/2020/11/PCI_Express_Base_4.0_Rev0.3_February19-2014.pdf>`_
Other I/O devices with PCIe atomics support:
* `Mellanox ConnectX-5 InfiniBand Card <http://www.mellanox.com/related-docs/prod_adapter_cards/PB_ConnectX-5_VPI_Card.pdf>`_
* `Cray Aries Interconnect <http://www.hoti.org/hoti20/slides/Bob_Alverson.pdf>`_
* `Xilinx PCIe Ultrascale White paper <https://docs.xilinx.com/v/u/8OZSA2V1b1LLU2rRCDVGQw>`_
* `Xilinx 7 Series Devices <https://docs.xilinx.com/v/u/1nfXeFNnGpA0ywyykvWHWQ>`_
Future bus technologies with richer I/O atomic operation support:
* GenZ
New PCIe endpoints with support, beyond AMD Ryzen and EPYC CPUs and Intel Haswell or
newer CPUs with PCIe Generation 3.0 support:
* `Mellanox Bluefield SOC <https://docs.nvidia.com/networking/display/BlueFieldSWv25111213/BlueField+Software+Overview>`_
* `Cavium Thunder X2 <https://en.wikichip.org/wiki/cavium/thunderx2>`_
In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU
originates two writes to two different targets:
* Write to another GPU memory
* Write to system memory to indicate transfer complete
They are routed to different ends of the computer, but we want to make sure the write to
system memory indicating transfer completion occurs AFTER the P2P write to GPU memory
has completed.
BAR memory overview
----------------------------------------------------------------------------------------------------
On a Xeon E5-based system, you can turn on above-4GB PCIe addressing in the BIOS; if so,
you need to set the memory-mapped input/output (MMIO) base address (MMIOH base) and range
(MMIO high size) in the BIOS. On a Supermicro system, set the following in the system
BIOS:

* Advanced -> PCIe/PCI/PnP Configuration -> Above 4G Decoding = Enabled
* Advanced -> PCIe/PCI/PnP Configuration -> MMIOH Base = 512G
* Advanced -> PCIe/PCI/PnP Configuration -> MMIO High Size = 256G
When we support the large BAR capability, there is a large BAR VBIOS, which also disables
the I/O BAR. GFX9 and Vega10 have a physical address of up to 44 bits and a virtual
address of up to 48 bits.
* BAR0-1 registers: 64-bit, prefetchable, GPU memory. 8GB or 16GB depending on Vega10
SKU. Must be placed < 2^44 to support P2P access from other Vega10 GPUs.
* BAR2-3 registers: 64-bit, prefetchable, doorbell. Must be placed < 2^44 to support P2P
access from other Vega10 GPUs.
* BAR4 register: optional, not a boot device.
* BAR5 register: 32-bit, non-prefetchable, MMIO. Must be placed < 4GB.
Here is how the base address registers (BARs) work on GFX8 GPUs with a 40-bit physical address limit::
11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO
Series] (rev c1)
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0b35
Flags: bus master, fast devsel, latency 0, IRQ 119
Memory at bf40000000 (64-bit, prefetchable) [size=256M]
Memory at bf50000000 (64-bit, prefetchable) [size=2M]
I/O ports at 3000 [size=256]
Memory at c7400000 (32-bit, non-prefetchable) [size=256K]
Expansion ROM at c7440000 [disabled] [size=128K]
Legend:

1. GPU Frame Buffer BAR -- In this example it happens to be 256M, but typically this will
be the size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow
peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPUs), the BAR has to be
placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs.
2. Doorbell BAR -- The size of this BAR is typically < 10MB (currently fixed at 2MB) for
this generation of GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access
from other current-generation AMD GPUs.
3. IO BAR -- This is for legacy VGA and boot device support; since the GPUs in this
project are not VGA devices (headless), this is not a concern even if the SBIOS does not
set it up.
4. MMIO BAR -- This is required for the AMD driver software to access the configuration
registers. Since the remainder of the available BAR space is only 1 DWORD (32-bit), this
is placed < 4GB. It is fixed at 256KB.
5. Expansion ROM -- This is required for the AMD driver software to access the GPU video
BIOS. It is currently fixed at 128KB.
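As a rough illustration (not from the original text), BAR placement and sizes can also be
inspected without ``lspci`` by reading a device's ``resource`` file in Linux sysfs; the
device address below is a placeholder:

.. code-block:: python

    # Hypothetical sketch: print the BAR regions of a PCI device from sysfs.
    # Each line of the `resource` file holds "start end flags" in hex; unused
    # regions read as zeros. Replace the example address with your GPU's.
    from pathlib import Path

    bdf = "0000:11:00.0"  # placeholder bus:device.function
    lines = Path(f"/sys/bus/pci/devices/{bdf}/resource").read_text().splitlines()
    for region, line in enumerate(lines):
        start, end, flags = (int(v, 16) for v in line.split())
        if end:
            size_kib = (end - start + 1) // 1024
            print(f"region {region}: start=0x{start:x} size={size_kib} KiB flags=0x{flags:x}")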
For more information, you can review
`Overview of Changes to PCI Express 3.0 <https://www.mindshare.com/files/resources/PCIe%203-0.pdf>`_.

View File

@@ -1,36 +1,37 @@
# Inference Optimization with MIGraphX
<head>
<meta charset="UTF-8">
<meta name="description" content="Inference optimization with MIGraphX">
<meta name="keywords" content="Inference optimization, MIGraphX, deep-learning, MIGraphX
installation, AMD, ROCm">
</head>
The following sections cover inferencing and introduces MIGraphX.
# Inference optimization with MIGraphX
The following sections cover inferencing and introduce [MIGraphX](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/).
## Inference
The inference is where capabilities learned during Deep Learning training are put to work. It refers to using a fully trained neural network to make conclusions (predictions) on unseen data that the model has never interacted with before. Deep Learning inferencing is achieved by feeding new data, such as new images, to the network, giving the Deep Neural Network a chance to classify the image.
Inference is where capabilities learned during deep-learning training are put to work. It refers to using a fully trained neural network to make predictions on unseen data that the model has never interacted with before. Deep-learning inferencing is achieved by feeding new data, such as new images, to the network, giving the deep neural network a chance to classify the image.
Taking our previous example of MNIST, the DNN can be fed new images of handwritten digit images, allowing the neural network to classify digits. A fully trained DNN should make accurate predictions about what an image represents, and inference cannot happen without training.
## MIGraphX Introduction
## MIGraphX introduction
MIGraphX is a graph compiler focused on accelerating the Machine Learning inference that can target AMD GPUs and CPUs. MIGraphX accelerates the Machine Learning models by leveraging several graph-level transformations and optimizations. These optimizations include:
MIGraphX is a graph compiler focused on accelerating machine-learning inference that can target AMD GPUs and CPUs. MIGraphX accelerates machine-learning models by leveraging several graph-level transformations and optimizations. These optimizations include:
- Operator fusion
- Arithmetic simplifications
- Dead-code elimination
- Common subexpression elimination (CSE)
- Constant propagation
* Operator fusion
* Arithmetic simplifications
* Dead-code elimination
* Common subexpression elimination (CSE)
* Constant propagation
After applying all these transformations, MIGraphX emits code for the AMD GPU by calling MIOpen or rocBLAS, or by creating HIP kernels for a particular operator. MIGraphX can also target CPUs using the DNNL or ZenDNN libraries.
MIGraphX provides easy-to-use APIs in C++ and Python to import machine models in ONNX or TensorFlow. Users can compile, save, load, and run these models using MIGraphX's C++ and Python APIs. Internally, MIGraphX parses ONNX or TensorFlow models into internal graph representation where each operator in the model gets mapped to an operator within MIGraphX. Each of these operators defines various attributes such as:
MIGraphX provides easy-to-use APIs in C++ and Python to import machine-learning models in ONNX or TensorFlow format. Users can compile, save, load, and run these models using the MIGraphX C++ and Python APIs. Internally, MIGraphX parses ONNX or TensorFlow models into an internal graph representation where each operator in the model gets mapped to an operator within MIGraphX. Each of these operators defines various attributes, such as:
- Number of arguments
- Type of arguments
- Shape of arguments
* Number of arguments
* Type of arguments
* Shape of arguments
After optimization passes, all these operators get mapped to different kernels on GPUs or CPUs.
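As a rough sketch of that flow, the following Python snippet parses an ONNX model, compiles it for the GPU target, and runs it with generated inputs. The API names follow recent MIGraphX releases and `model.onnx` is a placeholder, so verify both against your installed version:

```py
# Sketch of the MIGraphX Python API; "model.onnx" is a placeholder file name.
import migraphx

prog = migraphx.parse_onnx("model.onnx")     # import the ONNX model
prog.compile(migraphx.get_target("gpu"))     # run optimization passes, emit GPU code

# Generate dummy arguments matching each input parameter's shape.
params = {}
for name, shape in prog.get_parameter_shapes().items():
    params[name] = migraphx.generate_argument(shape)

print(prog.run(params))                      # execute the compiled program
```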
@@ -40,7 +41,7 @@ After importing a model into MIGraphX, the model is represented as `migraphx::pr
There are three options to get started with MIGraphX installation. MIGraphX depends on ROCm libraries; the following assumes the machine already has ROCm installed.
### Option 1: Installing Binaries
### Option 1: installing binaries
To install MIGraphX on Debian-based systems like Ubuntu, use the following command:
@@ -50,21 +51,21 @@ sudo apt update && sudo apt install -y migraphx
The header files and libraries are installed under `/opt/rocm-\<version\>`, where \<version\> is the ROCm version.
### Option 2: Building from Source
### Option 2: building from source
There are two ways to build the MIGraphX sources.
- [Use the ROCm build tool](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-the-rocm-build-tool-rbuild) - This approach uses [rbuild](https://github.com/RadeonOpenCompute/rbuild) to install the prerequisites and build the libraries with just one command.
* [Use the ROCm build tool](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-the-rocm-build-tool-rbuild) - This approach uses [rbuild](https://github.com/RadeonOpenCompute/rbuild) to install the prerequisites and build the libraries with just one command.
or
- [Use CMake](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-cmake-to-build-migraphx) - This approach uses a script to install the prerequisites, then uses CMake to build the source.
* [Use CMake](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-cmake-to-build-migraphx) - This approach uses a script to install the prerequisites, then uses CMake to build the source.
For detailed steps on building from source and installing dependencies, refer to the following `README` file:
[https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#building-from-source](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#building-from-source)
### Option 3: Use Docker
### Option 3: use Docker
To use Docker, follow these steps:
@@ -88,7 +89,7 @@ To use Docker, follow these steps:
The Docker image contains all the prerequisites required for the installation, so users can go to the folder `/code/AMDMIGraphX` and follow the steps mentioned in [Option 2: Building from Source](#option-2-building-from-source).
## MIGraphX Example
## MIGraphX example
MIGraphX provides both C++ and Python APIs. The following sections show examples of both using the Inception v3 model. To walk through the examples, fetch the Inception v3 ONNX model by running the following:
@@ -311,7 +312,7 @@ MIGraphX introduces a feature, known as YModel, that stores the kernel config pa
The YModel feature is available starting from ROCm 5.4.1 and UIF 1.1.
#### YModel Example
#### YModel example
Through the `migraphx-driver` functionality, you can generate `.mxr` files with tuning information stored inside them by passing additional `--binary --output model.mxr` flags to `migraphx-driver` along with the rest of the necessary flags.
@@ -327,12 +328,6 @@ To run generated `.mxr` files through `migraphx-driver`, use the following:
./path/to/migraphx-driver run --migraphx resnet50.mxr --enable-offload-copy
```
Alternatively, you can use MIGraphX's C++ or Python API to generate `.mxr` file. Refer to {numref}`image018` for an example.
Alternatively, you can use the MIGraphX C++ or Python API to generate `.mxr` files.
```{figure} ../../data/understand/deep_learning/image.018.png
:name: image018
---
align: center
---
Generating a `.mxr` File
```
![Generating an MXR file](../data/conceptual/image018.png "Generating an MXR file")
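As a hedged sketch of the Python route (API names follow recent MIGraphX releases and the file names are placeholders, so verify both against your installed version):

```py
# Sketch: save a compiled MIGraphX program to .mxr and load it back.
import migraphx

prog = migraphx.parse_onnx("resnet50.onnx")   # placeholder model file
prog.compile(migraphx.get_target("gpu"))
migraphx.save(prog, "resnet50.mxr")           # store the tuned, compiled program

reloaded = migraphx.load("resnet50.mxr")      # reuse without recompiling or re-tuning
```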

View File

@@ -1,19 +1,26 @@
# Inception V3 with PyTorch
<head>
<meta charset="UTF-8">
<meta name="description" content="Inception V3 with PyTorch">
<meta name="keywords" content="PyTorch, Inception V3, deep-learning, training data, optimization
algorithm, AMD, ROCm">
</head>
## Deep Learning Training
# Deep learning: Inception V3 with PyTorch
Deep Learning models are designed to capture the complexity of the problem and the underlying data. These models are "deep," comprising multiple component layers. Training is finding the best parameters for each model layer to achieve a well-defined objective.
## Deep learning training
Deep-learning models are designed to capture the complexity of the problem and the underlying data. These models are "deep," comprising multiple component layers. Training is finding the best parameters for each model layer to achieve a well-defined objective.
The training data consists of input features in supervised learning, similar to what the learned model is expected to see during the evaluation or inference phase. The target output is also included, which serves to teach the model. A loss metric is defined as part of training that evaluates the model's performance during the training process.
Training also includes the choice of an optimization algorithm that reduces the loss by adjusting the model's parameters. Training is an iterative process where training data is fed in, usually split into different batches, with the entirety of the training data passed during one training epoch. Training is usually run for multiple epochs.
## Training Phases
## Training phases
Training occurs in multiple phases for every batch of training data. {numref}`TypesOfTrainingPhases` provides an explanation of the types of training phases.
Training occurs in multiple phases for every batch of training data. The following table provides an explanation of the types of training phases.
:::{table} Types of Training Phases
:name: TypesOfTrainingPhases
:name: training-phases
:widths: auto
| Phase | Description |
| ----------------- | --- |
@@ -23,10 +30,10 @@ Training occurs in multiple phases for every batch of training data. {numref}`Ty
| Optimization Pass | The optimization algorithm updates the model parameters using the stored error gradients. |
:::
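The phases in the table map directly onto the body of a typical training loop. The following is a schematic sketch (not part of the case studies below); `model`, `loader`, `criterion`, and `optimizer` are placeholders for whatever the task defines:

```py
# Schematic PyTorch loop: forward pass, loss, backward pass, optimization pass.
# model, loader, criterion, and optimizer are placeholders for the task at hand.
def train_one_epoch(model, loader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    for inputs, targets in loader:
        optimizer.zero_grad()                 # clear stored gradients
        outputs = model(inputs)               # forward pass
        loss = criterion(outputs, targets)    # loss calculation
        loss.backward()                       # backward pass: error gradients
        optimizer.step()                      # optimization pass: update parameters
        running_loss += loss.item()
    return running_loss / len(loader)
```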
Training is different from inference, particularly from the hardware perspective. {numref}`TrainingVsInference` shows the contrast between training and inference.
Training is different from inference, particularly from the hardware perspective. The following table shows the contrast between training and inference.
:::{table} Training vs. Inference
:name: TrainingVsInference
:name: training-inference
:widths: auto
| Training | Inference |
| ----------- | ----------- |
@@ -36,27 +43,27 @@ Training is different from inference, particularly from the hardware perspective
| Data for training is available on the disk before the training process and is generally significant. The training performance is measured by how fast the data batches can be processed. | Inference data usually arrive stochastically, which may be batched to improve performance. Inference performance is generally measured in throughput speed to process the batch of data and the delay in responding to the input (latency). |
:::
Different quantization data types are typically chosen between training (FP32, BF16) and inference (FP16, INT8). The computation hardware has different specializations from other datatypes, leading to improvement in performance if a faster datatype can be selected for the corresponding task.
Different quantization data types are typically chosen between training (FP32, BF16) and inference (FP16, INT8). The computation hardware has different specializations for different data types, leading to improved performance when a faster data type can be selected for the corresponding task.
## Case Studies
## Case studies
The following sections contain case studies for the Inception v3 model.
The following sections contain case studies for the Inception V3 model.
### Inception v3 with PyTorch
### Inception V3 with PyTorch
Convolution Neural Networks are forms of artificial neural networks commonly used for image processing. One of the core layers of such a network is the convolutional layer, which convolves the input with a weight tensor and passes the result to the next layer. Inception v3[^inception_arch] is an architectural development over the ImageNet competition-winning entry, AlexNet, using more profound and broader networks while attempting to meet computational and memory budgets.
Convolutional neural networks are forms of artificial neural networks commonly used for image processing. One of the core layers of such a network is the convolutional layer, which convolves the input with a weight tensor and passes the result to the next layer. Inception V3[^inception_arch] is an architectural development over the ImageNet competition-winning entry, AlexNet, using deeper and wider networks while attempting to meet computational and memory budgets.
The implementation uses PyTorch as a framework. This case study utilizes `torchvision`[^torch_vision], a repository of popular datasets and model architectures, for obtaining the model. `torchvision` also provides pre-trained weights as a starting point to develop new models or fine-tune the model for a new task.
The implementation uses PyTorch as a framework. This case study utilizes [TorchVision](https://pytorch.org/vision/stable/index.html), a repository of popular datasets and model architectures, for obtaining the model. TorchVision also provides pre-trained weights as a starting point to develop new models or fine-tune the model for a new task.
#### Evaluating a Pre-Trained Model
#### Evaluating a pre-trained model
The Inception v3 model introduces a simple image classification task with the pre-trained model. This does not involve training but utilizes an already pre-trained model from `torchvision`.
The Inception V3 model introduces a simple image classification task with the pre-trained model. This does not involve training but utilizes an already pre-trained model from TorchVision.
This example is adapted from the PyTorch research hub page on Inception v3[^torch_vision_inception].
This example is adapted from the PyTorch research hub page on [Inception V3](https://pytorch.org/vision/master/models/inception.html).
Follow these steps:
1. Run the PyTorch ROCm-based Docker image or refer to the section [Installing PyTorch](/how_to/pytorch_install/pytorch_install.md) for setting up a PyTorch environment on ROCm.
1. Run the PyTorch ROCm-based Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
```bash
docker run -it -v $HOME:/data --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G rocm/pytorch:latest
@@ -85,7 +92,7 @@ Follow these steps:
except: urllib.request.urlretrieve(url, filename)
```
5. Import `torchvision` and `PIL.Image` support libraries.
5. Import the torchvision and PIL.Image support libraries.
```py
from PIL import Image
@@ -140,13 +147,13 @@ Follow these steps:
print(categories[top5_catid[i]], top5_prob[i].item())
```
#### Training Inception v3
#### Training Inception V3
The previous section focused on downloading and using the Inception v3 model for a simple image classification task. This section walks through training the model on a new dataset.
The previous section focused on downloading and using the Inception V3 model for a simple image classification task. This section walks through training the model on a new dataset.
Follow these steps:
1. Run the PyTorch ROCm Docker image or refer to the section [Installing PyTorch](how_to/pytorch_install/pytorch_install.md) for setting up a PyTorch environment on ROCm.
1. Run the PyTorch ROCm Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
```bash
docker pull rocm/pytorch:latest
@@ -196,7 +203,7 @@ Follow these steps:
5. Open a Python shell.
6. Import dependencies, including `torch`, `os`, and `torchvision`.
6. Import dependencies, including `torch`, `os`, and [TorchVision](https://github.com/pytorch/vision).
```py
import torch
@@ -222,7 +229,7 @@ Follow these steps:
data_path = "tiny-imagenet-200"
```
The training image size is cropped for input into Inception v3.
The training image size is cropped for input into Inception V3.
```py
train_crop_size = 299
@@ -241,7 +248,7 @@ Follow these steps:
val_resize_size = 342
```
The pre-trained Inception v3 model is chosen to be downloaded from `torchvision`.
The pre-trained Inception V3 model is downloaded from TorchVision.
```py
model_name = "inception_v3"
@@ -334,7 +341,7 @@ Follow these steps:
```
:::{note}
Use `torchvision` to obtain the Inception v3 model. Use the pre-trained model weights to speed up training.
Use TorchVision to obtain the Inception V3 model. Use the pre-trained model weights to speed up training.
:::
```py
@@ -343,7 +350,7 @@ Follow these steps:
model = torchvision.models.__dict__[model_name](pretrained=pretrained)
```
11. Adapt Inception v3 for the current dataset. `tiny-imagenet-200` contains only 200 classes, whereas Inception v3 is designed for 1,000-class output. The last layer of Inception v3 is replaced to match the output features required.
11. Adapt Inception V3 for the current dataset. `tiny-imagenet-200` contains only 200 classes, whereas Inception V3 is designed for 1,000-class output. The last layer of Inception V3 is replaced to match the output features required.
```py
model.fc = torch.nn.Linear(model.fc.in_features, len(dataset.classes))
@@ -461,23 +468,17 @@ Follow these steps:
torch.save(model.state_dict(), "trained_inception_v3.pt")
```
Plotting the train and test loss shows both metrics reducing over training epochs. This is demonstrated in {numref}`inceptionV3`.
Plotting the train and test loss shows both metrics reducing over training epochs. This is demonstrated in the following image.
```{figure} ../../data/understand/deep_learning/inception_v3.png
:name: inceptionV3
---
align: center
---
Inception v3 Train and Loss Graph
```
![Inception V3 train and loss graph](../data/conceptual/inception-v3.png "Inception V3 train and loss")
### Custom Model with CIFAR-10 on PyTorch
### Custom model with CIFAR-10 on PyTorch
The CIFAR-10 (Canadian Institute for Advanced Research) dataset is a subset of the Tiny Images dataset (which contains 80 million images of 32x32 collected from the Internet) and consists of 60,000 32x32 color images. The images are labeled with one of 10 mutually exclusive classes: airplane, motor car, bird, cat, deer, dog, frog, cruise ship, stallion, and truck (but not pickup truck). There are 6,000 images per class, with 5,000 training and 1,000 testing images per class. Let us prepare a custom model for classifying these images using the PyTorch framework and go step-by-step as illustrated below.
The Canadian Institute for Advanced Research (CIFAR)-10 dataset is a subset of the Tiny Images dataset (which contains 80 million images of 32x32 collected from the Internet) and consists of 60,000 32x32 color images. The images are labeled with one of 10 mutually exclusive classes: airplane, motor car, bird, cat, deer, dog, frog, cruise ship, stallion, and truck (but not pickup truck). There are 6,000 images per class, with 5,000 training and 1,000 testing images per class. Let us prepare a custom model for classifying these images using the PyTorch framework and go step-by-step as illustrated below.
Follow these steps:
1. Import dependencies, including `torch`, `os`, and `torchvision`.
1. Import dependencies, including `torch`, `os`, and [TorchVision](https://github.com/pytorch/vision).
```py
import torch
@@ -487,7 +488,7 @@ Follow these steps:
import numpy as np
```
2. The output of `torchvision` datasets is `PILImage` images of range [0, 1]. Transform them to Tensors of normalized range [-1, 1].
2. The output of torchvision datasets is `PILImage` images in the range [0, 1]. Transform them to tensors in the normalized range [-1, 1].
```py
transform = transforms.Compose(
@@ -668,13 +669,13 @@ Follow these steps:
print("Accuracy for class {:5s} is: {:.1f} %".format(classname,accuracy))
```
### Case Study: TensorFlow with Fashion MNIST
### Case study: TensorFlow with Fashion-MNIST
Fashion MNIST is a dataset that contains 70,000 grayscale images in 10 categories.
Fashion-MNIST is a dataset that contains 70,000 grayscale images in 10 categories.
Implement and train a neural network model using the TensorFlow framework to classify images of clothing, like sneakers and shirts.
The dataset has 60,000 images you will use to train the network and 10,000 to evaluate how accurately the network learned to classify images. The Fashion MNIST dataset can be accessed via TensorFlow internal libraries.
The dataset has 60,000 images you will use to train the network and 10,000 to evaluate how accurately the network learned to classify images. The Fashion-MNIST dataset can be accessed via TensorFlow internal libraries.
Access the source code from the following repository:
@@ -696,7 +697,7 @@ To understand the code step by step, follow these steps:
print(tf.__version__)
```
3. Load the dataset from the available internal libraries to analyze and train a neural network upon the MNIST Fashion Dataset. Loading the dataset returns four NumPy arrays. The model uses the training set arrays, train_images and train_labels, to learn.
3. Load the dataset from the available internal libraries to analyze and train a neural network on the Fashion-MNIST dataset. Loading the dataset returns four NumPy arrays. The model uses the training set arrays, `train_images` and `train_labels`, to learn.
4. The model is tested against the test set: the `test_images` and `test_labels` arrays.
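A minimal sketch of this loading step using the built-in Keras dataset (the array names follow the text above):

```py
import tensorflow as tf

# Load Fashion-MNIST; returns four NumPy arrays.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

print(train_images.shape)  # (60000, 28, 28)
print(test_images.shape)   # (10000, 28, 28)
```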
@@ -741,11 +742,7 @@ To understand the code step by step, follow these steps:
plt.show()
```
```{figure} ../../data/understand/deep_learning/mnist_1.png
---
align: center
---
```
![ ](../data/conceptual/mnist-1.png)
10. From the above picture, you can see that values range from zero to 255. Before training the neural network, you must bring them into the range of zero to one. Hence, divide the values by 255.
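A sketch of that scaling step, assuming the arrays from the loading step above:

```py
# Scale pixel values from [0, 255] to [0, 1] before training.
train_images = train_images / 255.0
test_images = test_images / 255.0
```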
@@ -769,13 +766,9 @@ To understand the code step by step, follow these steps:
plt.show()
```
```{figure} ../../data/understand/deep_learning/mnist_2.png
---
align: center
---
```
![ ](../data/conceptual/mnist-2.png)
The basic building block of a neural network is the layer. Layers extract representations from the data fed into them. Deep Learning consists of chaining together simple layers. Most layers, such as `tf.keras.layers.Dense`, have parameters that are learned during training.
The basic building block of a neural network is the layer. Layers extract representations from the data fed into them. Deep learning consists of chaining together simple layers. Most layers, such as `tf.keras.layers.Dense`, have parameters that are learned during training.
```py
model = tf.keras.Sequential([
@@ -785,9 +778,9 @@ To understand the code step by step, follow these steps:
])
```
- The first layer in this network `tf.keras.layers.Flatten` transforms the format of the images from a two-dimensional array (of 28 x 28 pixels) to a one-dimensional array (of 28 * 28 = 784 pixels). Think of this layer as unstacking rows of pixels in the image and lining them up. This layer has no parameters to learn; it only reformats the data.
* The first layer in this network `tf.keras.layers.Flatten` transforms the format of the images from a two-dimensional array (of 28 x 28 pixels) to a one-dimensional array (of 28 * 28 = 784 pixels). Think of this layer as unstacking rows of pixels in the image and lining them up. This layer has no parameters to learn; it only reformats the data.
- After the pixels are flattened, the network consists of a sequence of two `tf.keras.layers.Dense` layers. These are densely connected or fully connected neural layers. The first Dense layer has 128 nodes (or neurons). The second (and last) layer returns a logits array with a length of 10. Each node contains a score that indicates the current image belongs to one of the 10 classes.
* After the pixels are flattened, the network consists of a sequence of two `tf.keras.layers.Dense` layers. These are densely connected or fully connected neural layers. The first Dense layer has 128 nodes (or neurons). The second (and last) layer returns a logits array with a length of 10. Each node contains a score that indicates the current image belongs to one of the 10 classes.
12. You must add the Loss function, Metrics, and Optimizer at the time of model compilation.
@@ -797,11 +790,11 @@ To understand the code step by step, follow these steps:
metrics=['accuracy'])
```
- Loss function —This measures how accurate the model is during training when you are looking to minimize this function to "steer" the model in the right direction.
* Loss function: This measures how accurate the model is during training. You want to minimize this function to "steer" the model in the right direction.
- Optimizer —This is how the model is updated based on the data it sees and its loss function.
* Optimizer: This is how the model is updated based on the data it sees and its loss function.
- Metrics —This is used to monitor the training and testing steps.
* Metrics: This is used to monitor the training and testing steps.
The following example uses accuracy, the fraction of the correctly classified images.
@@ -895,11 +888,7 @@ To understand the code step by step, follow these steps:
plt.show()
```
```{figure} ../../data/understand/deep_learning/mnist_3.png
---
align: center
---
```
![ ](../data/conceptual/mnist-3.png)
```py
i = 12
@@ -911,11 +900,7 @@ To understand the code step by step, follow these steps:
plt.show()
```
```{figure} ../../data/understand/deep_learning/mnist_4.png
---
align: center
---
```
![ ](../data/conceptual/mnist-4.png)
10. Use the trained model to predict a single image.
@@ -946,11 +931,7 @@ To understand the code step by step, follow these steps:
plt.show()
```
```{figure} ../../data/understand/deep_learning/mnist_5.png
---
align: center
---
```
![ ](../data/conceptual/mnist-5.png)
13. `tf.keras.Model.predict` returns a list of lists—one for each image in the batch of data. Grab the predictions for our (only) image in the batch.
@@ -958,7 +939,7 @@ To understand the code step by step, follow these steps:
np.argmax(predictions_single[0])
```
### Case Study: TensorFlow with Text Classification
### Case study: TensorFlow with text classification
This procedure demonstrates text classification starting from plain text files stored on disk. You will train a binary classifier to perform sentiment analysis on an IMDB dataset. At the end of the notebook, there is an exercise for you to try in which you will train a multi-class classifier to predict the tag for a programming question on Stack Overflow.
@@ -988,7 +969,7 @@ Follow these steps:
cache_subdir='')
```
```py
```bash
Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
84131840/84125825 [==============================] 1s 0us/step
84149932/84125825 [==============================] 1s 0us/step
@@ -1115,11 +1096,7 @@ To prepare the data for training, follow these steps:
print("Vectorized review", vectorize_text(first_review, first_label))
```
```{figure} ../../data/understand/deep_learning/TextClassification_3.png
---
align: center
---
```
![ ](../data/conceptual/TextClassification-3.png)
5. As you can see above, each token has been replaced by an integer. Look up the token (string) that each integer corresponds to by calling `get_vocabulary()` on the layer.
@@ -1158,11 +1135,7 @@ To prepare the data for training, follow these steps:
model.summary()
```
```{figure} ../../data/understand/deep_learning/TextClassification_4.png
---
align: center
---
```
![ ](../data/conceptual/TextClassification-4.png)
8. A model needs a loss function and an optimizer for training. Since this is a binary classification problem and the model outputs a probability (a single-unit layer with a sigmoid activation), use [`losses.BinaryCrossentropy`](https://www.tensorflow.org/api_docs/python/tf/keras/losses/BinaryCrossentropy) loss function.
@@ -1178,11 +1151,7 @@ To prepare the data for training, follow these steps:
history = model.fit(train_ds,validation_data=val_ds,epochs=epochs)
```
```{figure} ../../data/understand/deep_learning/TextClassification_5.png
---
align: center
---
```
![ ](../data/conceptual/TextClassification-5.png)
10. See how the model performs. Two values are returned: loss (a number representing our error; lower values are better) and accuracy.
@@ -1194,7 +1163,8 @@ To prepare the data for training, follow these steps:
```
:::{note}
model.fit() returns a History object that contains a dictionary with everything that happened during training.
`model.fit()` returns a History object that contains a dictionary with everything that happened during
training.
:::
```py
@@ -1224,23 +1194,11 @@ To prepare the data for training, follow these steps:
plt.show()
```
{numref}`TextClassification6` and {numref}`TextClassification7` illustrate the training and validation loss and the training and validation accuracy.
The following images illustrate the training and validation loss and the training and validation accuracy.
```{figure} ../../data/understand/deep_learning/TextClassification_6.png
:name: TextClassification6
---
align: center
---
Training and Validation Loss
```
![Training and validation loss](../data/conceptual/TextClassification-6.png "Training and validation loss")
```{figure} ../../data/understand/deep_learning/TextClassification_7.png
:name: TextClassification7
---
align: center
---
Training and Validation Accuracy
```
![Training and validation accuracy](../data/conceptual/TextClassification-7.png "Training and validation accuracy")
12. Export the model.
@@ -1271,15 +1229,3 @@ To prepare the data for training, follow these steps:
export_model.predict(examples)
```
## References
[^inception_arch]: C. Szegedy, V. Vanhoucke, S. Ioffe, J. Shlens and Z. Wojna, "Rethinking the Inception Architecture for Computer Vision," CoRR, abs/1512.00567, 2015
[^torch_vision]: PyTorch, \[Online\]. Available: [https://pytorch.org/vision/stable/index.html](https://pytorch.org/vision/stable/index.html)
[^torch_vision_inception]: PyTorch, \[Online\]. Available: [https://pytorch.org/hub/pytorch_vision_inception_v3/](https://pytorch.org/hub/pytorch_vision_inception_v3/)
[^Stanford_deep_learning]: Stanford, \[Online\]. Available: [http://cs231n.stanford.edu/](http://cs231n.stanford.edu/)
[^cross_entropy]: Wikipedia, \[Online\]. Available: [https://en.wikipedia.org/wiki/Cross_entropy](https://en.wikipedia.org/wiki/Cross_entropy)

View File

@@ -1,48 +1,54 @@
***********
.. meta::
:description: Using CMake
:keywords: CMake, dependencies, HIP, C++, AMD, ROCm
*********************************
Using CMake
***********
*********************************
Most components in ROCm support CMake. Projects depending on header-only or
library components typically require CMake 3.5 or higher whereas those wanting
to make use of CMake's HIP language support will require CMake 3.21 or higher.
to make use of the CMake HIP language support will require CMake 3.21 or higher.
Finding Dependencies
Finding dependencies
====================
.. note::
For a complete
reference on how to deal with dependencies in CMake, refer to the CMake docs
on `find_package
<https://cmake.org/cmake/help/latest/command/find_package.html>`_ and the
`Using Dependencies Guide
<https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html>`_
to get an overview of CMake's related facilities.
For a complete
reference on how to deal with dependencies in CMake, refer to the CMake docs
on `find_package
<https://cmake.org/cmake/help/latest/command/find_package.html>`_ and the
`Using Dependencies Guide
<https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html>`_
to get an overview of CMake related facilities.
In short, CMake supports finding dependencies in two ways:
- In Module mode, it consults a file ``Find<PackageName>.cmake`` which tries to
find the component in typical install locations and layouts. CMake ships a
few dozen such scripts, but users and projects may ship them as well.
- In Config mode, it locates a file named ``<packagename>-config.cmake`` or
``<PackageName>Config.cmake`` which describes the installed component in all
regards needed to consume it.
* In Module mode, it consults a file ``Find<PackageName>.cmake`` which tries to find the component
in typical install locations and layouts. CMake ships a few dozen such scripts, but users and projects
may ship them as well.
* In Config mode, it locates a file named ``<packagename>-config.cmake`` or
``<PackageName>Config.cmake`` which describes the installed component in all regards needed to
consume it.
ROCm predominantly relies on Config mode, one notable exception being the Module
driving the compilation of HIP programs on Nvidia runtimes. As such, when
driving the compilation of HIP programs on NVIDIA runtimes. As such, when
dependencies are not found in standard system locations, one either has to
instruct CMake to search for package config files in additional folders using
the ``CMAKE_PREFIX_PATH`` variable (a semi-colon separated list of filesystem
the ``CMAKE_PREFIX_PATH`` variable (a semi-colon separated list of file system
paths), or using ``<PackageName>_ROOT`` variable on a project-specific basis.
There are nearly a dozen ways to set these variables. One may be more convenient
than another depending on your workflow. Conceptually the simplest is adding
it to your CMake configuration command on the command-line via
it to your CMake configuration command on the command line via
``-D CMAKE_PREFIX_PATH=....``. AMD packaged ROCm installs can typically be
added to the config file search paths such as:
- Windows: ``-D CMAKE_PREFIX_PATH=${env:HIP_PATH}``
* Windows: ``-D CMAKE_PREFIX_PATH=${env:HIP_PATH}``
- Linux: ``-D CMAKE_PREFIX_PATH=/opt/rocm``
* Linux: ``-D CMAKE_PREFIX_PATH=/opt/rocm``
ROCm provides the respective *config-file* packages, and this enables
``find_package`` to be used directly. ROCm does not require any Find module as
@@ -50,13 +56,16 @@ the *config-file* packages are shipped with the upstream projects, such as
rocPRIM and other ROCm libraries.
For a complete guide on where and how ROCm may be installed on a system, refer
to the installation guides in these docs (`Linux <../deploy/linux/index.html>`_).
to the installation guides for
`Linux <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html>`_
and
`Windows <https://rocm.docs.amd.com/projects/install-on-windows/en/latest/index.html>`_.
Using HIP in CMake
==================
ROCm componenents providing a C/C++ interface support being consumed using any
C/C++ toolchain that CMake knows how to drive. ROCm also supports CMake's HIP
ROCm components providing a C/C++ interface support consumption via any
C/C++ toolchain that CMake knows how to drive. ROCm also supports the CMake HIP
language features, allowing users to program using the HIP single-source
programming model. When a program (or translation-unit) uses the HIP API without
compiling any GPU device code, HIP can be treated in CMake as a simple C/C++
@@ -69,22 +78,22 @@ Source code written in the HIP dialect of C++ typically uses the `.hip`
extension. When the HIP CMake language is enabled, it will automatically
associate such source files with the HIP toolchain being used.
::
.. code-block:: cmake
cmake_minimum_required(VERSION 3.21) # HIP language support requires 3.21
cmake_policy(VERSION 3.21.3...3.27)
project(MyProj LANGUAGES HIP)
add_executable(MyApp Main.hip)
cmake_minimum_required(VERSION 3.21) # HIP language support requires 3.21
cmake_policy(VERSION 3.21.3...3.27)
project(MyProj LANGUAGES HIP)
add_executable(MyApp Main.hip)
Should you have existing CUDA code that is from the source-compatible subset of
HIP, you can tell CMake that despite the ``.cu`` extension, these are HIP sources.
Do note that this mostly facilitates compiling kernel-code-only source files,
as host-side CUDA API won't compile in this fashion.
::
.. code-block:: cmake
add_library(MyLib MyLib.cu)
set_source_files_properties(MyLib.cu PROPERTIES LANGUAGE HIP)
add_library(MyLib MyLib.cu)
set_source_files_properties(MyLib.cu PROPERTIES LANGUAGE HIP)
CMake itself only hosts part of the HIP language support, such as defining
HIP-specific properties, while the other half ships with the HIP
@@ -96,12 +105,16 @@ there's a catch-all, last resort variable consulted locating this file,
``-D CMAKE_HIP_COMPILER_ROCM_ROOT:PATH=`` which should be set to the root of the
ROCm installation.
.. note::
Imported targets defined by ``hip-lang-config.cmake`` are for internal use
only.
If the user doesn't provide a semicolon-delimited list of device architectures
via ``CMAKE_HIP_ARCHITECTURES``, CMake will select a sensible default. It is
advised, though, that users who know which devices they wish to target set this
variable explicitly.
Consuming ROCm C/C++ Libraries
Consuming ROCm C/C++ libraries
------------------------------
Libraries such as rocBLAS, rocFFT, MIOpen, etc. behave as C/C++ libraries.
@@ -109,45 +122,57 @@ Illustrated in the example below is a C++ application using MIOpen from CMake.
It calls ``find_package(miopen)``, which provides the ``MIOpen`` imported
target. This can be linked with ``target_link_libraries``
::
.. code-block:: cmake
cmake_minimum_required(VERSION 3.5) # find_package(miopen) requires 3.5
cmake_policy(VERSION 3.5...3.27)
project(MyProj LANGUAGES CXX)
find_package(miopen)
add_library(MyLib ...)
target_link_libraries(MyLib PUBLIC MIOpen)
cmake_minimum_required(VERSION 3.5) # find_package(miopen) requires 3.5
cmake_policy(VERSION 3.5...3.27)
project(MyProj LANGUAGES CXX)
find_package(miopen)
add_library(MyLib ...)
target_link_libraries(MyLib PUBLIC MIOpen)
.. note::
Most libraries are designed as host-only API, so using a GPU device
compiler is not necessary for downstream projects unless they use GPU device
code.
Most libraries are designed as host-only API, so using a GPU device
compiler is not necessary for downstream projects unless they use GPU device
code.
Consuming the HIP API in C++ code
---------------------------------
Use the HIP API without compiling the GPU device code. As there is no GPU code,
any C or C++ compiler can be used. The ``find_package(hip)`` provides the
``hip::host`` imported target to use HIP in this context.
Consuming the HIP API without compiling single-source GPU device code can be
done using any C++ compiler. The ``find_package(hip)`` provides the
``hip::host`` imported target to use HIP in this scenario.
::
.. code-block:: cmake
cmake_minimum_required(VERSION 3.5) # find_package(hip) requires 3.5
cmake_policy(VERSION 3.5...3.27)
project(MyProj LANGUAGES CXX)
find_package(hip REQUIRED)
add_executable(MyApp ...)
target_link_libraries(MyApp PRIVATE hip::host)
cmake_minimum_required(VERSION 3.5) # find_package(hip) requires 3.5
cmake_policy(VERSION 3.5...3.27)
project(MyProj LANGUAGES CXX)
find_package(hip REQUIRED)
add_executable(MyApp ...)
target_link_libraries(MyApp PRIVATE hip::host)
When mixing such ``CXX`` sources with ``HIP`` sources holding device code, link
only to ``hip::host``. If HIP sources don't have ``.hip`` as their extension, use
``set_source_files_properties(<hip_sources>... PROPERTIES LANGUAGE HIP)`` on them.
Linking to ``hip::host`` will set all the necessary flags for the ``CXX`` sources
while ``HIP`` sources inherit all flags from the built-in language support.
Having HIP sources in a target will turn the |LINK_LANG|_ into ``HIP``.
.. |LINK_LANG| replace:: ``LINKER_LANGUAGE``
.. _LINK_LANG: https://cmake.org/cmake/help/latest/prop_tgt/LINKER_LANGUAGE.html
Compiling device code in C++ language mode
------------------------------------------
.. attention::
The workflow detailed here is considered legacy and is shown for
understanding's sake. It pre-dates the existence of HIP language support in
CMake. If source code has HIP device code in it, it is a HIP source file
and should be compiled as such. Only resort to the method below if your
HIP-enabled CMake codepath can't mandate CMake version 3.21.
The workflow detailed here is considered legacy and is shown for
understanding's sake. It pre-dates the existence of HIP language support in
CMake. If source code has HIP device code in it, it is a HIP source file
and should be compiled as such. Only resort to the method below if your
HIP-enabled CMake code path can't mandate CMake version 3.21.
If code uses the HIP API and compiles GPU device code, it requires using a
device compiler. The compiler for CMake can be set using either the
@@ -159,20 +184,21 @@ compiler that supports AMD GPU targets, which is usually Clang.
The ``find_package(hip)`` provides the ``hip::device`` imported target to add
all the flags necessary for device compilation.
::
.. code-block:: cmake
cmake_minimum_required(VERSION 3.8) # cxx_std_11 requires 3.8
cmake_policy(VERSION 3.8...3.27)
project(MyProj LANGUAGES CXX)
find_package(hip REQUIRED)
add_library(MyLib ...)
target_link_libraries(MyLib PRIVATE hip::device)
target_compile_features(MyLib PRIVATE cxx_std_11)
cmake_minimum_required(VERSION 3.8) # cxx_std_11 requires 3.8
cmake_policy(VERSION 3.8...3.27)
project(MyProj LANGUAGES CXX)
find_package(hip REQUIRED)
add_library(MyLib ...)
target_link_libraries(MyLib PRIVATE hip::device)
target_compile_features(MyLib PRIVATE cxx_std_11)
.. note::
Compiling for the GPU device requires at least C++11.
This project can then be configured with for eg.
Compiling for the GPU device requires at least C++11.
This project can then be configured with the following CMake commands.
- Windows: ``cmake -D CMAKE_CXX_COMPILER:PATH=${env:HIP_PATH}\bin\clang++.exe``
@@ -182,13 +208,13 @@ Which use the device compiler provided from the binary packages of
`ROCm HIP SDK <https://www.amd.com/en/developer/rocm-hub.html>`_ and
`repo.radeon.com <https://repo.radeon.com>`_ respectively.
When using the CXX language support to compile HIP device code, selecting the
When using the ``CXX`` language support to compile HIP device code, selecting the
target GPU architectures is done via setting the ``GPU_TARGETS`` variable.
``CMAKE_HIP_ARCHITECTURES`` only exists when the HIP language is enabled. By
default, this is set to some subset of the currently supported architectures of
AMD ROCm. It can be set to eg. ``-D GPU_TARGETS="gfx1032;gfx1035"``.
AMD ROCm. It can be set to the CMake option ``-D GPU_TARGETS="gfx1032;gfx1035"``.
ROCm CMake Packages
ROCm CMake packages
-------------------
+-----------+----------+--------------------------------------------------------+
@@ -229,10 +255,10 @@ ROCm CMake Packages
| | | ``migraphx::migraphx_onnx``, ``migraphx::migraphx_tf`` |
+-----------+----------+--------------------------------------------------------+
Using CMake Presets
Using CMake presets
===================
CMake command-lines depending on how specific users like to be when compiling
CMake command lines can grow to unwieldy lengths, depending on how specific
users like to be when compiling code. This is the primary reason why projects tend
to bake script snippets into their build definitions controlling compiler
warning levels, changing CMake defaults (``CMAKE_BUILD_TYPE`` or
@@ -251,13 +277,12 @@ options.
IDEs supporting CMake (Visual Studio, Visual Studio Code, CLion, etc.) all came
up with their own way to register command-line fragments of different purpose in
a setup'n'forget fashion for quick assembly using graphical front-ends. This is
a setup-and-forget fashion for quick assembly using graphical front-ends. This is
all nice, but configurations aren't portable, nor can they be reused in
Continuous Intergration (CI) pipelines. CMake has condensed existing practice
Continuous Integration (CI) pipelines. CMake has condensed existing practice
into a portable JSON format that works in all IDEs and can be invoked from any
command-line. This is
`CMake Presets <https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html>`_
.
command line. This is
`CMake Presets <https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html>`_.
There are two types of preset files: one supplied by the project, called
``CMakePresets.json`` which is meant to be committed to version control,
@@ -274,109 +299,110 @@ Following is an example ``CMakeUserPresets.json`` file which actually compiles
the `amd/rocm-examples <https://github.com/amd/rocm-examples>`_ suite of sample
applications on a typical ROCm installation:
.. code-block:: json

    {
      "version": 3,
      "cmakeMinimumRequired": {
        "major": 3,
        "minor": 21,
        "patch": 0
      },
      "configurePresets": [
        {
          "name": "layout",
          "hidden": true,
          "binaryDir": "${sourceDir}/build/${presetName}",
          "installDir": "${sourceDir}/install/${presetName}"
        },
        {
          "name": "generator-ninja-multi-config",
          "hidden": true,
          "generator": "Ninja Multi-Config"
        },
        {
          "name": "toolchain-makefiles-c/c++-amdclang",
          "hidden": true,
          "cacheVariables": {
            "CMAKE_C_COMPILER": "/opt/rocm/bin/amdclang",
            "CMAKE_CXX_COMPILER": "/opt/rocm/bin/amdclang++",
            "CMAKE_HIP_COMPILER": "/opt/rocm/bin/amdclang++"
          }
        },
        {
          "name": "clang-strict-iso-high-warn",
          "hidden": true,
          "cacheVariables": {
            "CMAKE_C_FLAGS": "-Wall -Wextra -pedantic",
            "CMAKE_CXX_FLAGS": "-Wall -Wextra -pedantic",
            "CMAKE_HIP_FLAGS": "-Wall -Wextra -pedantic"
          }
        },
        {
          "name": "ninja-mc-rocm",
          "displayName": "Ninja Multi-Config ROCm",
          "inherits": [
            "layout",
            "generator-ninja-multi-config",
            "toolchain-makefiles-c/c++-amdclang",
            "clang-strict-iso-high-warn"
          ]
        }
      ],
      "buildPresets": [
        {
          "name": "ninja-mc-rocm-debug",
          "displayName": "Debug",
          "configuration": "Debug",
          "configurePreset": "ninja-mc-rocm"
        },
        {
          "name": "ninja-mc-rocm-release",
          "displayName": "Release",
          "configuration": "Release",
          "configurePreset": "ninja-mc-rocm"
        },
        {
          "name": "ninja-mc-rocm-debug-verbose",
          "displayName": "Debug (verbose)",
          "configuration": "Debug",
          "configurePreset": "ninja-mc-rocm",
          "verbose": true
        },
        {
          "name": "ninja-mc-rocm-release-verbose",
          "displayName": "Release (verbose)",
          "configuration": "Release",
          "configurePreset": "ninja-mc-rocm",
          "verbose": true
        }
      ],
      "testPresets": [
        {
          "name": "ninja-mc-rocm-debug",
          "displayName": "Debug",
          "configuration": "Debug",
          "configurePreset": "ninja-mc-rocm",
          "execution": {
            "jobs": 0
          }
        },
        {
          "name": "ninja-mc-rocm-release",
          "displayName": "Release",
          "configuration": "Release",
          "configurePreset": "ninja-mc-rocm",
          "execution": {
            "jobs": 0
          }
        }
      ]
    }
.. note::
Getting presets to work reliably on Windows requires some CMake improvements
and/or support from compiler vendors. (Refer to
`Add support to the Visual Studio generators <https://gitlab.kitware.com/cmake/cmake/-/issues/24245>`_
and `Sourcing environment scripts <https://gitlab.kitware.com/cmake/cmake/-/issues/21619>`_
.)
Getting presets to work reliably on Windows requires some CMake improvements
and/or support from compiler vendors. (Refer to
`Add support to the Visual Studio generators <https://gitlab.kitware.com/cmake/cmake/-/issues/24245>`_
and `Sourcing environment scripts <https://gitlab.kitware.com/cmake/cmake/-/issues/21619>`_
.)

View File

@@ -1,15 +1,21 @@
# ROCm Compilers Disambiguation
<head>
<meta charset="UTF-8">
<meta name="description" content="ROCm compilers disambiguation">
<meta name="keywords" content="compilers, compiler naming, AMD, ROCm">
</head>
# ROCm compilers disambiguation
ROCm ships multiple compilers of varying origins and purposes. This article
disambiguates compiler naming used throughout the documentation.
## Compiler terms
| Term | Description |
| - | - |
| `amdclang++` | Clang/LLVM-based compiler that is part of the `rocm-llvm` package. The source code is available at <a href="https://github.com/RadeonOpenCompute/llvm-project" target="_blank">https://github.com/RadeonOpenCompute/llvm-project</a>. |
| AOCC | Closed-source Clang-based compiler that includes additional CPU optimizations. Offered as part of ROCm via the `rocm-llvm-alt` package. For details, see <a href="https://developer.amd.com/amd-aocc/" target="_blank">https://developer.amd.com/amd-aocc/</a>. |
| HIP-Clang | Informal term for the `amdclang++` compiler. |
| HIPIFY | Tools including `hipify-clang` and `hipify-perl`, used to automatically translate CUDA source code into portable HIP C++. The source code is available at <a href="https://github.com/ROCm-Developer-Tools/HIPIFY" target="_blank">https://github.com/ROCm-Developer-Tools/HIPIFY</a> |
| `hipcc` | HIP compiler driver. A utility that invokes `clang` or `nvcc` depending on the target and passes the appropriate include and library options for the target compiler and HIP infrastructure. The source code is available at <a href="https://github.com/ROCm-Developer-Tools/HIPCC" target="_blank">https://github.com/ROCm-Developer-Tools/HIPCC</a>. |
| ROCmCC | Clang/LLVM-based compiler. ROCmCC in itself is not a binary but refers to the overall compiler. |


@@ -1,10 +1,17 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="ROCm Linux Filesystem Hierarchy Standard reorganization">
<meta name="keywords" content="FHS, Linux Filesystem Hierarchy Standard, directory structure,
AMD, ROCm">
</head>
# ROCm Linux Filesystem Hierarchy Standard reorganization
## Introduction
The ROCm software has adopted the Linux Filesystem Hierarchy Standard (FHS) [https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html](https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html) to ensure ROCm is consistent with standard open source conventions. The following sections specify how current and future releases of ROCm adhere to the FHS, how the previous ROCm file system is supported, and how improved versioning specifications are applied to ROCm.
## Adopting the FHS
To standardize the ROCm directory structure and directory content layout, ROCm has adopted the [FHS](https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html), adhering to open source conventions for Linux-based distributions. The FHS ensures internal consistency within the ROCm stack, as well as external consistency with other systems and distributions. The proposed ROCm file structure is outlined below:
@@ -44,7 +51,7 @@ In order to standardize ROCm directory structure and directory content layout RO
| -- architecture independent misc files
```
## Changes from earlier ROCm versions
The following table provides a brief overview of the new ROCm FHS layout compared to the layout of earlier ROCm versions. Note that `/opt/` denotes the default ROCm installation path; replace it with the actual location in case of a non-standard installation of the ROCm distribution.
@@ -73,11 +80,11 @@ The following table provides a brief overview of the new ROCm FHS layout, compar
|______________________________________________________|
```
## ROCm FHS reorganization: backward compatibility
The FHS file organization for ROCm was first introduced in the ROCm 5.2 release. Backward compatibility was implemented to make sure users could still run their ROCm applications while transitioning to the new FHS. ROCm has moved header files and libraries to their new locations as indicated in the above structure, and includes symbolic links and wrapper header files in their old locations for backward compatibility. The following sections detail the ROCm backward compatibility implementation for wrapper header files, executable files, library files, and CMake config files.
### Wrapper header files
Wrapper header files are placed in the old location (
`/opt/rocm-<ver>/<component>/include`) with a warning message to include files
@@ -88,10 +95,10 @@ from the new location (`/opt/rocm-<ver>/include`) as shown in the example below.
#include <hip/hip_runtime.h>
```
* Starting with the ROCm 5.2 release, backward compatibility wrapper header files carry a `#pragma` message announcing deprecation as a `#warning`.
* Starting with ROCm 6.0 (tentatively), backward compatibility for wrapper header files will be removed, and the `#pragma` message will announce an `#error`.
### Executable files
Executable files are available in the `/opt/rocm-<ver>/bin` folder. For backward
compatibility, the old location (`/opt/rocm-<ver>/<component>/bin`) has a
@@ -103,7 +110,7 @@ $ ls -l /opt/rocm/hip/bin/
lrwxrwxrwx 1 root root 24 Jan 1 23:32 hipcc -> ../../bin/hipcc
```
### Library files
Library files are available in the `/opt/rocm-<ver>/lib` folder. For backward
compatibility, the old library location (`/opt/rocm-<ver>/<component>/lib`) has a
@@ -116,7 +123,7 @@ drwxr-xr-x 4 root root 4096 Jan 1 10:45 cmake
lrwxrwxrwx 1 root root 24 Jan 1 23:32 libamdhip64.so -> ../../lib/libamdhip64.so
```
### CMake config files
All CMake configuration files are available in the
`/opt/rocm-<ver>/lib/cmake/<component>` folder. For backward compatibility, the
@@ -129,7 +136,7 @@ $ ls -l /opt/rocm/hip/lib/cmake/hip/
lrwxrwxrwx 1 root root 42 Jan 1 23:32 hip-config.cmake -> ../../../../lib/cmake/hip/hip-config.cmake
```
## Changes required in applications using ROCm
Applications using ROCm are advised to use the new file paths, as the old files
will be deprecated in a future release. Applications must make sure to include the
@@ -150,9 +157,9 @@ correct header file and use correct search paths.
3. Any reference to `/opt/rocm/<component>/bin` or `/opt/rocm/<component>/lib`
needs to be changed to `/opt/rocm/bin` and `/opt/rocm/lib/`, respectively.
## Changes in versioning specifications
To better manage ROCm dependency specifications and allow smoother ROCm releases while avoiding dependency conflicts, ROCm software shall adhere to the following scheme when numbering and incrementing ROCm file versions:
rocm-\<ver\>, where \<ver\> = \<x.y.z\>


@@ -0,0 +1,58 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="GPU architecture">
<meta name="keywords" content="GPU architecture, architecture support, MI200, MI250, RDNA,
MI100, AMD Instinct">
</head>
# GPU architecture documentation
:::::{grid} 1 1 2 2
:gutter: 1
:::{grid-item-card}
**AMD Instinct MI200 series**
Review hardware aspects of the AMD Instinct™ MI200 series of GPU
accelerators and the CDNA™ 2 architecture.
* [AMD Instinct™ MI250 microarchitecture](./gpu-arch/mi250.md)
* [AMD Instinct MI200/CDNA2 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
* [White paper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf)
* [Performance counters](./gpu-arch/mi200-performance-counters.md)
:::
:::{grid-item-card}
**AMD Instinct MI100**
Review hardware aspects of the AMD Instinct™ MI100
accelerators and the CDNA™ 1 architecture that is the foundation of these GPUs.
* [AMD Instinct™ MI100 microarchitecture](./gpu-arch/mi100.md)
* [AMD Instinct MI100/CDNA1 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
* [White paper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf)
:::
:::{grid-item-card}
**RDNA**
* [AMD RDNA3 ISA](https://www.amd.com/system/files/TechDocs/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf)
* [AMD RDNA2 ISA](https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf)
* [AMD RDNA ISA](https://www.amd.com/system/files/TechDocs/rdna-shader-instruction-set-architecture.pdf)
* [AMD RDNA Architecture White Paper](https://www.amd.com/system/files/documents/rdna-whitepaper.pdf)
:::
:::{grid-item-card}
**Older architectures**
* [AMD Instinct MI50/Vega 7nm ISA](https://www.amd.com/system/files/TechDocs/vega-7nm-shader-instruction-set-architecture.pdf)
* [AMD Instinct MI25/Vega ISA](https://www.amd.com/system/files/TechDocs/vega-shader-instruction-set-architecture.pdf)
* [AMD GCN3 ISA](https://www.amd.com/system/files/TechDocs/gcn3-instruction-set-architecture.pdf)
* [AMD Vega Architecture White Paper](https://en.wikichip.org/w/images/a/a1/vega-whitepaper.pdf)
:::
:::::


@@ -1,12 +1,12 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="AMD Instinct MI100 microarchitecture">
<meta name="keywords" content="Instinct, MI100, microarchitecture, AMD, ROCm">
</head>
# AMD Instinct™ MI100 microarchitecture
## System Architecture
The following image shows the node-level architecture of a system that
comprises two AMD EPYC™ processors and (up to) eight AMD Instinct™ accelerators.
The two EPYC processors are connected to each other with the AMD Infinity™
fabric, which provides high-bandwidth (up to 18 GT/sec), coherent links such
@@ -17,12 +17,7 @@ available to connect the processors plus one PCIe Gen 4 x16 link per processor
can attach additional I/O devices such as the host adapters for the network
fabric.
![Node-level system architecture with two AMD EPYC™ processors and eight AMD Instinct™ accelerators](../../data/conceptual/gpu-arch/image004.png "Node-level system architecture with two AMD EPYC™ processors and eight AMD Instinct™ accelerators.")
In a typical node configuration, each processor can host up to four AMD
Instinct™ accelerators that are attached using PCIe Gen 4 links at 16 GT/sec,
@@ -34,34 +29,29 @@ links. This inter-GPU link can be established in certified server systems if the
GPUs are mounted in neighboring PCIe slots by installing the AMD Infinity
Fabric™ bridge for the AMD Instinct™ accelerators.
## Microarchitecture
The microarchitecture of the AMD Instinct accelerators is based on the AMD CDNA
architecture, which targets compute applications such as high-performance
computing (HPC) and AI & machine learning (ML) that run on everything from
individual servers to the world's largest exascale supercomputers. The overall
system architecture is designed for extreme scalability and compute performance.
![Structure of the AMD Instinct accelerator (MI100 generation)](../../data/conceptual/gpu-arch/image005.png "Structure of the AMD Instinct accelerator (MI100 generation)")
<img src="../../data/reference/gpu_arch/image.005.png" alt="Structure of the AMD Instinct accelerator (MI100 generation).">
Structure of the AMD Instinct accelerator (MI100 generation).
:::
The above image shows the AMD Instinct accelerator with its PCIe Gen 4 x16
link (16 GT/sec, at the bottom) that connects the GPU to (one of) the host
processor(s). It also shows the three AMD Infinity Fabric ports that provide
high-speed links (23 GT/sec, also at the bottom) to the other GPUs of the local
hive.
On the left and right of the floor plan, the High Bandwidth Memory (HBM)
attaches via the GPU memory controller. The MI100 generation of the AMD
Instinct accelerator offers four stacks of HBM generation 2 (HBM2) for a total
of 32 GB with a 4,096-bit-wide memory interface. The peak memory bandwidth of the
attached HBM2 is 1.228 TB/sec at a memory clock frequency of 1.2 GHz.
The execution units of the GPU are depicted in the above image as Compute
Units (CU). There are a total of 120 compute units that are physically organized
into eight Shader Engines (SE) with fifteen compute units per shader engine.
Each compute unit is further sub-divided into four SIMD units that process SIMD
@@ -70,15 +60,9 @@ instructions of 16 data elements per instruction. This enables the CU to process
Therefore, the theoretical maximum FP64 peak performance is 11.5 TFLOPS
(`4 [SIMD units] x 16 [elements per instruction] x 120 [CU] x 1.5 [GHz]`).
![Block diagram of an MI100 compute unit with detailed SIMD view of the AMD CDNA architecture](../../data/conceptual/gpu-arch/image006.png "An MI100 compute unit with detailed SIMD view of the AMD CDNA architecture")
<img src="../../data/reference/gpu_arch/image.006.png" alt="Block diagram of an MI100 compute unit with detailed SIMD view of the AMD CDNA architecture">
Block diagram of an MI100 compute unit with detailed SIMD view of the AMD CDNA
architecture
:::
The preceding image shows the block diagram of a single CU of an AMD Instinct™
MI100 accelerator and summarizes how instructions flow through the execution
engines. The CU fetches the instructions via a 32 KB instruction cache and moves
them forward to execution via a dispatcher. The CU can handle up to ten


@@ -0,0 +1,578 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="MI200 performance counters and metrics">
<meta name="keywords" content="MI200, performance counters, counters, GRBM counters, GRBM,
CPF counters, CPF, CPC counters, CPC, command processor counters, SPI counters, SPI, AMD, ROCm">
</head>
# MI200 performance counters and metrics
<!-- markdownlint-disable no-duplicate-header -->
This document lists and describes the hardware performance counters and derived metrics available on the AMD Instinct™ MI200 GPU. All the basic hardware counters and derived metrics are accessible via the {doc}`ROCProfiler tool <rocprofiler:rocprofv1>`.
## MI200 performance counters list
See the category-wise listing of MI200 performance counters in the following tables.
:::{note}
Preliminary validation of all MI200 performance counters is in progress. Those with “*” appended to the names require further evaluation.
:::
### Graphics Register Bus Management (GRBM) counters
| Hardware Counter | Unit | Definition |
|:--------------------|:--------|:--------------------------------------------------------------------------|
| `GRBM_COUNT` | Cycles | Number of free-running GPU cycles |
| `GRBM_GUI_ACTIVE` | Cycles | Number of GPU active cycles |
| `GRBM_CP_BUSY` | Cycles | Number of cycles any of the Command Processor (CP) blocks are busy |
| `GRBM_SPI_BUSY` | Cycles | Number of cycles any of the Shader Processor Input (SPI) blocks are busy in the shader engine(s) |
| `GRBM_TA_BUSY` | Cycles | Number of cycles any of the Texture Addressing Unit (TA) blocks are busy in the shader engine(s) |
| `GRBM_TC_BUSY` | Cycles | Number of cycles any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy |
| `GRBM_CPC_BUSY` | Cycles | Number of cycles the Command Processor - Compute (CPC) is busy |
| `GRBM_CPF_BUSY` | Cycles | Number of cycles the Command Processor - Fetcher (CPF) is busy |
| `GRBM_UTCL2_BUSY` | Cycles | Number of cycles the Unified Translation Cache - Level 2 (UTCL2) block is busy |
| `GRBM_EA_BUSY` | Cycles | Number of cycles the Efficiency Arbiter (EA) block is busy |
### Command Processor (CP) counters
The CP counters are further classified into CP-Fetcher (CPF) and CP-Compute (CPC).
#### CPF counters
| Hardware Counter | Unit | Definition |
|:--------------------------------------|:--------|:-------------------------------------------------------------|
| `CPF_CMP_UTCL1_STALL_ON_TRANSLATION` | Cycles | Number of cycles one of the Compute UTCL1s is stalled waiting on translation |
| `CPF_CPF_STAT_BUSY` | Cycles | Number of cycles CPF is busy |
| `CPF_CPF_STAT_IDLE*` | Cycles | Number of cycles CPF is idle |
| `CPF_CPF_STAT_STALL` | Cycles | Number of cycles CPF is stalled |
| `CPF_CPF_TCIU_BUSY` | Cycles | Number of cycles CPF Texture Cache Interface Unit (TCIU) interface is busy |
| `CPF_CPF_TCIU_IDLE` | Cycles | Number of cycles CPF TCIU interface is idle |
| `CPF_CPF_TCIU_STALL*` | Cycles | Number of cycles CPF TCIU interface is stalled waiting on free tags |
#### CPC counters
| Hardware Counter | Unit | Definition |
|:---------------------------------|:-------|:---------------------------------------------------|
| `CPC_ME1_BUSY_FOR_PACKET_DECODE` | Cycles | Number of cycles CPC Micro Engine (ME1) is busy decoding packets |
| `CPC_UTCL1_STALL_ON_TRANSLATION` | Cycles | Number of cycles one of the UTCL1s is stalled waiting on translation |
| `CPC_CPC_STAT_BUSY` | Cycles | Number of cycles CPC is busy |
| `CPC_CPC_STAT_IDLE` | Cycles | Number of cycles CPC is idle |
| `CPC_CPC_STAT_STALL` | Cycles | Number of cycles CPC is stalled |
| `CPC_CPC_TCIU_BUSY` | Cycles | Number of cycles CPC TCIU interface is busy |
| `CPC_CPC_TCIU_IDLE` | Cycles | Number of cycles CPC TCIU interface is idle |
| `CPC_CPC_UTCL2IU_BUSY` | Cycles | Number of cycles CPC UTCL2 interface is busy |
| `CPC_CPC_UTCL2IU_IDLE` | Cycles | Number of cycles CPC UTCL2 interface is idle |
| `CPC_CPC_UTCL2IU_STALL` | Cycles | Number of cycles CPC UTCL2 interface is stalled |
| `CPC_ME1_DC0_SPI_BUSY` | Cycles | Number of cycles CPC ME1 Processor is busy |
### Shader Processor Input (SPI) counters
| Hardware Counter | Unit | Definition |
|:----------------------------|:-----------|:-----------------------------------------------------------|
| `SPI_CSN_BUSY` | Cycles | Number of cycles with outstanding waves |
| `SPI_CSN_WINDOW_VALID` | Cycles | Number of cycles enabled by `perfcounter_start` event |
| `SPI_CSN_NUM_THREADGROUPS` | Workgroups | Number of dispatched workgroups |
| `SPI_CSN_WAVE` | Wavefronts | Number of dispatched wavefronts |
| `SPI_RA_REQ_NO_ALLOC` | Cycles | Number of Arb cycles with requests but no allocation |
| `SPI_RA_REQ_NO_ALLOC_CSN` | Cycles | Number of Arb cycles with Compute Shader, n-th pipe (CSn) requests but no CSn allocation |
| `SPI_RA_RES_STALL_CSN` | Cycles | Number of Arb stall cycles due to shortage of CSn pipeline slots |
| `SPI_RA_TMP_STALL_CSN*` | Cycles | Number of stall cycles due to shortage of temp space |
| `SPI_RA_WAVE_SIMD_FULL_CSN` | SIMD-cycles | Accumulated number of Single Instruction Multiple Data (SIMDs) per cycle affected by shortage of wave slots for CSn wave dispatch |
| `SPI_RA_VGPR_SIMD_FULL_CSN*` | SIMD-cycles | Accumulated number of SIMDs per cycle affected by shortage of VGPR slots for CSn wave dispatch |
| `SPI_RA_SGPR_SIMD_FULL_CSN*` | SIMD-cycles | Accumulated number of SIMDs per cycle affected by shortage of SGPR slots for CSn wave dispatch |
| `SPI_RA_LDS_CU_FULL_CSN` | CUs | Number of Compute Units (CUs) affected by shortage of LDS space for CSn wave dispatch |
| `SPI_RA_BAR_CU_FULL_CSN*` | CUs | Number of CUs with CSn waves waiting at a BARRIER |
| `SPI_RA_BULKY_CU_FULL_CSN*` | CUs | Number of CUs with CSn waves waiting for BULKY resource |
| `SPI_RA_TGLIM_CU_FULL_CSN*` | Cycles | Number of CSn wave stall cycles due to restriction of `tg_limit` for thread group size |
| `SPI_RA_WVLIM_STALL_CSN*` | Cycles | Number of cycles CSn is stalled due to WAVE_LIMIT |
| `SPI_VWC_CSC_WR` | Qcycles | Number of quad-cycles taken to initialize Vector General Purpose Registers (VGPRs) when launching waves |
| `SPI_SWC_CSC_WR` | Qcycles | Number of quad-cycles taken to initialize Scalar General Purpose Registers (SGPRs) when launching waves |
### Compute Unit (CU) counters
The CU counters are further classified into instruction mix, Matrix Fused Multiply Add (MFMA) operation counters, level counters, wavefront counters, wavefront cycle counters and Local Data Share (LDS) counters.
#### Instruction mix
| Hardware Counter | Unit | Definition |
|:-----------------------|:-----|:-----------------------------------------------------------------------|
| `SQ_INSTS` | Instr | Number of instructions issued. |
| `SQ_INSTS_VALU` | Instr | Number of Vector Arithmetic Logic Unit (VALU) instructions including MFMA issued. |
| `SQ_INSTS_VALU_ADD_F16` | Instr | Number of VALU Half Precision Floating Point (F16) ADD/SUB instructions issued. |
| `SQ_INSTS_VALU_MUL_F16` | Instr | Number of VALU F16 Multiply instructions issued. |
| `SQ_INSTS_VALU_FMA_F16` | Instr | Number of VALU F16 Fused Multiply Add (FMA)/ Multiply Add (MAD) instructions issued. |
| `SQ_INSTS_VALU_TRANS_F16` | Instr | Number of VALU F16 Transcendental instructions issued. |
| `SQ_INSTS_VALU_ADD_F32` | Instr | Number of VALU Full Precision Floating Point (F32) ADD/SUB instructions issued. |
| `SQ_INSTS_VALU_MUL_F32` | Instr | Number of VALU F32 Multiply instructions issued. |
| `SQ_INSTS_VALU_FMA_F32` | Instr | Number of VALU F32 FMA/MAD instructions issued. |
| `SQ_INSTS_VALU_TRANS_F32` | Instr | Number of VALU F32 Transcendental instructions issued. |
| `SQ_INSTS_VALU_ADD_F64` | Instr | Number of VALU F64 ADD/SUB instructions issued. |
| `SQ_INSTS_VALU_MUL_F64` | Instr | Number of VALU F64 Multiply instructions issued. |
| `SQ_INSTS_VALU_FMA_F64` | Instr | Number of VALU F64 FMA/MAD instructions issued. |
| `SQ_INSTS_VALU_TRANS_F64` | Instr | Number of VALU F64 Transcendental instructions issued. |
| `SQ_INSTS_VALU_INT32` | Instr | Number of VALU 32-bit integer instructions (signed or unsigned) issued. |
| `SQ_INSTS_VALU_INT64` | Instr | Number of VALU 64-bit integer instructions (signed or unsigned) issued. |
| `SQ_INSTS_VALU_CVT` | Instr | Number of VALU Conversion instructions issued. |
| `SQ_INSTS_VALU_MFMA_I8` | Instr | Number of 8-bit Integer MFMA instructions issued. |
| `SQ_INSTS_VALU_MFMA_F16` | Instr | Number of F16 MFMA instructions issued. |
| `SQ_INSTS_VALU_MFMA_BF16` | Instr | Number of Brain Floating Point - 16 (BF16) MFMA instructions issued. |
| `SQ_INSTS_VALU_MFMA_F32` | Instr | Number of F32 MFMA instructions issued. |
| `SQ_INSTS_VALU_MFMA_F64` | Instr | Number of F64 MFMA instructions issued. |
| `SQ_INSTS_MFMA` | Instr | Number of MFMA instructions issued. |
| `SQ_INSTS_VMEM_WR` | Instr | Number of Vector Memory (VMEM) Write instructions (including FLAT) issued. |
| `SQ_INSTS_VMEM_RD` | Instr | Number of VMEM Read instructions (including FLAT) issued. |
| `SQ_INSTS_VMEM` | Instr | Number of VMEM instructions issued, including both FLAT and Buffer instructions. |
| `SQ_INSTS_SALU` | Instr | Number of SALU instructions issued. |
| `SQ_INSTS_SMEM` | Instr | Number of Scalar Memory (SMEM) instructions issued. |
| `SQ_INSTS_SMEM_NORM` | Instr | Number of SMEM instructions normalized to match `smem_level` issued. |
| `SQ_INSTS_FLAT` | Instr | Number of FLAT instructions issued. |
| `SQ_INSTS_FLAT_LDS_ONLY` | Instr | Number of FLAT instructions that read/write only from/to LDS issued. Works only if `EARLY_TA_DONE` is enabled. |
| `SQ_INSTS_LDS` | Instr | Number of Local Data Share (LDS) instructions issued (including FLAT). |
| `SQ_INSTS_GDS` | Instr | Number of Global Data Share (GDS) instructions issued. |
| `SQ_INSTS_EXP_GDS` | Instr | Number of EXP and GDS instructions excluding skipped export instructions issued. |
| `SQ_INSTS_BRANCH` | Instr | Number of Branch instructions issued. |
| `SQ_INSTS_SENDMSG` | Instr | Number of `SENDMSG` instructions including `s_endpgm` issued. |
| `SQ_INSTS_VSKIPPED*` | Instr | Number of vector instructions skipped. |
#### MFMA operation counters
| Hardware Counter | Unit | Definition |
|:----------------------------|:-----|:----------------------------------------------|
| `SQ_INSTS_VALU_MFMA_MOPS_I8` | IOP | Number of 8-bit integer MFMA operations, counted in units of 512 |
| `SQ_INSTS_VALU_MFMA_MOPS_F16` | FLOP | Number of F16 floating-point MFMA operations, counted in units of 512 |
| `SQ_INSTS_VALU_MFMA_MOPS_BF16` | FLOP | Number of BF16 floating-point MFMA operations, counted in units of 512 |
| `SQ_INSTS_VALU_MFMA_MOPS_F32` | FLOP | Number of F32 floating-point MFMA operations, counted in units of 512 |
| `SQ_INSTS_VALU_MFMA_MOPS_F64` | FLOP | Number of F64 floating-point MFMA operations, counted in units of 512 |
#### Level counters
:::{note}
All level counters must be followed by the `SQ_ACCUM_PREV_HIRES` counter to measure average latency.
:::
| Hardware Counter | Unit | Definition |
|:-------------------|:-----|:-------------------------------------|
| `SQ_ACCUM_PREV` | Count | Accumulated counter sample value where accumulation takes place once every four cycles. |
| `SQ_ACCUM_PREV_HIRES` | Count | Accumulated counter sample value where accumulation takes place once every cycle. |
| `SQ_LEVEL_WAVES` | Waves | Number of inflight waves. To calculate the wave latency, divide `SQ_ACCUM_PREV_HIRES` by `SQ_WAVES`. |
| `SQ_INST_LEVEL_VMEM` | Instr | Number of inflight VMEM (including FLAT) instructions. To calculate the VMEM latency, divide `SQ_ACCUM_PREV_HIRES` by `SQ_INSTS_VMEM`. |
| `SQ_INST_LEVEL_SMEM` | Instr | Number of inflight SMEM instructions. To calculate the SMEM latency, divide `SQ_ACCUM_PREV_HIRES` by `SQ_INSTS_SMEM_NORM`. |
| `SQ_INST_LEVEL_LDS` | Instr | Number of inflight LDS (including FLAT) instructions. To calculate the LDS latency, divide `SQ_ACCUM_PREV_HIRES` by `SQ_INSTS_LDS`. |
| `SQ_IFETCH_LEVEL` | Instr | Number of inflight instruction fetch requests from the cache. To calculate the instruction fetch latency, divide `SQ_ACCUM_PREV_HIRES` by `SQ_IFETCH`. |
#### Wavefront counters
| Hardware Counter | Unit | Definition |
|:--------------------|:-----|:----------------------------------------------------------------|
| `SQ_WAVES` | Waves | Number of wavefronts dispatched to Sequencers (SQs), including both new and restored wavefronts |
| `SQ_WAVES_SAVED*` | Waves | Number of context-saved waves |
| `SQ_WAVES_RESTORED*` | Waves | Number of context-restored waves sent to SQs |
| `SQ_WAVES_EQ_64` | Waves | Number of wavefronts with exactly 64 active threads sent to SQs |
| `SQ_WAVES_LT_64` | Waves | Number of wavefronts with less than 64 active threads sent to SQs |
| `SQ_WAVES_LT_48` | Waves | Number of wavefronts with less than 48 active threads sent to SQs |
| `SQ_WAVES_LT_32` | Waves | Number of wavefronts with less than 32 active threads sent to SQs |
| `SQ_WAVES_LT_16` | Waves | Number of wavefronts with less than 16 active threads sent to SQs |
#### Wavefront cycle counters
| Hardware Counter | Unit | Definition |
|:------------------------|:-------|:--------------------------------------------------------------------|
| `SQ_CYCLES` | Cycles | Clock cycles. |
| `SQ_BUSY_CYCLES` | Cycles | Number of cycles while SQ reports it to be busy. |
| `SQ_BUSY_CU_CYCLES` | Qcycles | Number of quad-cycles each CU is busy. |
| `SQ_VALU_MFMA_BUSY_CYCLES` | Cycles | Number of cycles the MFMA ALU is busy. |
| `SQ_WAVE_CYCLES` | Qcycles | Number of quad-cycles spent by waves in the CUs. |
| `SQ_WAIT_ANY` | Qcycles | Number of quad-cycles spent waiting for anything. |
| `SQ_WAIT_INST_ANY` | Qcycles | Number of quad-cycles spent waiting for any instruction to be issued. |
| `SQ_ACTIVE_INST_ANY` | Qcycles | Number of quad-cycles spent by each wave to work on an instruction. |
| `SQ_ACTIVE_INST_VMEM` | Qcycles | Number of quad-cycles spent by the SQ instruction arbiter to work on a VMEM instruction. |
| `SQ_ACTIVE_INST_LDS` | Qcycles | Number of quad-cycles spent by the SQ instruction arbiter to work on an LDS instruction. |
| `SQ_ACTIVE_INST_VALU` | Qcycles | Number of quad-cycles spent by the SQ instruction arbiter to work on a VALU instruction. |
| `SQ_ACTIVE_INST_SCA` | Qcycles | Number of quad-cycles spent by the SQ instruction arbiter to work on a SALU or SMEM instruction. |
| `SQ_ACTIVE_INST_EXP_GDS` | Qcycles | Number of quad-cycles spent by the SQ instruction arbiter to work on an EXPORT or GDS instruction. |
| `SQ_ACTIVE_INST_MISC` | Qcycles | Number of quad-cycles spent by the SQ instruction arbiter to work on a BRANCH or `SENDMSG` instruction. |
| `SQ_ACTIVE_INST_FLAT` | Qcycles | Number of quad-cycles spent by the SQ instruction arbiter to work on a FLAT instruction. |
| `SQ_INST_CYCLES_VMEM_WR` | Qcycles | Number of quad-cycles spent to send address and command data for VMEM Write instructions. |
| `SQ_INST_CYCLES_VMEM_RD` | Qcycles | Number of quad-cycles spent to send address and command data for VMEM Read instructions. |
| `SQ_INST_CYCLES_SMEM` | Qcycles | Number of quad-cycles spent to execute scalar memory reads. |
| `SQ_INST_CYCLES_SALU` | Qcycles | Number of quad-cycles spent to execute non-memory read scalar operations. |
| `SQ_THREAD_CYCLES_VALU` | Cycles | Number of thread-cycles spent to execute VALU operations. This is similar to `INST_CYCLES_VALU` but multiplied by the number of active threads. |
| `SQ_WAIT_INST_LDS` | Qcycles | Number of quad-cycles spent waiting for LDS instruction to be issued. |
#### LDS counters
| Hardware Counter | Unit | Definition |
|:--------------------------|:------|:--------------------------------------------------------|
| `SQ_LDS_ATOMIC_RETURN` | Cycles | Number of atomic return cycles in LDS |
| `SQ_LDS_BANK_CONFLICT` | Cycles | Number of cycles LDS is stalled by bank conflicts |
| `SQ_LDS_ADDR_CONFLICT*` | Cycles | Number of cycles LDS is stalled by address conflicts |
| `SQ_LDS_UNALIGNED_STALL*` | Cycles | Number of cycles LDS is stalled processing flat unaligned load/store ops |
| `SQ_LDS_MEM_VIOLATIONS*` | Count | Number of threads that have a memory violation in the LDS |
| `SQ_LDS_IDX_ACTIVE` | Cycles | Number of cycles LDS is used for indexed operations |
#### Miscellaneous counters
| Hardware Counter | Unit | Definition |
|:--------------------------|:------|:--------------------------------------------------------|
| `SQ_IFETCH` | Count | Number of instruction fetch requests from `L1I` cache, in 32-byte width |
| `SQ_ITEMS` | Threads | Number of valid items per wave |
### L1I and sL1D cache counters
| Hardware Counter | Unit | Definition |
|:----------------------------|:------|:----------------------------------------------------------------|
| `SQC_ICACHE_REQ` | Req | Number of `L1I` cache requests |
| `SQC_ICACHE_HITS` | Count | Number of `L1I` cache hits |
| `SQC_ICACHE_MISSES` | Count | Number of non-duplicate `L1I` cache misses including uncached requests |
| `SQC_ICACHE_MISSES_DUPLICATE` | Count | Number of duplicate `L1I` cache misses whose previous lookup miss on the same cache line is not fulfilled yet |
| `SQC_DCACHE_REQ` | Req | Number of `sL1D` cache requests |
| `SQC_DCACHE_INPUT_VALID_READYB` | Cycles | Number of cycles while SQ input is valid but sL1D cache is not ready |
| `SQC_DCACHE_HITS` | Count | Number of `sL1D` cache hits |
| `SQC_DCACHE_MISSES` | Count | Number of non-duplicate `sL1D` cache misses including uncached requests |
| `SQC_DCACHE_MISSES_DUPLICATE` | Count | Number of duplicate `sL1D` cache misses |
| `SQC_DCACHE_REQ_READ_1` | Req | Number of constant cache read requests in a single DW (dword) |
| `SQC_DCACHE_REQ_READ_2` | Req | Number of constant cache read requests in two DW |
| `SQC_DCACHE_REQ_READ_4` | Req | Number of constant cache read requests in four DW |
| `SQC_DCACHE_REQ_READ_8` | Req | Number of constant cache read requests in eight DW |
| `SQC_DCACHE_REQ_READ_16` | Req | Number of constant cache read requests in 16 DW |
| `SQC_DCACHE_ATOMIC*` | Req | Number of atomic requests |
| `SQC_TC_REQ` | Req | Number of TC requests that were issued by instruction and constant caches |
| `SQC_TC_INST_REQ` | Req | Number of instruction requests to the L2 cache |
| `SQC_TC_DATA_READ_REQ` | Req | Number of data Read requests to the L2 cache |
| `SQC_TC_DATA_WRITE_REQ*` | Req | Number of data write requests to the L2 cache |
| `SQC_TC_DATA_ATOMIC_REQ*` | Req | Number of data atomic requests to the L2 cache |
| `SQC_TC_STALL*` | Cycles | Number of cycles while the valid requests to the L2 cache are stalled |
### Vector L1 cache subsystem
The vector L1 cache subsystem counters are further classified into Texture Addressing Unit (TA), Texture Data Unit (TD), vector L1D cache or Texture Cache per Pipe (TCP), and Texture Cache Arbiter (TCA) counters.
#### TA counters
| Hardware Counter | Unit | Definition |
|:--------------------------------|:------|:------------------------------------------------|
| `TA_TA_BUSY[n]` | Cycles | TA busy cycles. Value range for n: [0-15]. |
| `TA_TOTAL_WAVEFRONTS[n]` | Instr | Number of wavefronts processed by TA. Value range for n: [0-15]. |
| `TA_BUFFER_WAVEFRONTS[n]` | Instr | Number of buffer wavefronts processed by TA. Value range for n: [0-15]. |
| `TA_BUFFER_READ_WAVEFRONTS[n]` | Instr | Number of buffer read wavefronts processed by TA. Value range for n: [0-15]. |
| `TA_BUFFER_WRITE_WAVEFRONTS[n]` | Instr | Number of buffer write wavefronts processed by TA. Value range for n: [0-15]. |
| `TA_BUFFER_ATOMIC_WAVEFRONTS[n]` | Instr | Number of buffer atomic wavefronts processed by TA. Value range for n: [0-15]. |
| `TA_BUFFER_TOTAL_CYCLES[n]` | Cycles | Number of buffer cycles (including read and write) issued to TC. Value range for n: [0-15]. |
| `TA_BUFFER_COALESCED_READ_CYCLES[n]` | Cycles | Number of coalesced buffer read cycles issued to TC. Value range for n: [0-15]. |
| `TA_BUFFER_COALESCED_WRITE_CYCLES[n]` | Cycles | Number of coalesced buffer write cycles issued to TC. Value range for n: [0-15]. |
| `TA_ADDR_STALLED_BY_TC_CYCLES[n]` | Cycles | Number of cycles TA address path is stalled by TC. Value range for n: [0-15]. |
| `TA_DATA_STALLED_BY_TC_CYCLES[n]` | Cycles | Number of cycles TA data path is stalled by TC. Value range for n: [0-15]. |
| `TA_ADDR_STALLED_BY_TD_CYCLES[n]` | Cycles | Number of cycles TA address path is stalled by TD. Value range for n: [0-15]. |
| `TA_FLAT_WAVEFRONTS[n]` | Instr | Number of flat opcode wavefronts processed by TA. Value range for n: [0-15]. |
| `TA_FLAT_READ_WAVEFRONTS[n]` | Instr | Number of flat opcode read wavefronts processed by TA. Value range for n: [0-15]. |
| `TA_FLAT_WRITE_WAVEFRONTS[n]` | Instr | Number of flat opcode write wavefronts processed by TA. Value range for n: [0-15]. |
| `TA_FLAT_ATOMIC_WAVEFRONTS[n]` | Instr | Number of flat opcode atomic wavefronts processed by TA. Value range for n: [0-15]. |
#### TD counters
| Hardware Counter | Unit | Definition |
|:------------------------|:-----|:---------------------------------------------------|
| `TD_TD_BUSY[n]` | Cycle | TD busy cycles while it is processing or waiting for data. Value range for n: [0-15]. |
| `TD_TC_STALL[n]` | Cycle | Number of cycles TD is stalled waiting for TC data. Value range for n: [0-15]. |
| `TD_SPI_STALL[n]` | Cycle | Number of cycles TD is stalled by SPI. Value range for n: [0-15]. |
| `TD_LOAD_WAVEFRONT[n]` | Instr | Number of wavefront instructions (read/write/atomic). Value range for n: [0-15]. |
| `TD_STORE_WAVEFRONT[n]` | Instr | Number of write wavefront instructions. Value range for n: [0-15]. |
| `TD_ATOMIC_WAVEFRONT[n]` | Instr | Number of atomic wavefront instructions. Value range for n: [0-15]. |
| `TD_COALESCABLE_WAVEFRONT[n]` | Instr | Number of coalescable wavefronts according to TA. Value range for n: [0-15]. |
#### TCP counters
| Hardware Counter | Unit | Definition |
|:-----------------------------------|:------|:----------------------------------------------------------|
| `TCP_GATE_EN1[n]` | Cycles | Number of cycles vL1D interface clocks are turned on. Value range for n: [0-15]. |
| `TCP_GATE_EN2[n]` | Cycles | Number of cycles vL1D core clocks are turned on. Value range for n: [0-15]. |
| `TCP_TD_TCP_STALL_CYCLES[n]` | Cycles | Number of cycles TD stalls vL1D. Value range for n: [0-15]. |
| `TCP_TCR_TCP_STALL_CYCLES[n]` | Cycles | Number of cycles TCR stalls vL1D. Value range for n: [0-15]. |
| `TCP_READ_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tagram conflict stalls on a read. Value range for n: [0-15]. |
| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tagram conflict stalls on a write. Value range for n: [0-15]. |
| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tagram conflict stalls on an atomic. Value range for n: [0-15]. |
| `TCP_PENDING_STALL_CYCLES[n]` | Cycles | Number of cycles vL1D cache is stalled due to data pending from L2 Cache. Value range for n: [0-15]. |
| `TCP_TCP_TA_DATA_STALL_CYCLES` | Cycles | Number of cycles TCP stalls TA data interface. |
| `TCP_TA_TCP_STATE_READ[n]` | Req | Number of state reads. Value range for n: [0-15]. |
| `TCP_VOLATILE[n]` | Req | Number of L1 volatile pixels/buffers from TA. Value range for n: [0-15]. |
| `TCP_TOTAL_ACCESSES[n]` | Req | Number of vL1D accesses. Equals `TCP_PERF_SEL_TOTAL_READ`+`TCP_PERF_SEL_TOTAL_NONREAD`. Value range for n: [0-15]. |
| `TCP_TOTAL_READ[n]` | Req | Number of vL1D read accesses. Equals `TCP_PERF_SEL_TOTAL_HIT_LRU_READ` + `TCP_PERF_SEL_TOTAL_MISS_LRU_READ` + `TCP_PERF_SEL_TOTAL_MISS_EVICT_READ`. Value range for n: [0-15]. |
| `TCP_TOTAL_WRITE[n]` | Req | Number of vL1D write accesses. Equals `TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE` + `TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE`. Value range for n: [0-15]. |
| `TCP_TOTAL_ATOMIC_WITH_RET[n]` | Req | Number of vL1D atomic requests with return. Value range for n: [0-15]. |
| `TCP_TOTAL_ATOMIC_WITHOUT_RET[n]` | Req | Number of vL1D atomic without return. Value range for n: [0-15]. |
| `TCP_TOTAL_WRITEBACK_INVALIDATES[n]` | Count | Total number of vL1D writebacks and invalidates. Equals `TCP_PERF_SEL_TOTAL_WBINVL1`+ `TCP_PERF_SEL_TOTAL_WBINVL1_VOL`+ `TCP_PERF_SEL_CP_TCP_INVALIDATE`+ `TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL`. Value range for n: [0-15]. |
| `TCP_UTCL1_REQUEST[n]` | Req | Number of address translation requests to UTCL1. Value range for n: [0-15]. |
| `TCP_UTCL1_TRANSLATION_HIT[n]` | Req | Number of UTCL1 translation hits. Value range for n: [0-15]. |
| `TCP_UTCL1_TRANSLATION_MISS[n]` | Req | Number of UTCL1 translation misses. Value range for n: [0-15]. |
| `TCP_UTCL1_PERMISSION_MISS[n]` | Req | Number of UTCL1 permission misses. Value range for n: [0-15]. |
| `TCP_TOTAL_CACHE_ACCESSES[n]` | Req | Number of vL1D cache accesses including hits and misses. Value range for n: [0-15]. |
| `TCP_TCP_LATENCY[n]` | Cycles | Accumulated wave access latency to vL1D over all wavefronts. Value range for n: [0-15]. |
| `TCP_TCC_READ_REQ_LATENCY[n]` | Cycles | Total vL1D to L2 request latency over all wavefronts for reads and atomics with return. Value range for n: [0-15]. |
| `TCP_TCC_WRITE_REQ_LATENCY[n]` | Cycles | Total vL1D to L2 request latency over all wavefronts for writes and atomics without return. Value range for n: [0-15]. |
| `TCP_TCC_READ_REQ[n]` | Req | Number of read requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_WRITE_REQ[n]` | Req | Number of write requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_ATOMIC_WITH_RET_REQ[n]` | Req | Number of atomic requests to L2 cache with return. Value range for n: [0-15]. |
| `TCP_TCC_ATOMIC_WITHOUT_RET_REQ[n]` | Req | Number of atomic requests to L2 cache without return. Value range for n: [0-15]. |
| `TCP_TCC_NC_READ_REQ[n]` | Req | Number of NC read requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_UC_READ_REQ[n]` | Req | Number of UC read requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_CC_READ_REQ[n]` | Req | Number of CC read requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_RW_READ_REQ[n]` | Req | Number of RW read requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_NC_WRITE_REQ[n]` | Req | Number of NC write requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_UC_WRITE_REQ[n]` | Req | Number of UC write requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_CC_WRITE_REQ[n]` | Req | Number of CC write requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_RW_WRITE_REQ[n]` | Req | Number of RW write requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_NC_ATOMIC_REQ[n]` | Req | Number of NC atomic requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_UC_ATOMIC_REQ[n]` | Req | Number of UC atomic requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_CC_ATOMIC_REQ[n]` | Req | Number of CC atomic requests to L2 cache. Value range for n: [0-15]. |
| `TCP_TCC_RW_ATOMIC_REQ[n]` | Req | Number of RW atomic requests to L2 cache. Value range for n: [0-15]. |
#### TCA counters
| Hardware Counter | Unit | Definition |
|:----------------|:------|:------------------------------------------|
| `TCA_CYCLE[n]` | Cycles | Number of TCA cycles. Value range for n: [0-31]. |
| `TCA_BUSY[n]` | Cycles | Number of cycles TCA has a pending request. Value range for n: [0-31]. |
### L2 cache access counters
L2 Cache is also known as Texture Cache per Channel (TCC).
| Hardware Counter | Unit | Definition |
|:--------------------------------|:------|:-------------------------------------------------------------|
| `TCC_CYCLE[n]` |Cycle | Number of L2 cache free-running clocks. Value range for n: [0-31]. |
| `TCC_BUSY[n]` |Cycle | Number of L2 cache busy cycles. Value range for n: [0-31]. |
| `TCC_REQ[n]` |Req | Number of L2 cache requests of all types. This is measured at the tag block. This may be more than the number of requests arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed. Value range for n: [0-31]. |
| `TCC_STREAMING_REQ[n]` |Req | Number of L2 cache streaming requests. This is measured at the tag block. Value range for n: [0-31]. |
| `TCC_NC_REQ[n]` |Req | Number of NC requests. This is measured at the tag block. Value range for n: [0-31]. |
| `TCC_UC_REQ[n]` |Req | Number of UC requests. This is measured at the tag block. Value range for n: [0-31]. |
| `TCC_CC_REQ[n]` |Req | Number of CC requests. This is measured at the tag block. Value range for n: [0-31]. |
| `TCC_RW_REQ[n]` |Req | Number of RW requests. This is measured at the tag block. Value range for n: [0-31]. |
| `TCC_PROBE[n]` |Req | Number of probe requests. Value range for n: [0-31]. |
| `TCC_PROBE_ALL[n]` |Req | Number of external probe requests with `EA_TCC_preq_all` == 1. Value range for n: [0-31]. |
| `TCC_READ[n]` |Req | Number of L2 cache read requests. This includes compressed reads but not metadata reads. Value range for n: [0-31]. |
| `TCC_WRITE[n]` |Req | Number of L2 cache write requests. Value range for n: [0-31]. |
| `TCC_ATOMIC[n]` |Req | Number of L2 cache atomic requests of all types. Value range for n: [0-31]. |
| `TCC_HIT[n]` |Req | Number of L2 cache hits. Value range for n: [0-31]. |
| `TCC_MISS[n]` |Req | Number of L2 cache misses. Value range for n: [0-31]. |
| `TCC_WRITEBACK[n]` |Req | Number of lines written back to the main memory, including writebacks of dirty lines and uncached write/atomic requests. Value range for n: [0-31]. |
| `TCC_EA_WRREQ[n]` |Req | Number of 32-byte and 64-byte transactions going over the `TC_EA_wrreq` interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. Value range for n: [0-31]. |
| `TCC_EA_WRREQ_64B[n]` |Req | Total number of 64-byte transactions (write or `CMPSWAP`) going over the `TC_EA_wrreq` interface. Value range for n: [0-31]. |
| `TCC_EA_WR_UNCACHED_32B[n]` |Req | Number of 32-byte write/atomic going over the `TC_EA_wrreq` interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request is counted as 2. Value range for n: [0-31].|
| `TCC_EA_WRREQ_STALL[n]` | Cycles | Number of cycles a write request is stalled. Value range for n: [0-31]. |
| `TCC_EA_WRREQ_IO_CREDIT_STALL[n]` | Cycles | Number of cycles an EA write request is stalled due to the interface running out of IO credits. Value range for n: [0-31]. |
| `TCC_EA_WRREQ_GMI_CREDIT_STALL[n]` | Cycles | Number of cycles an EA write request is stalled due to the interface running out of GMI credits. Value range for n: [0-31]. |
| `TCC_EA_WRREQ_DRAM_CREDIT_STALL[n]` | Cycles | Number of cycles an EA write request is stalled due to the interface running out of DRAM credits. Value range for n: [0-31]. |
| `TCC_TOO_MANY_EA_WRREQS_STALL[n]` | Cycles | Number of cycles the L2 cache is unable to send an EA write request due to it reaching its maximum capacity of pending EA write requests. Value range for n: [0-31]. |
| `TCC_EA_WRREQ_LEVEL[n]` | Req | The accumulated number of EA write requests in flight. This is primarily intended to measure average EA write latency. Average write latency = `TCC_PERF_SEL_EA_WRREQ_LEVEL`/`TCC_PERF_SEL_EA_WRREQ`. Value range for n: [0-31]. |
| `TCC_EA_ATOMIC[n]` | Req | Number of 32-byte or 64-byte atomic requests going over the `TC_EA_wrreq` interface. Value range for n: [0-31]. |
| `TCC_EA_ATOMIC_LEVEL[n]` | Req | The accumulated number of EA atomic requests in flight. This is primarily intended to measure average EA atomic latency. Average atomic latency = `TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL`/`TCC_PERF_SEL_EA_WRREQ_ATOMIC`. Value range for n: [0-31]. |
| `TCC_EA_RDREQ[n]` | Req | Number of 32-byte or 64-byte read requests to EA. Value range for n: [0-31]. |
| `TCC_EA_RDREQ_32B[n]` | Req | Number of 32-byte read requests to EA. Value range for n: [0-31]. |
| `TCC_EA_RD_UNCACHED_32B[n]` | Req | Number of 32-byte EA reads due to uncached traffic. A 64-byte request is counted as 2. Value range for n: [0-31]. |
| `TCC_EA_RDREQ_IO_CREDIT_STALL[n]` | Cycles | Number of cycles there is a stall due to the read request interface running out of IO credits. Stalls occur irrespective of the need for a read to be performed. Value range for n: [0-31]. |
| `TCC_EA_RDREQ_GMI_CREDIT_STALL[n]` | Cycles | Number of cycles there is a stall due to the read request interface running out of GMI credits. Stalls occur irrespective of the need for a read to be performed. Value range for n: [0-31]. |
| `TCC_EA_RDREQ_DRAM_CREDIT_STALL[n]` | Cycles | Number of cycles there is a stall due to the read request interface running out of DRAM credits. Stalls occur irrespective of the need for a read to be performed. Value range for n: [0-31]. |
| `TCC_EA_RDREQ_LEVEL[n]` | Req | The accumulated number of EA read requests in flight. This is primarily intended to measure average EA read latency. Average read latency = `TCC_PERF_SEL_EA_RDREQ_LEVEL`/`TCC_PERF_SEL_EA_RDREQ`. Value range for n: [0-31]. |
| `TCC_EA_RDREQ_DRAM[n]` | Req | Number of 32-byte or 64-byte EA read requests to High Bandwidth Memory (HBM). Value range for n: [0-31]. |
| `TCC_EA_WRREQ_DRAM[n]` | Req | Number of 32-byte or 64-byte EA write requests to HBM. Value range for n: [0-31]. |
| `TCC_TAG_STALL[n]` | Cycles | Number of cycles the normal request pipeline in the tag is stalled for any reason. Normally, stalls of this nature are measured exactly at one point in the pipeline; however, in the case of this counter, probes can stall the pipeline at a variety of places, and there is no single point that can reasonably measure the total stalls accurately. Value range for n: [0-31]. |
| `TCC_NORMAL_WRITEBACK[n]` | Req | Number of writebacks due to requests that are not writeback requests. Value range for n: [0-31]. |
| `TCC_ALL_TC_OP_WB_WRITEBACK[n]` | Req | Number of writebacks due to all `TC_OP` writeback requests. Value range for n: [0-31]. |
| `TCC_NORMAL_EVICT[n]` | Req | Number of evictions due to requests that are not invalidate or probe requests. Value range for n: [0-31]. |
| `TCC_ALL_TC_OP_INV_EVICT[n]` | Req | Number of evictions due to all `TC_OP` invalidate requests. Value range for n: [0-31]. |
## MI200 derived metrics list
| Derived Metric | Description |
|:----------------|:-------------------------------------------------------------------------------------|
| `ALUStalledByLDS` | Percentage of GPU time ALU units are stalled due to the LDS input queue being full or the output queue not being ready. Reduce this by reducing the LDS bank conflicts or the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). |
| `FetchSize` | Total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. |
| `FlatLDSInsts` | Average number of FLAT instructions that read from or write to LDS, executed per work item (affected by flow control). |
| `FlatVMemInsts` | Average number of FLAT instructions that read from or write to the video memory, executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. |
| `GDSInsts` | Average number of GDS read/write instructions executed per work item (affected by flow control). |
| `GPUBusy` | Percentage of time GPU is busy. |
| `L2CacheHit` | Percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). |
| `LDSBankConflict` | Percentage of GPU time LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). |
| `LDSInsts` | Average number of LDS read/write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. |
| `MemUnitBusy` | Percentage of GPU time the memory unit is active. The result includes the stall time (`MemUnitStalled`). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). |
| `MemUnitStalled` | Percentage of GPU time the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). |
| `MemWrites32B` | Total number of effective 32B write transactions to the memory. |
| `SALUBusy` | Percentage of GPU time scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). |
| `SALUInsts` | Average number of scalar ALU instructions executed per work item (affected by flow control). |
| `SFetchInsts` | Average number of scalar fetch instructions from the video memory executed per work item (affected by flow control). |
| `TA_ADDR_STALLED_BY_TC_CYCLES_sum` | Total number of cycles TA address path is stalled by TC, over all TA instances. |
| `TA_ADDR_STALLED_BY_TD_CYCLES_sum` | Total number of cycles TA address path is stalled by TD, over all TA instances. |
| `TA_BUFFER_WAVEFRONTS_sum` | Total number of buffer wavefronts processed by all TA instances. |
| `TA_BUFFER_READ_WAVEFRONTS_sum` | Total number of buffer read wavefronts processed by all TA instances. |
| `TA_BUFFER_WRITE_WAVEFRONTS_sum` | Total number of buffer write wavefronts processed by all TA instances. |
| `TA_BUFFER_ATOMIC_WAVEFRONTS_sum` | Total number of buffer atomic wavefronts processed by all TA instances. |
| `TA_BUFFER_TOTAL_CYCLES_sum` | Total number of buffer cycles (including read and write) issued to TC by all TA instances. |
| `TA_BUFFER_COALESCED_READ_CYCLES_sum` | Total number of coalesced buffer read cycles issued to TC by all TA instances. |
| `TA_BUFFER_COALESCED_WRITE_CYCLES_sum` | Total number of coalesced buffer write cycles issued to TC by all TA instances. |
| `TA_BUSY_avr` | Average number of busy cycles over all TA instances. |
| `TA_BUSY_max` | Maximum number of TA busy cycles over all TA instances. |
| `TA_BUSY_min` | Minimum number of TA busy cycles over all TA instances. |
| `TA_DATA_STALLED_BY_TC_CYCLES_sum` | Total number of cycles TA data path is stalled by TC, over all TA instances. |
| `TA_FLAT_WAVEFRONTS_sum` | Total number of flat opcode wavefronts processed by all TA instances. |
| `TA_FLAT_READ_WAVEFRONTS_sum` | Total number of flat opcode read wavefronts processed by all TA instances. |
| `TA_FLAT_WRITE_WAVEFRONTS_sum` | Total number of flat opcode write wavefronts processed by all TA instances. |
| `TA_FLAT_ATOMIC_WAVEFRONTS_sum` | Total number of flat opcode atomic wavefronts processed by all TA instances. |
| `TA_TA_BUSY_sum` | Total number of TA busy cycles over all TA instances. |
| `TA_TOTAL_WAVEFRONTS_sum` | Total number of wavefronts processed by all TA instances. |
| `TCA_BUSY_sum` | Total number of cycles TCA has a pending request, over all TCA instances. |
| `TCA_CYCLE_sum` | Total number of cycles over all TCA instances. |
| `TCC_ALL_TC_OP_WB_WRITEBACK_sum` | Total number of writebacks due to all TC_OP writeback requests, over all TCC instances. |
| `TCC_ALL_TC_OP_INV_EVICT_sum` | Total number of evictions due to all TC_OP invalidate requests, over all TCC instances. |
| `TCC_ATOMIC_sum` | Total number of L2 cache atomic requests of all types, over all TCC instances. |
| `TCC_BUSY_avr` | Average number of L2 cache busy cycles, over all TCC instances. |
| `TCC_BUSY_sum` | Total number of L2 cache busy cycles, over all TCC instances. |
| `TCC_CC_REQ_sum` | Total number of CC requests over all TCC instances. |
| `TCC_CYCLE_sum` | Total number of L2 cache free running clocks, over all TCC instances. |
| `TCC_EA_WRREQ_sum` | Total number of 32-byte and 64-byte transactions going over the TC_EA_wrreq interface, over all TCC instances. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. |
| `TCC_EA_WRREQ_64B_sum` | Total number of 64-byte transactions (write or `CMPSWAP`) going over the TC_EA_wrreq interface, over all TCC instances. |
| `TCC_EA_WR_UNCACHED_32B_sum` | Total number of 32-byte write/atomic requests going over the TC_EA_wrreq interface due to uncached traffic, over all TCC instances. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request is counted as 2. |
| `TCC_EA_WRREQ_STALL_sum` | Total number of cycles a write request is stalled, over all instances. |
| `TCC_EA_WRREQ_IO_CREDIT_STALL_sum` | Total number of cycles an EA write request is stalled due to the interface running out of IO credits, over all instances. |
| `TCC_EA_WRREQ_GMI_CREDIT_STALL_sum` | Total number of cycles an EA write request is stalled due to the interface running out of GMI credits, over all instances. |
| `TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum` | Total number of cycles an EA write request is stalled due to the interface running out of DRAM credits, over all instances. |
| `TCC_EA_WRREQ_LEVEL_sum` | Total number of EA write requests in flight over all TCC instances. |
| `TCC_EA_RDREQ_LEVEL_sum` | Total number of EA read requests in flight over all TCC instances. |
| `TCC_EA_ATOMIC_sum` | Total number of 32-byte or 64-byte atomic requests going over the TC_EA_wrreq interface, over all TCC instances. |
| `TCC_EA_ATOMIC_LEVEL_sum` | Total number of EA atomic requests in flight, over all TCC instances. |
| `TCC_EA_RDREQ_sum` | Total number of 32-byte or 64-byte read requests to EA, over all TCC instances. |
| `TCC_EA_RDREQ_32B_sum` | Total number of 32-byte read requests to EA, over all TCC instances. |
| `TCC_EA_RD_UNCACHED_32B_sum` | Total number of 32-byte EA reads due to uncached traffic, over all TCC instances. |
| `TCC_EA_RDREQ_IO_CREDIT_STALL_sum` | Total number of cycles there is a stall due to the read request interface running out of IO credits, over all TCC instances. |
| `TCC_EA_RDREQ_GMI_CREDIT_STALL_sum` | Total number of cycles there is a stall due to the read request interface running out of GMI credits, over all TCC instances. |
| `TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum` | Total number of cycles there is a stall due to the read request interface running out of DRAM credits, over all TCC instances. |
| `TCC_EA_RDREQ_DRAM_sum` | Total number of 32-byte or 64-byte EA read requests to HBM, over all TCC instances. |
| `TCC_EA_WRREQ_DRAM_sum` | Total number of 32-byte or 64-byte EA write requests to HBM, over all TCC instances. |
| `TCC_HIT_sum` | Total number of L2 cache hits over all TCC instances. |
| `TCC_MISS_sum` | Total number of L2 cache misses over all TCC instances. |
| `TCC_NC_REQ_sum` | Total number of NC requests over all TCC instances. |
| `TCC_NORMAL_WRITEBACK_sum` | Total number of writebacks due to requests that are not writeback requests, over all TCC instances. |
| `TCC_NORMAL_EVICT_sum` | Total number of evictions due to requests that are not invalidate or probe requests, over all TCC instances. |
| `TCC_PROBE_sum` | Total number of probe requests over all TCC instances. |
| `TCC_PROBE_ALL_sum` | Total number of external probe requests with EA_TCC_preq_all == 1, over all TCC instances. |
| `TCC_READ_sum` | Total number of L2 cache read requests (including compressed reads but not metadata reads) over all TCC instances. |
| `TCC_REQ_sum` | Total number of all types of L2 cache requests over all TCC instances. |
| `TCC_RW_REQ_sum` | Total number of RW requests over all TCC instances. |
| `TCC_STREAMING_REQ_sum` | Total number of L2 cache streaming requests over all TCC instances. |
| `TCC_TAG_STALL_sum` | Total number of cycles the normal request pipeline in the tag is stalled for any reason, over all TCC instances. |
| `TCC_TOO_MANY_EA_WRREQS_STALL_sum` | Total number of cycles L2 cache is unable to send an EA write request due to it reaching its maximum capacity of pending EA write requests, over all TCC instances. |
| `TCC_UC_REQ_sum` | Total number of UC requests over all TCC instances. |
| `TCC_WRITE_sum` | Total number of L2 cache write requests over all TCC instances. |
| `TCC_WRITEBACK_sum` | Total number of lines written back to the main memory including writebacks of dirty lines and uncached write/atomic requests, over all TCC instances. |
| `TCC_WRREQ_STALL_max` | Maximum number of cycles a write request is stalled, over all TCC instances. |
| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on an atomic, over all TCP instances. |
| `TCP_GATE_EN1_sum` | Total number of cycles vL1D interface clocks are turned on, over all TCP instances. |
| `TCP_GATE_EN2_sum` | Total number of cycles vL1D core clocks are turned on, over all TCP instances. |
| `TCP_PENDING_STALL_CYCLES_sum` | Total number of cycles vL1D cache is stalled due to data pending from L2 Cache, over all TCP instances. |
| `TCP_READ_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on a read, over all TCP instances. |
| `TCP_TA_TCP_STATE_READ_sum` | Total number of state reads by all TCP instances. |
| `TCP_TCC_ATOMIC_WITH_RET_REQ_sum` | Total number of atomic requests to L2 cache with return, over all TCP instances. |
| `TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum` | Total number of atomic requests to L2 cache without return, over all TCP instances. |
| `TCP_TCC_CC_READ_REQ_sum` | Total number of CC read requests to L2 cache, over all TCP instances. |
| `TCP_TCC_CC_WRITE_REQ_sum` | Total number of CC write requests to L2 cache, over all TCP instances. |
| `TCP_TCC_CC_ATOMIC_REQ_sum` | Total number of CC atomic requests to L2 cache, over all TCP instances. |
| `TCP_TCC_NC_READ_REQ_sum` | Total number of NC read requests to L2 cache, over all TCP instances. |
| `TCP_TCC_NC_WRITE_REQ_sum` | Total number of NC write requests to L2 cache, over all TCP instances. |
| `TCP_TCC_NC_ATOMIC_REQ_sum` | Total number of NC atomic requests to L2 cache, over all TCP instances. |
| `TCP_TCC_READ_REQ_LATENCY_sum` | Total vL1D to L2 request latency over all wavefronts for reads and atomics with return for all TCP instances. |
| `TCP_TCC_READ_REQ_sum` | Total number of read requests to L2 cache, over all TCP instances. |
| `TCP_TCC_RW_READ_REQ_sum` | Total number of RW read requests to L2 cache, over all TCP instances. |
| `TCP_TCC_RW_WRITE_REQ_sum` | Total number of RW write requests to L2 cache, over all TCP instances. |
| `TCP_TCC_RW_ATOMIC_REQ_sum` | Total number of RW atomic requests to L2 cache, over all TCP instances. |
| `TCP_TCC_UC_READ_REQ_sum` | Total number of UC read requests to L2 cache, over all TCP instances. |
| `TCP_TCC_UC_WRITE_REQ_sum` | Total number of UC write requests to L2 cache, over all TCP instances. |
| `TCP_TCC_UC_ATOMIC_REQ_sum` | Total number of UC atomic requests to L2 cache, over all TCP instances. |
| `TCP_TCC_WRITE_REQ_LATENCY_sum` | Total vL1D to L2 request latency over all wavefronts for writes and atomics without return for all TCP instances. |
| `TCP_TCC_WRITE_REQ_sum` | Total number of write requests to L2 cache, over all TCP instances. |
| `TCP_TCP_LATENCY_sum` | Total wave access latency to vL1D over all wavefronts for all TCP instances. |
| `TCP_TCR_TCP_STALL_CYCLES_sum` | Total number of cycles TCR stalls vL1D, over all TCP instances. |
| `TCP_TD_TCP_STALL_CYCLES_sum` | Total number of cycles TD stalls vL1D, over all TCP instances. |
| `TCP_TOTAL_ACCESSES_sum` | Total number of vL1D accesses, over all TCP instances. |
| `TCP_TOTAL_READ_sum` | Total number of vL1D read accesses, over all TCP instances. |
| `TCP_TOTAL_WRITE_sum` | Total number of vL1D write accesses, over all TCP instances. |
| `TCP_TOTAL_ATOMIC_WITH_RET_sum` | Total number of vL1D atomic requests with return, over all TCP instances. |
| `TCP_TOTAL_ATOMIC_WITHOUT_RET_sum` | Total number of vL1D atomic requests without return, over all TCP instances. |
| `TCP_TOTAL_CACHE_ACCESSES_sum` | Total number of vL1D cache accesses (including hits and misses) by all TCP instances. |
| `TCP_TOTAL_WRITEBACK_INVALIDATES_sum` | Total number of vL1D writebacks and invalidates, over all TCP instances. |
| `TCP_UTCL1_PERMISSION_MISS_sum` | Total number of UTCL1 permission misses by all TCP instances. |
| `TCP_UTCL1_REQUEST_sum` | Total number of address translation requests to UTCL1 by all TCP instances. |
| `TCP_UTCL1_TRANSLATION_MISS_sum` | Total number of UTCL1 translation misses by all TCP instances. |
| `TCP_UTCL1_TRANSLATION_HIT_sum` | Total number of UTCL1 translation hits by all TCP instances. |
| `TCP_VOLATILE_sum` | Total number of L1 volatile pixels/buffers from TA, over all TCP instances. |
| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on a write, over all TCP instances. |
| `TD_ATOMIC_WAVEFRONT_sum` | Total number of atomic wavefront instructions, over all TD instances. |
| `TD_COALESCABLE_WAVEFRONT_sum` | Total number of coalescable wavefronts according to TA, over all TD instances. |
| `TD_LOAD_WAVEFRONT_sum` | Total number of wavefront instructions (read/write/atomic), over all TD instances. |
| `TD_SPI_STALL_sum` | Total number of cycles TD is stalled by SPI, over all TD instances. |
| `TD_STORE_WAVEFRONT_sum` | Total number of write wavefront instructions, over all TD instances. |
| `TD_TC_STALL_sum` | Total number of cycles TD is stalled waiting for TC data, over all TD instances. |
| `TD_TD_BUSY_sum` | Total number of TD busy cycles while it is processing or waiting for data, over all TD instances. |
| `VALUBusy` | Percentage of GPU time vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). |
| `VALUInsts` | Average number of vector ALU instructions executed per work item (affected by flow control). |
| `VALUUtilization` | Percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). |
| `VFetchInsts` | Average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. |
| `VWriteInsts` | Average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. |
| `Wavefronts` | Total wavefronts. |
| `WRITE_REQ_32B` | Total number of 32-byte effective memory writes. |
| `WriteSize` | Total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. |
| `WriteUnitStalled` | Percentage of GPU time the write unit is stalled. Value range: 0% to 100% (bad). |
## Abbreviations
| Abbreviation | Meaning |
|:------------|:--------------------------------------------------------------------------------|
| `ALU` | Arithmetic Logic Unit |
| `Arb` | Arbiter |
| `BF16` | Brain Floating Point - 16 bits |
| `CC` | Coherently Cached |
| `CP` | Command Processor |
| `CPC` | Command Processor - Compute |
| `CPF` | Command Processor - Fetcher |
| `CS` | Compute Shader |
| `CSC` | Compute Shader Controller |
| `CSn` | Compute Shader, the n-th pipe |
| `CU` | Compute Unit |
| `DW` | 32-bit Data Word, DWORD |
| `EA` | Efficiency Arbiter |
| `F16` | Half Precision Floating Point |
| `F32` | Full Precision Floating Point |
| `FLAT` | FLAT instructions allow read/write/atomic access to a generic memory address pointer, which can resolve to any of the following physical memories:<br>- Global Memory<br>- Scratch ("private")<br>- LDS ("shared")<br>- Invalid - MEM_VIOL TrapStatus |
| `FMA` | Fused Multiply Add |
| `GDS` | Global Data Share |
| `GRBM` | Graphics Register Bus Manager |
| `HBM` | High Bandwidth Memory |
| `Instr` | Instructions |
| `IOP` | Integer Operation |
| `L2` | Level-2 Cache |
| `LDS` | Local Data Share |
| `ME1` | Micro Engine, running packet processing firmware on CPC |
| `MFMA` | Matrix Fused Multiply Add |
| `NC` | Noncoherently Cached |
| `RW` | Coherently Cached with Write |
| `SALU` | Scalar ALU |
| `SGPR` | Scalar General Purpose Register |
| `SIMD` | Single Instruction Multiple Data |
| `sL1D` | Scalar Level-1 Data Cache |
| `SMEM` | Scalar Memory |
| `SPI` | Shader Processor Input |
| `SQ` | Sequencer |
| `TA` | Texture Addressing Unit |
| `TC` | Texture Cache |
| `TCA` | Texture Cache Arbiter |
| `TCC` | Texture Cache per Channel, known as L2 Cache |
| `TCIU` | Texture Cache Interface Unit (interface between CP and the memory system) |
| `TCP` | Texture Cache per Pipe, known as vector L1 Cache |
| `TCR` | Texture Cache Router |
| `TD` | Texture Data Unit |
| `UC` | Uncached |
| `UTCL1` | Unified Translation Cache - Level 1 |
| `UTCL2` | Unified Translation Cache - Level 2 |
| `VALU` | Vector ALU |
| `VGPR` | Vector General Purpose Register |
| `vL1D` | Vector Level-1 Data Cache |
| `VMEM` | Vector Memory |

View File

@@ -1,19 +1,19 @@
# AMD Instinct Hardware
<head>
<meta charset="UTF-8">
<meta name="description" content="AMD Instinct MI250 microarchitecture">
<meta name="keywords" content="Instinct, MI250, microarchitecture, AMD, ROCm">
</head>
This chapter briefly reviews hardware aspects of the AMD Instinct MI250
accelerators and the CDNA™ 2 architecture that is the foundation of these GPUs.
# AMD Instinct MI250 microarchitecture
## AMD CDNA 2 Micro-architecture
The microarchitecture of the AMD Instinct MI250 accelerators is based on the
AMD CDNA 2 architecture that targets compute applications such as HPC,
artificial intelligence (AI), and machine learning (ML) and that run on
everything from individual servers to the world's largest exascale
supercomputers. The overall system architecture is designed for extreme
scalability and compute performance.
The following image shows the components of a single Graphics Compute Die (GCD) of the CDNA 2 architecture. On the top and the bottom are AMD Infinity Fabric™
interfaces and their physical links that are used to connect the GPU die to the
other system-level components of the node (see also Section 2.2). Both
interfaces can drive four AMD Infinity Fabric links. One of the AMD Infinity
@@ -28,27 +28,22 @@ To the left and the right are memory controllers that attach the High Bandwidth
Memory (HBM) modules to the GCD. AMD Instinct MI250 GPUs use HBM2e, which offers
a peak memory bandwidth of 1.6 TB/sec per GCD.
The execution units of the GPU are depicted in the following image as Compute
Units (CU). The MI250 GCD has 104 active CUs. Each compute unit is further
subdivided into four SIMD units that process SIMD instructions of 16 data
elements per instruction (for the FP64 data type). This enables the CU to
process 64 work items (a so-called “wavefront”) at a peak clock frequency of 1.7
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 22.6
TFLOPS for vector instructions. This equates to 45.3 TFLOPS for vector instructions for both GCDs together. The MI250 compute units also provide specialized
execution units (also called matrix cores), which are geared toward executing
matrix operations like matrix-matrix multiplications. For FP64, the peak
performance of these units amounts to 90.5 TFLOPS.
![Structure of a single GCD in the AMD Instinct MI250 accelerator.](../../data/conceptual/gpu-arch/image001.png "Structure of a single GCD in the AMD Instinct MI250 accelerator.")
```{list-table} Peak-performance capabilities of the MI250 OAM for different data types.
:header-rows: 1
:name: mi250-perf-table
*
- Computation and Data Type
@@ -88,7 +83,7 @@ Figure 1: Structure of a single GCD in the AMD Instinct MI250 accelerator.
- 362.1
```
The above table summarizes the aggregated peak performance of the AMD
Instinct MI250 OCP Open Accelerator Modules (OAM, OCP is short for Open Compute
Platform) and its two GCDs for different data types and execution units. The
middle column lists the peak performance (number of data elements processed in a
@@ -97,23 +92,18 @@ is being retired in each clock cycle. The third column lists the theoretical
peak performance of the OAM module. The theoretical aggregated peak memory
bandwidth of the GPU is 3.2 TB/sec (1.6 TB/sec per GCD).
![Dual-GCD architecture of the AMD Instinct MI250 accelerators](../../data/conceptual/gpu-arch/image002.png "Dual-GCD architecture of the AMD Instinct MI250 accelerators")
The following image shows the block diagram of an OAM package that consists
of two GCDs, each of which constitutes one GPU device in the system. The two
GCDs in the package are connected via four AMD Infinity Fabric links running at
a theoretical peak rate of 25 GT/sec, giving 200 GB/sec peak transfer bandwidth
between the two GCDs of an OAM, or a bidirectional peak transfer bandwidth of
400 GB/sec for the same.
## Node-level architecture
The following image shows the node-level architecture of a system that is
based on the AMD Instinct MI250 accelerator. The MI250 OAMs attach to the host
system via PCIe Gen 4 x16 links (yellow lines). Each GCD maintains its own PCIe
x16 link to the host part of the system. Depending on the server platform, the
@@ -121,15 +111,9 @@ GCD can attach to the AMD EPYC processor directly or via an optional PCIe switch
. Note that some platforms may offer an x8 interface to the GCDs, which reduces
the available host-to-GPU bandwidth.
![Block diagram of AMD Instinct MI250 Accelerators with 3rd Generation AMD EPYC processor](../../data/conceptual/gpu-arch/image003.png "Block diagram of AMD Instinct MI250 Accelerators with 3rd Generation AMD EPYC processor")
The preceding image shows the node-level architecture of a system with AMD
EPYC processors in a dual-socket configuration and four AMD Instinct MI250
accelerators. The MI250 OAMs attach to the host processors via PCIe Gen 4
x16 links (yellow lines). Depending on the system design, a PCIe switch may
@@ -146,4 +130,4 @@ two GPU dies in the MI250 OAM and operates at 25 GT/sec, which corresponds to a
theoretical peak transfer rate of 50 GB/sec per link (or 100 GB/sec
bidirectional peak transfer bandwidth). The GCD pairs 2 and 6 as well as GCDs 0
and 4 connect via two XGMI links, which is indicated by the thicker red line in
the preceding image.

View File

@@ -1,4 +1,11 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="GPU isolation techniques">
<meta name="keywords" content="GPU isolation techniques, UUID, universally unique identifier,
environment variables, virtual machines, AMD, ROCm">
</head>
# GPU isolation techniques
Restricting the access of applications to a subset of GPUs, also known as isolating
GPUs, allows users to hide GPU resources from programs. The programs by default
@@ -8,7 +15,7 @@ There are multiple ways to achieve isolation of GPUs in the ROCm software stack,
differing in which applications they apply to and the security they provide.
This page serves as an overview of the techniques.
## Environment variables
The runtimes in the ROCm software stack read these environment variables to
select the exposed or default device to present to applications using them.
@@ -22,7 +29,7 @@ A list of device indices or {abbr}`UUID (universally unique identifier)`s
that will be exposed to applications.
Runtime
: ROCm Software Runtime. Applies to all applications using the user mode ROCm
software stack.
```{code-block} shell
@@ -43,12 +50,13 @@ Runtime
export GPU_DEVICE_ORDINAL="0,2"
```
(hip_visible_devices)=
### `HIP_VISIBLE_DEVICES`
Device indices exposed to HIP applications.
Runtime: HIP runtime. Applies only to applications using HIP on the AMD platform.
```{code-block} shell
:caption: Example to expose the first and third devices in the system.
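# Assumed body of this snippet, mirroring the GPU_DEVICE_ORDINAL example above
export HIP_VISIBLE_DEVICES="0,2"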
@@ -90,7 +98,7 @@ to all programs that use the `amdgpu` kernel module interfaces.
Even programs that don't use the ROCm runtime, like graphics applications
using OpenGL or Vulkan, can only access the GPUs exposed to the container.
## GPU passthrough to virtual machines
Virtual machines achieve the highest level of isolation, because even the kernel
of the virtual machine is isolated from the host. Devices physically installed

View File

@@ -0,0 +1,241 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="GPU memory">
<meta name="keywords" content="GPU memory, VRAM, video random access memory, pageable
memory, pinned memory, managed memory, AMD, ROCm">
</head>
# GPU memory
For the HIP reference documentation, see:
* {doc}`hip:doxygen/html/group___memory`
* {doc}`hip:doxygen/html/group___memory_m`
Host memory exists on the host (e.g. CPU) of the machine in random access memory (RAM).
Device memory exists on the device (e.g. GPU) of the machine in video random access memory (VRAM).
Recent architectures use graphics double data rate (GDDR) synchronous dynamic random-access memory (SDRAM), such as GDDR6, or high-bandwidth memory (HBM), such as HBM2e.
## Memory allocation
Memory can be allocated as pageable, pinned, or managed memory.
The following API calls will result in these allocations:
| API | Data location | Allocation |
|--------------------|---------------|------------|
| System allocated | Host | Pageable |
| `hipMallocManaged` | Host | Managed |
| `hipHostMalloc` | Host | Pinned |
| `hipMalloc` | Device | Pinned |
:::{tip}
`hipMalloc` and `hipFree` are blocking calls; however, HIP recently added non-blocking versions, `hipMallocAsync` and `hipFreeAsync`, which take a stream as an additional argument.
:::
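A minimal sketch of the stream-ordered variants, assuming a ROCm version that provides `hipMallocAsync` and `hipFreeAsync` (buffer size illustrative, error checking omitted):

```C++
#include <hip/hip_runtime.h>

int main() {
    hipStream_t stream;
    hipStreamCreate(&stream);

    // Allocation and free are ordered on the stream; neither blocks the host.
    float *d = nullptr;
    hipMallocAsync(reinterpret_cast<void **>(&d), 1024 * sizeof(float), stream);
    // ... enqueue kernels that use d on the same stream ...
    hipFreeAsync(d, stream);

    hipStreamSynchronize(stream);  // both operations have completed here
    hipStreamDestroy(stream);
    return 0;
}
```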
### Pageable memory
Pageable memory is what you usually get when calling `malloc` or `new` in a C++ application.
It is unique in that it exists on "pages" (blocks of memory), which can be migrated to other memory storage.
For example, pages can migrate between CPU sockets on a motherboard, or be written out to the swap partition on disk when the system runs out of space in RAM.
### Pinned memory
Pinned memory (or page-locked memory, or non-pageable memory) is host memory that is mapped into the address space of all GPUs, meaning that the pointer can be used on both host and device.
Accessing host-resident pinned memory in device kernels is generally not recommended for performance, as it can force the data to traverse the host-device interconnect (e.g. PCIe), which is much slower than the on-device bandwidth (>40x on MI200).
Pinned host memory can be allocated with one of two types of coherence support: fine-grained (the default) or coarse-grained (`hipHostMallocNonCoherent`); see [](#coherence) below.
:::{note}
In HIP, pinned memory allocations are coherent by default (`hipHostMallocDefault`).
There are additional pinned memory flags (e.g. `hipHostMallocMapped` and `hipHostMallocPortable`).
On MI200 these options do not impact performance.
<!-- TODO: link to programming_manual#memory-allocation-flags -->
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:user_guide/programming_manual`.
:::
Much like how a process can be locked to a CPU core by setting affinity, a pinned memory allocation is locked to its location in physical memory.
On multi-socket systems it is important to ensure that pinned memory is located on the same socket as the owning process, or else each cache line will be moved through the CPU-CPU interconnect, thereby increasing latency and potentially decreasing bandwidth.
In practice, pinned memory is used to improve transfer times between host and device.
For transfer operations, such as `hipMemcpy` or `hipMemcpyAsync`, using pinned memory instead of pageable memory on host can lead to a ~3x improvement in bandwidth.
:::{tip}
If the application needs to move data back and forth between device and host (separate allocations), use pinned memory on the host side.
:::
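A minimal sketch of a host-to-device transfer from pinned memory (buffer size illustrative, error checking omitted):

```C++
#include <hip/hip_runtime.h>

int main() {
    const size_t n = 1 << 20;
    const size_t bytes = n * sizeof(float);

    float *h_pinned = nullptr;
    float *d_buf = nullptr;
    hipHostMalloc(reinterpret_cast<void **>(&h_pinned), bytes, hipHostMallocDefault);  // pinned, coherent by default
    hipMalloc(&d_buf, bytes);

    for (size_t i = 0; i < n; ++i) h_pinned[i] = 1.0f;

    // Copies from pinned memory can be DMA-ed directly, avoiding an
    // intermediate staging copy through a driver-owned pinned buffer.
    hipMemcpy(d_buf, h_pinned, bytes, hipMemcpyHostToDevice);

    hipFree(d_buf);
    hipHostFree(h_pinned);
    return 0;
}
```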
### Managed memory
Managed memory refers to universally addressable, or unified memory available on the MI200 series of GPUs.
Much like pinned memory, managed memory shares a pointer between host and device and (by default) supports fine-grained coherence, however, managed memory can also automatically migrate pages between host and device.
The allocation is managed by the AMD GPU driver using the Linux HMM (Heterogeneous Memory Management) mechanism.
If HMM is not available, then `hipMallocManaged` will fall back to using system memory and will act like pinned host memory.
Other managed memory API calls will then have undefined behavior.
It is therefore recommended to check for managed memory capability with `hipDeviceGetAttribute` and `hipDeviceAttributeManagedMemory`.
HIP supports additional calls that work with page migration:
* `hipMemAdvise`
* `hipMemPrefetchAsync`
:::{tip}
If the application needs to use data on both host and device regularly, does not want to deal with separate allocations, and is not worried about maxing out the VRAM on MI200 GPUs (64 GB per GCD), use managed memory.
:::
:::{tip}
If managed memory performance is poor, check to see if managed memory is supported on your system and if page migration (XNACK) is enabled.
:::
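Below is a minimal sketch of the capability check followed by a managed allocation with optional advice and prefetch; device 0 and the buffer size are illustrative, and error checking is omitted.

```C++
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    int device = 0;
    int managed = 0;
    hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory, device);
    if (!managed) {
        std::printf("Managed memory is not supported on device %d\n", device);
        return 0;
    }

    const size_t n = 1 << 20;
    float *p = nullptr;
    hipMallocManaged(reinterpret_cast<void **>(&p), n * sizeof(float));

    for (size_t i = 0; i < n; ++i) p[i] = 1.0f;  // pages are resident on the host

    // Optional hints: prefer the device, and prefetch pages before kernels run there.
    hipMemAdvise(p, n * sizeof(float), hipMemAdviseSetPreferredLocation, device);
    hipMemPrefetchAsync(p, n * sizeof(float), device, nullptr);
    hipDeviceSynchronize();

    hipFree(p);
    return 0;
}
```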
## Access behavior
Memory allocations for GPUs behave as follows:
| API | Data location | Host access | Device access |
|--------------------|---------------|--------------|----------------------|
| System allocated | Host | Local access | Unhandled page fault |
| `hipMallocManaged` | Host | Local access | Zero-copy |
| `hipHostMalloc` | Host | Local access | Zero-copy* |
| `hipMalloc` | Device | Zero-copy | Local access |
Zero-copy accesses happen over the Infinity Fabric interconnect or PCIe lanes on discrete GPUs.
:::{note}
While `hipHostMalloc` allocated memory is accessible by a device, the host pointer must be converted to a device pointer with `hipHostGetDevicePointer`.
Memory allocated through standard system allocators, such as `malloc`, can be accessed by a device by registering the memory via `hipHostRegister`.
The device pointer to be used in kernels can be retrieved with `hipHostGetDevicePointer`.
Registered memory is treated like `hipHostMalloc` and will have similar performance.
On devices that support and have [](#xnack) enabled, such as the MI250X, `hipHostRegister` is not required as memory accesses are handled via automatic page migration.
:::
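A minimal sketch of registering system-allocated memory and retrieving the device-side pointer (buffer size illustrative, error checking omitted):

```C++
#include <hip/hip_runtime.h>
#include <cstdlib>

int main() {
    const size_t bytes = 1 << 20;
    void *host = std::malloc(bytes);

    // Register the allocation so devices can access it, then obtain the
    // device-side pointer to pass into kernels.
    hipHostRegister(host, bytes, hipHostRegisterDefault);
    void *device_view = nullptr;
    hipHostGetDevicePointer(&device_view, host, 0);

    // ... launch kernels that access the memory through device_view ...

    hipHostUnregister(host);
    std::free(host);
    return 0;
}
```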
### XNACK
Normally, host and device memory are separate and data has to be transferred manually via `hipMemcpy`.
On a subset of GPUs, such as the MI200, there is an option to automatically migrate pages of memory between host and device.
This is important for managed memory, where the locality of the data is important for performance.
Depending on the system, page migration may be disabled by default, in which case managed memory will act like pinned host memory and suffer degraded performance.
*XNACK* describes the GPU's ability to retry memory accesses that fail due to a page fault (which would normally lead to a memory access error) and instead retrieve the missing page.
This also affects memory allocated by the system as indicated by the following table:
| API | Data location | Host after device access | Device after host access |
|--------------------|---------------|--------------------------|--------------------------|
| System allocated | Host | Migrate page to host | Migrate page to device |
| `hipMallocManaged` | Host | Migrate page to host | Migrate page to device |
| `hipHostMalloc` | Host | Local access | Zero-copy |
| `hipMalloc` | Device | Zero-copy | Local access |
To check if page migration is available on a platform, use `rocminfo`:
```sh
$ rocminfo | grep xnack
Name: amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-
```
Here, `xnack-` means that XNACK is available but is disabled by default.
Turning on XNACK by setting the environment variable `HSA_XNACK=1` gives the expected result, `xnack+`:
```sh
$ HSA_XNACK=1 rocminfo | grep xnack
Name: amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack+
```
`hipcc` by default generates code that runs correctly with XNACK either enabled or disabled.
Appending `xnack+` or `xnack-` to the `--offload-arch=` option forces the code to run only with XNACK enabled or disabled, respectively.
```sh
# Compiled kernels will run regardless of whether XNACK is enabled or disabled.
hipcc --offload-arch=gfx90a
# Compiled kernels will only run if XNACK is enabled with HSA_XNACK=1.
hipcc --offload-arch=gfx90a:xnack+
# Compiled kernels will only run if XNACK is disabled with HSA_XNACK=0.
hipcc --offload-arch=gfx90a:xnack-
```
:::{tip}
If you want to make use of page migration, use managed memory. While pageable memory will migrate correctly, it is not a portable solution and can have performance issues if the accessed data isn't page aligned.
:::
### Coherence
* *Coarse-grained coherence* means that memory is only considered up to date at kernel boundaries, which can be enforced through `hipDeviceSynchronize`, `hipStreamSynchronize`, or any blocking operation that acts on the null stream (e.g. `hipMemcpy`).
For example, cacheable memory is a type of coarse-grained memory where an up-to-date copy of the data can be stored elsewhere (e.g. in an L2 cache).
* *Fine-grained coherence* means the coherence is supported while a CPU/GPU kernel is running.
This can be useful if both host and device are operating on the same dataspace using system-scope atomic operations (e.g. updating an error code or flag to a buffer).
Fine-grained memory implies that up-to-date data may be made visible to others regardless of kernel boundaries as discussed above.
| API | Flag | Coherence |
|-------------------------|------------------------------|----------------|
| `hipHostMalloc` | `hipHostMallocDefault` | Fine-grained |
| `hipHostMalloc` | `hipHostMallocNonCoherent` | Coarse-grained |
| API | Flag | Coherence |
|-------------------------|------------------------------|----------------|
| `hipExtMallocWithFlags` | `hipDeviceMallocDefault` | Coarse-grained |
| `hipExtMallocWithFlags` | `hipDeviceMallocFinegrained` | Fine-grained |
| API | `hipMemAdvise` argument | Coherence |
|-------------------------|------------------------------|----------------|
| `hipMallocManaged` | | Fine-grained |
| `hipMallocManaged` | `hipMemAdviseSetCoarseGrain` | Coarse-grained |
| `malloc` | | Fine-grained |
| `malloc` | `hipMemAdviseSetCoarseGrain` | Coarse-grained |
:::{tip}
Try to design your algorithms to avoid host-device memory coherence (e.g. system scope atomics). While it can be a useful feature in very specific cases, it is not supported on all systems, and can negatively impact performance by introducing the host-device interconnect bottleneck.
:::
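As an illustration of the flag use case above, the following is a rough sketch. It assumes a device that supports fine-grained pinned host memory, and that the `__hip_atomic_store` device builtin and the standard `__atomic_load_n` host builtin are available; error checking is omitted.

```C++
#include <hip/hip_runtime.h>
#include <cstdio>

// The device raises a flag with a system-scope atomic; with fine-grained
// coherence the host can observe it while the kernel is still running.
__global__ void raise_flag(int *flag) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        __hip_atomic_store(flag, 1, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_SYSTEM);
    }
}

int main() {
    int *flag = nullptr;
    hipHostMalloc(reinterpret_cast<void **>(&flag), sizeof(int), hipHostMallocDefault);  // fine-grained by default
    *flag = 0;

    hipLaunchKernelGGL(raise_flag, dim3(1), dim3(1), 0, 0, flag);
    while (__atomic_load_n(flag, __ATOMIC_ACQUIRE) == 0) {
        // Busy-wait on the host; the store becomes visible mid-kernel.
    }
    std::printf("flag observed\n");

    hipDeviceSynchronize();
    hipHostFree(flag);
    return 0;
}
```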
The availability of fine- and coarse-grained memory pools can be checked with `rocminfo`:
```sh
$ rocminfo
...
*******
Agent 1
*******
Name: AMD EPYC 7742 64-Core Processor
...
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: FINE GRAINED
...
Pool 3
Segment: GLOBAL; FLAGS: COARSE GRAINED
...
*******
Agent 9
*******
Name: gfx90a
...
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
...
```
## System direct memory access
In most cases, the default behavior for HIP in transferring data from a pinned host allocation to device will run at the limit of the interconnect.
However, there are certain cases where the interconnect is not the bottleneck.
The primary way to transfer data onto and off of a GPU, such as the MI200, is to use the onboard System Direct Memory Access engine, which is used to feed blocks of memory to the off-device interconnect (either GPU-CPU or GPU-GPU).
Each GCD has a separate SDMA engine for host-to-device and device-to-host memory transfers.
Importantly, SDMA engines are separate from the computing infrastructure, meaning that memory transfers to and from a device will not impact kernel compute performance, though they do impact memory bandwidth to a limited extent.
The SDMA engines are mainly tuned for PCIe-4.0 x16, which means they are designed to operate at bandwidths up to 32 GB/s.
:::{note}
An important feature of the MI250X platform is the Infinity Fabric™ interconnect between host and device.
The Infinity Fabric interconnect supports improved performance over standard PCIe-4.0 (usually ~50% more bandwidth); however, since the SDMA engine does not run at this speed, it will not max out the bandwidth of the faster interconnect.
:::
The bandwidth limitation can be countered by bypassing the SDMA engine and replacing it with a type of copy kernel known as a "blit" kernel.
Blit kernels will use the compute units on the GPU, thereby consuming compute resources, which may not always be beneficial.
The easiest way to enable blit kernels is to set an environment variable `HSA_ENABLE_SDMA=0`, which will disable the SDMA engine.
On systems where the GPU uses a PCIe interconnect instead of an Infinity Fabric interconnect, blit kernels will not impact bandwidth, but will still consume compute resources.
The use of SDMA vs blit kernels also applies to MPI data transfers and GPU-GPU transfers.
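For example, to force blit kernels for a single run (application name hypothetical):

```sh
# Use blit (copy) kernels instead of the SDMA engine for this run only
HSA_ENABLE_SDMA=0 ./my_app
```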

View File

@@ -0,0 +1,427 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="Using the LLVM ASan on a GPU">
<meta name="keywords" content="LLVM, ASan, address sanitizer, AddressSanitizer, instrumented
libraries, instrumented applications, AMD, ROCm">
</head>
# Using the AddressSanitizer on a GPU (beta release)
The LLVM AddressSanitizer (ASan) provides a process that allows developers to detect runtime addressing errors in applications and libraries. The detection is achieved using a combination of compiler-added instrumentation and runtime techniques, including function interception and replacement.
Until now, the LLVM ASan process was only available for traditional purely CPU applications. However, ROCm has extended this mechanism to additionally allow the detection of some addressing errors on the GPU in heterogeneous applications. Ideally, developers should treat heterogeneous HIP and OpenMP applications exactly like pure CPU applications. However, this simplicity has not been achieved yet.
This document describes how to use ROCm ASan.
For information about LLVM ASan, see the [LLVM documentation](https://clang.llvm.org/docs/AddressSanitizer.html).
:::{note}
The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
:::
## Compiling for ASan
The ASan process begins by compiling the application of interest with the ASan instrumentation.
Recommendations for doing this are:
* Compile as many application and dependent library sources as possible using an AMD-built clang-based compiler such as `amdclang++`.
* Add the following options to the existing compiler and linker options:
* `-fsanitize=address` - enables instrumentation
* `-shared-libsan` - use shared version of runtime
* `-g` - add debug info for improved reporting
* Explicitly use `xnack+` in the offload architecture option. For example, `--offload-arch=gfx90a:xnack+`
Other architectures are allowed, but their device code will not be instrumented and a warning will be emitted.
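For example, a complete compile line combining these options might look like the following (source and output file names hypothetical):

```bash
# Instrumented build for gfx90a with XNACK enabled
hipcc -g -fsanitize=address -shared-libsan --offload-arch=gfx90a:xnack+ myapp.hip -o myapp
```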
:::{tip}
It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
:::
:::{note}
When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
:::
### About compilation time
When `-fsanitize=address` is used, the LLVM compiler adds instrumentation code around every memory operation. This added code must be handled by all downstream components of the compiler toolchain and results in increased overall compilation time. This increase is especially evident in the AMDGPU device compiler and has in a few instances raised the compile time to an unacceptable level.
There are a few options if the compile time becomes unacceptable:
* Avoid instrumentation of the files which have the worst compile times. This will reduce the effectiveness of the ASan process.
* Add the option `-fsanitize-recover=address` to the compiles with the worst compile times. This option simplifies the added instrumentation resulting in faster compilation. See below for more information.
* Disable instrumentation on a per-function basis by adding `__attribute__((no_sanitize("address")))` to functions found to be responsible for the large compile time, as sketched below. Again, this will reduce the effectiveness of the process.
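A sketch of the per-function opt-out (the function shown is hypothetical):

```C++
// This function's loads and stores will not be instrumented by ASan.
__attribute__((no_sanitize("address")))
void hot_loop(float *out, const float *in, int n) {
    for (int i = 0; i < n; ++i) {
        out[i] = in[i] * 2.0f;
    }
}
```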
## Installing ROCm GPU ASan packages
For a complete ROCm GPU Sanitizer installation, including packages, instrumented HSA and HIP runtimes, tools, and math libraries, use the following command:
```bash
sudo apt-get install rocm-ml-sdk-asan
```
## Using AMD-supplied ASan instrumented libraries
ROCm releases have optional packages that contain additional ASan instrumented builds of the ROCm libraries (usually found in `/opt/rocm-<version>/lib`). The instrumented libraries have identical names to the regular uninstrumented libraries, and are located in `/opt/rocm-<version>/lib/asan`.
These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with `g++`. The preexisting build options are used, along with the additional options described above: `-fsanitize=address`, `-shared-libsan`, and `-g`.
These additional libraries save the developer the effort of locating repositories, identifying the correct branch, checking out the correct tags, and other work needed to build the libraries from source. They also extend the ability of the process to detect addressing errors into the ROCm libraries themselves.
When adjusting an application build to add instrumentation, linking against these instrumented libraries is unnecessary. For example, any `-L /opt/rocm-<version>/lib` compiler options need not be changed. However, the instrumented libraries should be used when the application is run. It is particularly important that the instrumented language runtimes, like `libamdhip64.so` and `librocm-core.so`, are used; otherwise, device invalid access detections may not be reported.
## Running ASan instrumented applications
### Preparing to run an instrumented application
Here are a few recommendations to consider before running an ASan instrumented heterogeneous application.
* Ensure the Linux kernel running on the system has Heterogeneous Memory Management (HMM) support. A kernel version of 5.6 or higher should be sufficient.
* Ensure XNACK is enabled
* For `gfx90a` (MI-2X0) or `gfx940` (MI-3X0), use the environment setting `HSA_XNACK=1`.
* For `gfx906` (MI-50) or `gfx908` (MI-100), use `HSA_XNACK=1`, but also ensure the amdgpu kernel module is loaded with module argument `noretry=0`.
This requirement is due to the fact that the XNACK setting for these GPUs is system-wide.
* Ensure that the application will use the instrumented libraries when it runs. The output from the shell command `ldd <application name>` can be used to see which libraries will be used.
If the instrumented libraries are not listed by `ldd`, the environment variable `LD_LIBRARY_PATH` may need to be adjusted, or in some cases an `RPATH` compiled into the application may need to be changed and the application recompiled.
* Ensure that the application depends on the ASan runtime. This can be checked by running the command `readelf -d <application name> | grep NEEDED` and verifying that the shared library `libclang_rt.asan-x86_64.so` appears in the output.
If it does not appear, when executed the application will quickly output an ASan error that looks like:
```bash
==3210==ASan runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD.
```
* Ensure that the `llvm-symbolizer` application can be executed, and that it is located in `/opt/rocm-<version>/llvm/bin`. This executable is not strictly required, but if found, it is used to translate ("symbolize") a host-side instruction address into a more useful function name, file name, and line number (assuming the application has been built to include debug information).
There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASan runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. It must be noted that these options only affect the host ASan runtime. The device runtime only currently supports the default settings for the few relevant options.
There are three `ASAN_OPTIONS` flags of note.
* `halt_on_error=0/1 default 1`.
This tells the ASan runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option, and if an error is detected within one of them while `halt_on_error` is set to 0, more undefined behavior will occur.
* `detect_leaks=0/1 default 1`.
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSan). For heterogeneous applications, this default results in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSan suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
* `quarantine_size_mb=N default 256`
This option defines the number of megabytes (MB) `N` of memory that the ASan runtime will hold after it is freed, in order to detect use-after-free situations. This memory is unavailable for other purposes. The default of 256 MB may be too small to detect some use-after-free situations, especially given that the large size of many GPU memory allocations may push freed allocations out of quarantine before the attempted use.
:::{note}
Setting the value of `quarantine_size_mb` larger may enable more problematic uses to be detected, but at the cost of reducing memory available for other purposes.
:::
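Multiple flags are joined with colons in a single `ASAN_OPTIONS` setting; for example (values illustrative):

```bash
ASAN_OPTIONS=detect_leaks=0:quarantine_size_mb=512 ./myapp
```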
## Runtime overhead
Running an ASan instrumented application incurs
overheads which may result in unacceptably long runtimes
or failure to run at all.
### Higher execution time
ASan detection works by checking each address at runtime
before the address is actually accessed by a load, store, or atomic
instruction.
This checking involves an additional load to "shadow" memory which
records whether the address is "poisoned" or not, and additional logic
that decides whether to produce a detection report or not.
This extra runtime work can cause the application to slow down by
a factor of three or more, depending on how many memory accesses are
executed.
For heterogeneous applications, the shadow memory must be accessible by all devices
and this can mean that shadow accesses from some devices may be more costly
than non-shadow accesses.
### Higher memory use
The address checking described above relies on the compiler to surround
each program variable with a red zone, and on the ASan
runtime to surround each runtime memory allocation with a red zone and
fill the shadow corresponding to each red zone with poison.
The added memory for the red zones is additional overhead on top
of the 13% overhead for the shadow memory itself.
Applications which consume most of one or more available memory pools when
run normally are likely to encounter allocation failures when run with
instrumentation.
## Runtime reporting
It is not the intention of this document to provide a detailed explanation of all the types of reports that can be output by the ASan runtime. Instead, the focus is on the differences between the standard reports for CPU issues, and reports for GPU issues.
An invalid address detection report for the CPU always starts with
```bash
==<PID>==ERROR: AddressSanitizer: <problem type> on address <memory address> at pc <pc> bp <bp> sp <sp> <access> of size <N> at <memory address> thread T0
```
and continues with a stack trace for the access, a stack trace for the allocation and deallocation, if relevant, and a dump of the shadow near the <memory address>.
In contrast, an invalid address detection report for the GPU always starts with
```bash
==<PID>==ERROR: AddressSanitizer: <problem type> on amdgpu device <device> at pc <pc> <access> of size <n> in workgroup id (<X>,<Y>,<Z>)
```
Above, `<device>` is the integer device ID, and `(<X>,<Y>,<Z>)` is the ID of the workgroup or block where the invalid address was detected.
While the CPU report includes a call stack for the thread attempting the invalid access, the GPU report is currently limited to a call stack of size one, i.e. the (symbolized) location of the invalid access, e.g.
```bash
#0 <pc> in <function signature> at /path/to/file.hip:<line>:<column>
```
This short call stack is followed by a GPU-specific section that looks like
```bash
Thread ids and accessed addresses:
<lid0> <maddr 0> : <lid1> <maddr1> : ...
```
where each `<lid j> <maddr j>` indicates the lane ID and the invalid memory address held by lane `j` of the wavefront attempting the invalid access.
Additionally, reports for invalid GPU accesses to memory allocated by GPU code via `malloc` or `new`, starting with, for example,
```bash
==1234==ERROR: AddressSanitizer: heap-buffer-overflow on amdgpu device 0 at pc 0x7fa9f5c92dcc
```
or
```bash
==5678==ERROR: AddressSanitizer: heap-use-after-free on amdgpu device 3 at pc 0x7f4c10062d74
```
currently may include one or two surprising CPU-side tracebacks mentioning `hostcall`. This is due to how `malloc` and `free` are implemented for GPU code, and these call stacks can be ignored.
## Running ASan with `rocgdb`
`rocgdb` can be used to further investigate ASan detected errors, with some preparation.
Currently, the ASan runtime complains when starting `rocgdb` without preparation.
```bash
$ rocgdb my_app
==1122==ASan runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD.
```
This is solved by setting environment variable `LD_PRELOAD` to the path to the ASan runtime, whose path can be obtained using the command
```bash
amdclang++ -print-file-name=libclang_rt.asan-x86_64.so
```
You should also set the environment variable `HIP_ENABLE_DEFERRED_LOADING=0` before debugging HIP applications.
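Putting these together, a typical invocation might look like this (application name from the example above):

```bash
export LD_PRELOAD=$(amdclang++ -print-file-name=libclang_rt.asan-x86_64.so)
export HIP_ENABLE_DEFERRED_LOADING=0
rocgdb my_app
```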
After starting `rocgdb` breakpoints can be set on the ASan runtime error reporting entry points of interest. For example, if an ASan error report includes
```bash
WRITE of size 4 in workgroup id (10,0,0)
```
the `rocgdb` command needed to stop the program before the report is printed is
```bash
(gdb) break __asan_report_store4
```
Similarly, the appropriate command for a report including
```bash
READ of size <N> in workgroup ID (1,2,3)
```
is
```bash
(gdb) break __asan_report_load<N>
```
It is possible to set breakpoints on all ASan report functions using these commands:
```bash
$ rocgdb <path to application>
(gdb) start <command line arguments>
(gdb) rbreak ^__asan_report
(gdb) c
```
## Using ASan with a short HIP application
Consider the following simple and short demo of using the Address Sanitizer with a HIP application:
```C++
#include <cstdlib>
#include <hip/hip_runtime.h>

__global__ void
set1(int *p)
{
    int i = blockDim.x*blockIdx.x + threadIdx.x;
    p[i] = 1;
}

int
main(int argc, char **argv)
{
    int m = std::atoi(argv[1]);
    int n1 = std::atoi(argv[2]);
    int n2 = std::atoi(argv[3]);
    int c = std::atoi(argv[4]);
    int *dp;
    hipMalloc(&dp, m*sizeof(int));
    hipLaunchKernelGGL(set1, dim3(n1), dim3(n2), 0, 0, dp);
    int *hp = (int*)malloc(c * sizeof(int));
    hipMemcpy(hp, dp, m*sizeof(int), hipMemcpyDeviceToHost);
    hipDeviceSynchronize();
    hipFree(dp);
    free(hp);
    std::puts("Done.");
    return 0;
}
```
This application will attempt to access invalid addresses for certain command line arguments. In particular, if `m < n1 * n2` some device threads will attempt to access
unallocated device memory.
Or, if `c < m`, the `hipMemcpy` function will copy past the end of the `malloc` allocated memory.
**Note**: The `hipcc` compiler is used here for simplicity.
Compiling without XNACK results in a warning.
```bash
$ hipcc -g --offload-arch=gfx90a:xnack- -fsanitize=address -shared-libsan mini.hip -o mini
clang++: warning: ignoring '-fsanitize=address' option for offload arch 'gfx90a:xnack-', as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored]
```
The binary compiled above will run, but the GPU code will not be instrumented and the `m < n1 * n2` error will not be detected. Switching to `--offload-arch=gfx90a:xnack+` in the command above results in a warning-free compilation and an instrumented application. After setting `PATH`, `LD_LIBRARY_PATH` and `HSA_XNACK` as described earlier, a check of the binary with `ldd` yields the following:
```bash
$ ldd mini
linux-vdso.so.1 (0x00007ffd1a5ae000)
libclang_rt.asan-x86_64.so => /opt/rocm-6.1.0-99999/llvm/lib/clang/17.0.0/lib/linux/libclang_rt.asan-x86_64.so (0x00007fb9c14b6000)
libamdhip64.so.5 => /opt/rocm-6.1.0-99999/lib/asan/libamdhip64.so.5 (0x00007fb9bedd3000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fb9beba8000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fb9bea59000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fb9bea3e000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fb9be84a000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fb9be844000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fb9be821000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fb9be817000)
libamd_comgr.so.2 => /opt/rocm-6.1.0-99999/lib/asan/libamd_comgr.so.2 (0x00007fb9b4382000)
libhsa-runtime64.so.1 => /opt/rocm-6.1.0-99999/lib/asan/libhsa-runtime64.so.1 (0x00007fb9b3b00000)
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007fb9b3af3000)
/lib64/ld-linux-x86-64.so.2 (0x00007fb9c2027000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fb9b3ad7000)
libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fb9b3aa7000)
libelf.so.1 => /lib/x86_64-linux-gnu/libelf.so.1 (0x00007fb9b3a89000)
libdrm.so.2 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm.so.2 (0x00007fb9b3a70000)
libdrm_amdgpu.so.1 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1 (0x00007fb9b3a62000)
```
This confirms that the address sanitizer runtime is linked in, and that the ASan instrumented versions of the runtime libraries are used.
Checking the `PATH` yields
```bash
$ which llvm-symbolizer
/opt/rocm-6.1.0-99999/llvm/bin/llvm-symbolizer
```
Lastly, a check of the OS kernel version yields
```bash
$ uname -rv
5.15.0-73-generic #80~20.04.1-Ubuntu SMP Wed May 17 14:58:14 UTC 2023
```
which indicates that the required HMM support (kernel version > 5.6) is available. This completes the necessary setup. Running with `m = 100`, `n1 = 11`, `n2 = 10` and `c = 100` should produce
a report for an invalid access by the last 10 threads.
```bash
=================================================================
==3141==ERROR: AddressSanitizer: heap-buffer-overflow on amdgpu device 0 at pc 0x7fb1410d2cc4
WRITE of size 4 in workgroup id (10,0,0)
#0 0x7fb1410d2cc4 in set1(int*) at /home/dave/mini/mini.cpp:0:10
Thread ids and accessed addresses:
00 : 0x7fb14371d190 01 : 0x7fb14371d194 02 : 0x7fb14371d198 03 : 0x7fb14371d19c 04 : 0x7fb14371d1a0 05 : 0x7fb14371d1a4 06 : 0x7fb14371d1a8 07 : 0x7fb14371d1ac
08 : 0x7fb14371d1b0 09 : 0x7fb14371d1b4
0x7fb14371d190 is located 0 bytes after 400-byte region [0x7fb14371d000,0x7fb14371d190)
allocated by thread T0 here:
#0 0x7fb151c76828 in hsa_amd_memory_pool_allocate /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:692:3
#1 ...
#12 0x7fb14fb99ec4 in hipMalloc /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:568:3
#13 0x226630 in hipError_t hipMalloc<int>(int**, unsigned long) /opt/rocm-6.1.0-99999/include/hip/hip_runtime_api.h:8367:12
#14 0x226630 in main /home/dave/mini/mini.cpp:19:5
#15 0x7fb14ef02082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
Shadow bytes around the buggy address:
0x7fb14371cf00: ...
=>0x7fb14371d180: 00 00[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa
0x7fb14371d200: ...
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
...
==3141==ABORTING
```
Running with `m = 100`, `n1 = 10`, `n2 = 10` and `c = 99` should produce a report for an invalid copy.
```shell
=================================================================
==2817==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x514000150dcc at pc 0x7f5509551aca bp 0x7ffc90a7ae50 sp 0x7ffc90a7a610
WRITE of size 400 at 0x514000150dcc thread T0
#0 0x7f5509551ac9 in __asan_memcpy /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3
#1 ...
#9 0x7f5507462a28 in hipMemcpy_common(void*, void const*, unsigned long, hipMemcpyKind, ihipStream_t*) /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:637:10
#10 0x7f5507464205 in hipMemcpy /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:642:3
#11 0x226844 in main /home/dave/mini/mini.cpp:22:5
#12 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
#13 0x22605d in _start (/home/dave/mini/mini+0x22605d)
0x514000150dcc is located 0 bytes after 396-byte region [0x514000150c40,0x514000150dcc)
allocated by thread T0 here:
#0 0x7f5509553dcf in malloc /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
#1 0x226817 in main /home/dave/mini/mini.cpp:21:21
#2 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
SUMMARY: AddressSanitizer: heap-buffer-overflow /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3 in __asan_memcpy
Shadow bytes around the buggy address:
0x514000150b00: ...
=>0x514000150d80: 00 00 00 00 00 00 00 00 00[04]fa fa fa fa fa fa
0x514000150e00: ...
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
...
==2817==ABORTING
```
## Known issues with using GPU sanitizer
* Red zones must have limited size. It is possible for an invalid access to completely miss a red zone and not be detected.
* Lack of detection or false reports can be caused by the runtime not properly maintaining red zone shadows.
* Lack of detection on the GPU might also be due to the implementation not instrumenting accesses to all GPU specific address spaces. For example, in the current implementation accesses to "private" or "stack" variables on the GPU are not instrumented, and accesses to HIP shared variables (also known as "local data store" or "LDS") are also not instrumented.
* It can also be the case that a memory fault is hit for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.

View File

@@ -5,25 +5,42 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import shutil
import jinja2
import os
from rocm_docs import ROCmDocs
# Environment to process Jinja templates.
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
# Jinja templates to render out.
templates = []
# Render templates and output files without the last extension.
# For example: 'install.md.jinja' becomes 'install.md'.
for template in templates:
rendered = jinja_env.get_template(template).render()
with open(os.path.splitext(template)[0], 'w') as file:
file.write(rendered)
shutil.copy2('../CONTRIBUTING.md','./contribute/index.md')
shutil.copy2('../RELEASE.md','./about/release-notes.md')
# Keep capitalization due to similar linking on GitHub's markdown preview.
shutil.copy2('../CHANGELOG.md','./CHANGELOG.md')
shutil.copy2('../CHANGELOG.md','./about/CHANGELOG.md')
latex_engine = "xelatex"
latex_elements = {
"fontpkg": r"""
\usepackage{tgtermes}
\usepackage{tgheros}
\renewcommand\ttdefault{txtt}
"""
}
# configurations for PDF output by Read the Docs
project = "ROCm Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
version = "5.6.0"
release = "5.6.0"
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = "6.0.0"
release = "6.0.0"
setting_all_article_info = True
all_article_info_os = ["linux", "windows"]
all_article_info_author = ""
@@ -33,66 +50,54 @@ article_pages = [
{
"file":"release",
"os":["linux", "windows"],
"date":"2023-07-27"
"date":"2024-01-09"
},
{"file":"deploy/linux/index", "os":["linux"]},
{"file":"deploy/linux/install_overview", "os":["linux"]},
{"file":"deploy/linux/prerequisites", "os":["linux"]},
{"file":"deploy/linux/quick_start", "os":["linux"]},
{"file":"deploy/linux/install", "os":["linux"]},
{"file":"deploy/linux/upgrade", "os":["linux"]},
{"file":"deploy/linux/uninstall", "os":["linux"]},
{"file":"deploy/linux/package_manager_integration", "os":["linux"]},
{"file":"deploy/docker", "os":["linux"]},
{"file":"deploy/windows/cli/index", "os":["windows"]},
{"file":"deploy/windows/cli/install", "os":["windows"]},
{"file":"deploy/windows/cli/uninstall", "os":["windows"]},
{"file":"deploy/windows/cli/upgrade", "os":["windows"]},
{"file":"deploy/windows/gui/index", "os":["windows"]},
{"file":"deploy/windows/gui/install", "os":["windows"]},
{"file":"deploy/windows/gui/uninstall", "os":["windows"]},
{"file":"deploy/windows/gui/upgrade", "os":["windows"]},
{"file":"deploy/windows/index", "os":["windows"]},
{"file":"deploy/windows/prerequisites", "os":["windows"]},
{"file":"deploy/windows/quick_start", "os":["windows"]},
{"file":"install/windows/install-quick", "os":["windows"]},
{"file":"install/linux/install-quick", "os":["linux"]},
{"file":"release/gpu_os_support", "os":["linux"]},
{"file":"release/windows_support", "os":["windows"]},
{"file":"release/docker_support_matrix", "os":["linux"]},
{"file":"reference/gpu_libraries/communication", "os":["linux"]},
{"file":"reference/ai_tools", "os":["linux"]},
{"file":"reference/management_tools", "os":["linux"]},
{"file":"reference/validation_tools", "os":["linux"]},
{"file":"reference/framework_compatibility/framework_compatibility", "os":["linux"]},
{"file":"reference/computer_vision", "os":["linux"]},
{"file":"how_to/deep_learning_rocm", "os":["linux"]},
{"file":"how_to/gpu_aware_mpi", "os":["linux"]},
{"file":"how_to/magma_install/magma_install", "os":["linux"]},
{"file":"how_to/pytorch_install/pytorch_install", "os":["linux"]},
{"file":"how_to/system_debugging", "os":["linux"]},
{"file":"how_to/tensorflow_install/tensorflow_install", "os":["linux"]},
{"file":"install/linux/install", "os":["linux"]},
{"file":"install/linux/install-options", "os":["linux"]},
{"file":"install/linux/prerequisites", "os":["linux"]},
{"file":"examples/machine_learning", "os":["linux"]},
{"file":"examples/inception_casestudy/inception_casestudy", "os":["linux"]},
{"file":"understand/file_reorg", "os":["linux"]},
{"file":"install/docker", "os":["linux"]},
{"file":"install/magma-install", "os":["linux"]},
{"file":"install/pytorch-install", "os":["linux"]},
{"file":"install/tensorflow-install", "os":["linux"]},
{"file":"understand/isv_deployment_win", "os":["windows"]},
{"file":"install/windows/install", "os":["windows"]},
{"file":"install/windows/prerequisites", "os":["windows"]},
{"file":"install/windows/cli/index", "os":["windows"]},
{"file":"install/windows/gui/index", "os":["windows"]},
{"file":"about/compatibility/docker-image-support-matrix", "os":["linux"]},
{"file":"about/compatibility/user-kernel-space-compat-matrix", "os":["linux"]},
{"file":"reference/library-index", "os":["linux"]},
{"file":"how-to/deep-learning-rocm", "os":["linux"]},
{"file":"how-to/gpu-enabled-mpi", "os":["linux"]},
{"file":"how-to/system-debugging", "os":["linux"]},
{"file":"how-to/tuning-guides", "os":["linux", "windows"]},
{"file":"rocm-a-z", "os":["linux", "windows"]},
{"file":"about/release-notes", "os":["linux"]},
]
exclude_patterns = ['temp']
external_toc_path = "./sphinx/_toc.yml"
docs_core = ROCmDocs("ROCm Documentation Home")
docs_core.setup()
extensions = ["rocm_docs"]
external_projects_current_project = "rocm"
for sphinx_var in ROCmDocs.SPHINX_VARS:
globals()[sphinx_var] = getattr(docs_core, sphinx_var)
html_theme = "rocm_docs_theme"
html_theme_options = {"flavor": "rocm-docs-home"}
html_title = "ROCm Documentation"
html_theme_options = {
"link_main_doc": False
}

docs/contribute/building.md Normal file

@@ -0,0 +1,155 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="Building ROCm documentation">
<meta name="keywords" content="documentation, Visual Studio Code, GitHub, command line,
AMD, ROCm">
</head>
# Building documentation
You can build our documentation via GitHub (in a pull request) or locally (using the command line or
Visual Studio (VS) Code).
## GitHub
If you open a pull request on the `develop` branch of a ROCm repository and scroll to the bottom of
the page, there is a summary panel. Next to the line
`docs/readthedocs.com:advanced-micro-devices-demo`, there is a `Details` link. If you click this, it takes
you to the Read the Docs build for your pull request.
![Screenshot of the GitHub documentation build link](../data/contribute/github-docs-build.png)
If you don't see this line, click `Show all checks` to get an itemized view.
## Command line
You can build our documentation via the command line using Python. We use Python 3.8; other
versions may not support the build.
Use the Python Virtual Environment (`venv`) and run the following commands from the project root:
```sh
python3 -m venv .venv
# Windows
.venv/Scripts/python -m pip install -r docs/sphinx/requirements.txt
.venv/Scripts/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
# Linux
.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
```
Navigate to `_build/html/index.html` and open this file in a web browser.
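If your browser can't open local `file://` pages directly, you can serve the build output over HTTP instead; for example (this uses Python's built-in `http.server` module, whose `--directory` flag requires Python 3.7 or newer):
```sh
python3 -m http.server 8000 --directory _build/html
```
Then browse to `http://localhost:8000`.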
## Visual Studio Code
With the help of a few extensions, you can create a productive environment to author and test
documentation locally using Visual Studio (VS) Code. Follow these steps to configure VS Code:
1. Install the required extensions:
* Python: `(ms-python.python)`
* Live Server: `(ritwickdey.LiveServer)`
2. Add the following entries to `.vscode/settings.json`.
```json
{
"liveServer.settings.root": "/.vscode/build/html",
"liveServer.settings.wait": 1000,
"python.terminal.activateEnvInCurrentTerminal": true
}
```
* `liveServer.settings.root`: Sets the root of the output website for live previews. Must be changed
alongside the `tasks.json` command.
* `liveServer.settings.wait`: Tells the live server to delay updating, giving Sphinx time to
regenerate the site contents so the preview doesn't refresh before the build is complete.
* `python.terminal.activateEnvInCurrentTerminal`: Automatically activates the virtual environment, so you
can build the site from the integrated terminal.
3. Add the following tasks to `.vscode/tasks.json`.
```json
{
"version": "2.0.0",
"tasks": [
{
"label": "Build Docs",
"type": "process",
"windows": {
"command": "${workspaceFolder}/.venv/Scripts/python.exe"
},
"command": "${workspaceFolder}/.venv/bin/python3",
"args": [
"-m",
"sphinx",
"-j",
"auto",
"-T",
"-b",
"html",
"-d",
"${workspaceFolder}/.vscode/build/doctrees",
"-D",
"language=en",
"${workspaceFolder}/docs",
"${workspaceFolder}/.vscode/build/html"
],
"problemMatcher": [
{
"owner": "sphinx",
"fileLocation": "absolute",
"pattern": {
"regexp": "^(?:.*\\.{3}\\s+)?(\\/[^:]*|[a-zA-Z]:\\\\[^:]*):(\\d+):\\s+(WARNING|ERROR):\\s+(.*)$",
"file": 1,
"line": 2,
"severity": 3,
"message": 4
}
},
{
"owner": "sphinx",
"fileLocation": "absolute",
"pattern": {
"regexp": "^(?:.*\\.{3}\\s+)?(\\/[^:]*|[a-zA-Z]:\\\\[^:]*):{1,2}\\s+(WARNING|ERROR):\\s+(.*)$",
"file": 1,
"severity": 2,
"message": 3
}
}
],
"group": {
"kind": "build",
"isDefault": true
}
}
]
}
```
> (Implementation detail: two problem matchers had to be defined
> because VS Code doesn't tolerate problem information that is potentially
> absent. While a single regex could match all types of errors, if a capture
> group remains empty (the line number doesn't show up in all warning/error
> messages) but the `pattern` references said empty capture group, VS Code
> discards the message completely.)
4. Configure the Python virtual environment (`venv`).
From the Command Palette, run `Python: Create Environment`. Select the `venv` environment and
the `docs/sphinx/requirements.txt` file.
5. Build the docs.
Launch the default build task using one of the following options:
* A hotkey (the default is `Ctrl+Shift+B`)
* Running `Tasks: Run Build Task` from the Command Palette
6. Open the live preview.
Navigate to the site output within VS Code: right-click on `.vscode/build/html/index.html` and
select `Open with Live Server`. The contents should update on every rebuild without having to
refresh the browser.


@@ -0,0 +1,229 @@
# Contributing to ROCm documentation
AMD values and encourages contributions to our code and documentation. If you choose to
contribute, please be polite and respectful. Improving documentation is a long-term
process to which we are dedicated.
If you have issues when trying to contribute, refer to the
[discussions](https://github.com/RadeonOpenCompute/ROCm/discussions) page in our GitHub
repository.
## Folder structure and naming convention
Our documentation follows the Pitchfork folder structure. Most documentation files are stored in the
`/docs` folder. Some special files (such as release, contributing, and changelog) are stored in the root
(`/`) folder.
All images are stored in the `/docs/data` folder. An image's file path mirrors that of the documentation
file where it is used.
Our naming structure uses kebab case; for example, `my-file-name.rst`.
## Supported formats and syntax
Our documentation includes both Markdown and RST files. We are gradually transitioning existing
Markdown to RST in order to more effectively meet our documentation needs. When contributing,
RST is preferred; if you must use Markdown, use GitHub-flavored Markdown.
We use [Sphinx Design](https://sphinx-design.readthedocs.io/en/latest/index.html) syntax and compile
our API references using [Doxygen](https://www.doxygen.nl/).
The following table shows some common documentation components and the syntax convention we
use for each:
<table>
<tr>
<th>Component</th>
<th>RST syntax</th>
</tr>
<tr>
<td>Code blocks</td>
<td>
```rst
.. code-block:: language-name

   My code block.
```
</td>
</tr>
<tr>
<td>Cross-referencing internal files</td>
<td>
```rst
:doc:`Title <../path/to/file/filename>`
```
</td>
</tr>
<tr>
<td>External links</td>
<td>
```rst
`link name <URL>`_
```
</td>
</tr>
<tr>
<td>Headings</td>
<td>
```rst
******************
Chapter title (H1)
******************

Section title (H2)
==================

Subsection title (H3)
---------------------

Sub-subsection title (H4)
^^^^^^^^^^^^^^^^^^^^^^^^^
```
</td>
</tr>
<tr>
<td>Images</td>
<td>
```rst
.. image:: image1.png
```
</td>
</tr>
<tr>
<td>Internal links</td>
<td>
```rst
1. Add a label above the section you want to reference:

.. _my-section-tag:

Section 1
=========

2. Link to your label:

As shown in :ref:`my-section-tag`.
```
</td>
</tr>
<tr>
<td>Lists</td>
<td>
```rst
#. Ordered (numbered) list item

* Unordered (bulleted) list item
```
</td>
</tr>
<tr>
<td>Math (block)</td>
<td>
```rst
.. math::

   A = \begin{pmatrix}
         0.0 & 1.0 & 1.0 & 3.0 \\
         4.0 & 5.0 & 6.0 & 7.0 \\
       \end{pmatrix}
```
</td>
</tr>
<tr>
<td>Math (inline)</td>
<td>
```rst
:math:`2 \times 2`
```
</td>
</tr>
<tr>
<td>Notes</td>
<td>
```rst
.. note::

   My note here.
```
</td>
</tr>
<tr>
<td>Tables</td>
<td>
```rst
.. csv-table:: Optional title here
   :widths: 30, 70
   :header: "entry1 header", "entry2 header"

   "entry1", "entry2"
```
</td>
</tr>
</table>
## Language and style
We use the
[Google developer documentation style guide](https://developers.google.com/style/highlights) to
guide our content.
Font size and type, page layout, white space control, and other formatting
details are controlled via
[rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core). If you want to notify us
of any formatting issues, create a pull request in our
[rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) GitHub repository.
## Building our documentation
<!-- % TODO: Fix this link so it works from every file -->
To learn how to build our documentation, refer to
[Building documentation](./building.md).


@@ -0,0 +1,33 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="Providing feedback for ROCm documentation">
<meta name="keywords" content="documentation, pull request, GitHub, AMD, ROCm">
</head>
# Providing feedback for ROCm documentation
There are four standard ways to provide feedback for this repository.
## Pull request
All contributions to ROCm documentation should arrive via the
[GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)
targeting the `develop` branch of the repository. If you are unable to contribute
via the GitHub Flow, feel free to email us at [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
## GitHub discussions
To ask questions or view answers to frequently asked questions, refer to
[GitHub Discussions](https://github.com/RadeonOpenCompute/ROCm/discussions).
On GitHub Discussions, in addition to asking and answering questions,
members can share updates, have open-ended conversations,
and follow along via public announcements.
## GitHub issue
Issues with existing documentation, or requests for missing documentation, can be filed as
[GitHub Issues](https://github.com/RadeonOpenCompute/ROCm/issues).
## Email
Send other feedback or questions to [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).


@@ -0,0 +1,77 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="ROCm documentation toolchain">
<meta name="keywords" content="documentation, toolchain, Sphinx, Doxygen, MyST, AMD, ROCm">
</head>
# ROCm documentation toolchain
Our documentation relies on several open source toolchains and sites.
## `rocm-docs-core`
[rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) is an AMD-maintained
project that applies customization for our documentation. This
project is the tool most ROCm repositories use as part of the documentation
build. It is also available as a [pip package on PyPI](https://pypi.org/project/rocm-docs-core/).
See the user and developer guides for rocm-docs-core at {doc}`rocm-docs-core documentation<rocm-docs-core:index>`.
## Sphinx
[Sphinx](https://www.sphinx-doc.org/en/master/) is a documentation generator
originally used for Python. It is now widely used in the open source community.
Originally, Sphinx supported reStructuredText (RST) based documentation, but
Markdown support is now available.
ROCm documentation plans to default to Markdown for new projects.
Existing projects using RST are under no obligation to convert to Markdown. New
projects that believe Markdown is not suitable should contact the documentation
team prior to selecting RST.
## Read the Docs
[Read the Docs](https://docs.readthedocs.io/en/stable/) is the service that builds
our Sphinx-generated HTML documentation and hosts it for our end users.
## Doxygen
[Doxygen](https://www.doxygen.nl/) is a documentation generator that extracts
information from comments embedded in the source code.
ROCm projects typically use Doxygen for public API documentation unless the
upstream project uses a different tool.
### Breathe
[Breathe](https://www.breathe-doc.org/) is a Sphinx plugin to integrate Doxygen
content.
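For instance, a Doxygen-documented symbol can be pulled into an RST page with a Breathe directive. A minimal sketch (the function and project names here are illustrative, not taken from this repository's configuration):
```rst
.. doxygenfunction:: hipMalloc
   :project: hip
```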
### MyST
[Markedly Structured Text (MyST)](https://myst-tools.org/docs/spec) is an extended
flavor of Markdown ([CommonMark](https://commonmark.org/)) influenced by reStructuredText (RST) and Sphinx.
It is integrated into ROCm documentation by the Sphinx extension [`myst-parser`](https://myst-parser.readthedocs.io/en/latest/).
A cheat sheet showcasing MyST syntax is available at the
[Jupyter reference](https://jupyterbook.org/en/stable/reference/cheatsheet.html).
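As a small illustration, MyST lets Markdown files carry Sphinx directives. The colon-fence form below assumes the `colon_fence` MyST extension is enabled:
```markdown
:::{note}
This note is a Sphinx directive written in MyST Markdown.
:::
```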
### Sphinx External ToC
[Sphinx External ToC](https://sphinx-external-toc.readthedocs.io/en/latest/intro.html)
is a Sphinx extension used for ROCm documentation navigation. This tool generates a navigation menu on the left
based on a YAML file that specifies the table of contents.
It was selected because its flexibility allows scripts to operate on the
YAML file. Projects should transition their navigation to this file. For an
example, see the `_toc.yml.in` file in the `docs/sphinx` folder of this
repository.
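For orientation, a minimal illustrative fragment of such a file might look like the following (the entries are hypothetical; refer to `_toc.yml.in` for the real navigation):
```yaml
root: index
subtrees:
  - caption: Contribute
    entries:
      - file: contribute/index
      - file: contribute/building
```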
### Sphinx-book-theme
[Sphinx-book-theme](https://sphinx-book-theme.readthedocs.io/en/latest/) is a Sphinx theme
that defines the base appearance for ROCm documentation.
ROCm documentation applies some customization,
such as a custom header and footer on top of the Sphinx Book Theme.
### Sphinx Design
[Sphinx design](https://sphinx-design.readthedocs.io/en/latest/index.html) is a Sphinx extension that adds design
functionality.
ROCm documentation uses Sphinx Design for grids, cards, and synchronized tabs.

(The remaining files in this diff are binary images; the viewer showed only before/after size metadata, which is omitted here.)