link updates (#2861)

2026-04-05 03:01:17 -04:00 · 2024-02-08 17:24:12 -07:00
parent 82ac21fac5
commit a44f6d1efc
10 changed files with 477 additions and 392 deletions
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -1,29 +1,378 @@
+AAC
 ABI
-activations
-addr
-AddressSanitizer
-AlexNet
-alloc
-allocator
-allocators
 ALU
 AMD
 AMDGPU
-amdgpu
 AMDGPUs
 AMDMIGraphX
 AMI
 AOCC
 AOMP
-api
 APIC
 APIs
-Arb
-ASan
 ASIC
 ASICs
+ASan
 ASm
 ATI
+AddressSanitizer
+AlexNet
+Arb
+BLAS
+BMC
+BitCode
+Blit
+Bluefield
+CCD
+CDNA
+CIFAR
+CLI
+CLion
+CMake
+CMakeLists
+CMakePackage
+CP
+CPC
+CPF
+CPP
+CPU
+CPUs
+CSC
+CSE
+CSV
+CSn
+CTests
+CU
+CUDA
+CUs
+CXX
+Cavium
+CentOS
+ChatGPT
+CoRR
+Codespaces
+Commitizen
+CommonMark
+Concretized
+Conda
+ConnectX
+DGEMM
+DKMS
+DL
+DMA
+DNN
+DNNL
+DPM
+DRI
+DW
+DWORD
+Dask
+DataFrame
+DataLoader
+DataParallel
+DeepSpeed
+Dependabot
+DevCap
+Dockerfile
+Doxygen
+ELMo
+ENDPGM
+EPYC
+ESXi
+FFT
+FFTs
+FFmpeg
+FHS
+FMA
+FP
+Filesystem
+Flang
+Fortran
+Fuyu
+GALB
+GCD
+GCDs
+GCN
+GDB
+GDDR
+GDR
+GDS
+GEMM
+GEMMs
+GFortran
+GIM
+GL
+GLXT
+GMI
+GPG
+GPR
+GPT
+GPU
+GPU's
+GPUs
+GRBM
+GenAI
+GenZ
+GitHub
+Gitpod
+HBM
+HCA
+HIPCC
+HIPExtension
+HIPIFY
+HPC
+HPCG
+HPE
+HPL
+HSA
+HWE
+Haswell
+Higgs
+Hyperparameters
+ICV
+IDE
+IDEs
+IMDb
+IOMMU
+IOP
+IOPM
+IOV
+IRQ
+ISA
+ISV
+ISVs
+ImageNet
+InfiniBand
+Inlines
+IntelliSense
+Intersphinx
+Intra
+Ioffe
+JSON
+Jupyter
+KFD
+KVM
+Keras
+Khronos
+LAPACK
+LCLK
+LDS
+LLM
+LLMs
+LLVM
+LM
+LSAN
+LTS
+LoRA
+MEM
+MERCHANTABILITY
+MFMA
+MIGraphX
+MIOpen
+MIOpenGEMM
+MIVisionX
+MLM
+MMA
+MMIO
+MMIOH
+MNIST
+MPI
+MSVC
+MVAPICH
+MVFFR
+Makefile
+Makefiles
+Matplotlib
+Megatron
+Mellanox
+Mellanox's
+Meta's
+MirroredStrategy
+Multicore
+Multithreaded
+MyEnvironment
+MyST
+NBIO
+NBIOs
+NIC
+NICs
+NLI
+NLP
+NPS
+NSP
+NUMA
+NVCC
+NVIDIA
+NVPTX
+Nano
+Navi
+Noncoherently
+NousResearch's
+NumPy
+OAM
+OAMs
+OCP
+OEM
+OFED
+OMP
+OMPI
+OMPT
+OMPX
+ONNX
+OSS
+OSU
+OpenCL
+OpenCV
+OpenFabrics
+OpenGL
+OpenMP
+OpenSSL
+OpenVX
+PCI
+PCIe
+PEFT
+PIL
+PILImage
+PRNG
+PRs
+PaLM
+Pageable
+PeerDirect
+Perfetto
+PipelineParallel
+PnP
+PowerShell
+PyPi
+PyTorch
+Qcycles
+RAII
+RCCL
+RDC
+RDMA
+RDNA
+RHEL
+ROC
+ROCProfiler
+ROCTracer
+ROCclr
+ROCdbgapi
+ROCgdb
+ROCk
+ROCm
+ROCmCC
+ROCmSoftwarePlatform
+ROCmValidationSuite
+ROCr
+RST
+RW
+Radeon
+RelWithDebInfo
+Req
+Rickle
+RoCE
+Ryzen
+SALU
+SBIOS
+SCA
+SDK
+SDMA
+SDRAM
+SENDMSG
+SGPR
+SGPRs
+SHA
+SIGQUIT
+SIMD
+SIMDs
+SKU
+SKUs
+SLES
+SMEM
+SMI
+SMT
+SPI
+SQs
+SRAM
+SRAMECC
+SVD
+SWE
+SerDes
+Shlens
+Skylake
+Softmax
+Spack
+Supermicro
+Szegedy
+TCA
+TCC
+TCI
+TCIU
+TCP
+TCR
+TFLOPS
+TPU
+TPUs
+TensorBoard
+TensorFlow
+TensorParallel
+ToC
+TorchAudio
+TorchMIGraphX
+TorchScript
+TorchServe
+TorchVision
+TransferBench
+TrapStatus
+UAC
+UC
+UCC
+UCX
+UIF
+USM
+UTCL
+UTIL
+Uncached
+Unhandled
+VALU
+VBIOS
+VGPR
+VGPRs
+VM
+VMEM
+VMWare
+VRAM
+VSIX
+VSkipped
+Vanhoucke
+Vulkan
+WGP
+WX
+WikiText
+Wojna
+Workgroups
+Writebacks
+XGBoost
+XGBoost's
+XGMI
+XT
+XTX
+Xeon
+Xilinx
+Xnack
+Xteam
+YAML
+YML
+YModel
+ZeRO
+ZenDNN
+accuracies
+activations
+addr
+alloc
+allocator
+allocators
+amdgpu
+api
 atmi
 atomics
 autogenerated
@@ -33,68 +382,40 @@ backend
 backends
 benchmarking
 bilinear
-BitCode
-BLAS
-Blit
+bitsandbytes
 blit
-BMC
+boson
+bosons
 buildable
 bursty
 bzip
 cacheable
-CCD
 cd
-CDNA
-CentOS
+centos
 centric
 changelog
 chiplet
-CIFAR
-CLI
-CLion
-CMake
 cmake
-CMakeLists
-CMakePackage
 cmd
 coalescable
 codename
-Codespaces
+collater
 comgr
-Commitizen
-CommonMark
 completers
 composable
 concretization
-Concretized
-Conda
 config
 conformant
 convolutional
 convolves
-CoRR
-CP
-CPC
-CPF
-CPP
-CPU
-CPUs
-CSC
-CSE
-CSn
+cpp
 csn
-CSV
-CTests
-CU
 cuBLAS
-CUDA
 cuFFT
 cuLIB
 cuRAND
-CUs
 cuSOLVER
 cuSPARSE
-CXX
 dataset
 datasets
 dataspace
@@ -103,150 +424,79 @@ datatypes
 dbgapi
 de
 deallocation
+denoise
+denoised
+denoises
 denormalize
-Dependabot
 deserializers
 detections
 dev
-DevCap
 devicelibs
 devsel
-DGEMM
+dimensionality
 disambiguates
 distro
-DL
-DMA
-DNN
-DNNL
-Dockerfile
-Doxygen
-DPM
-DRI
-DW
-DWORD
 el
+embeddings
 enablement
 endpgm
 env
 epilog
-EPYC
-ESXi
+etcetera
 ethernet
 exascale
 executables
 ffmpeg
-FFT
-FFTs
-FHS
 filesystem
-Filesystem
-Flang
-FMA
-Fortran
 fortran
-FP
 galb
 gcc
-GCD
-GCDs
-GCN
-GDB
 gdb
-GDDR
-GDR
-GDS
-GEMM
-GEMMs
-GenZ
 gfortran
 gfx
-GIM
+githooks
 github
-Gitpod
-GL
-GLXT
-GMI
 gnupg
-GPG
-GPR
-GPU
-GPUs
 grayscale
-GRBM
 gzip
-Haswell
-HBM
-HCA
 heterogenous
-hipamd
 hipBLAS
-hipblas
 hipBLASLt
-HIPCC
 hipCUB
-hipcub
-HIPExtension
 hipFFT
-hipfft
-hipfort
-HIPIFY
-hipify
 hipLIB
 hipRAND
 hipSOLVER
-hipsolver
 hipSPARSE
-hipsparse
 hipSPARSELt
 hipTensor
-HPC
-HPCG
-HPE
-HPL
-HSA
+hipamd
+hipblas
+hipcub
+hipfft
+hipfort
+hipify
+hipsolver
+hipsparse
+hpp
 hsa
 hsakmt
-HWE
+hyperparameter
 ib_core
-ICV
-IDE
-IDEs
-ImageNet
-IMDB
 inband
 incrementing
 inferencing
-InfiniBand
 inflight
 init
-Inlines
+initializer
 inlining
 installable
-IntelliSense
 interprocedural
-Intersphinx
 intra
 invariants
 invocating
-Ioffe
-IOMMU
-IOP
-IOPM
-IOV
 ipo
-IRQ
-ISA
-ISV
-ISVs
-JSON
-Jupyter
 kdb
-KFD
-Khronos
-KVM
-LAPACK
-LCLK
-LDS
 libfabric
 libjpeg
 libs
@@ -254,96 +504,37 @@ linearized
 linter
 linux
 llvm
-LLVM
 localscratch
 logits
 lossy
-LSAN
-LTS
-Makefile
-Makefiles
+macOS
 matchers
-Matplotlib
-Mellanox's
-MEM
-MERCHANTABILITY
-MFMA
 microarchitecture
-MIGraphX
 migraphx
-MIOpen
 miopen
-MIOpenGEMM
 miopengemm
-MIVisionX
 mivisionx
 mkdir
 mlirmiopen
-MMA
-MMIO
-MMIOH
-MNIST
-MPI
-MSVC
 mtypes
-Multicore
-Multithreaded
-MVAPICH
 mvffr
-MyEnvironment
-MyST
 namespace
 namespaces
-Nano
-Navi
-NBIO
-NBIOs
-NIC
-NICs
-Noncoherently
-NPS
-NUMA
-NumPy
 numref
-NVCC
-NVPTX
-OAM
-OAMs
 ocl
-OCP
-OEM
-OFED
-OMP
-OMPT
-OMPX
-ONNX
-OpenCL
 opencl
 opencv
-OpenFabrics
-OpenGL
-OpenMP
 openmp
 openssl
-OpenVX
 optimizers
 os
-OSS
-OSU
-Pageable
 pageable
+parallelization
+parameterization
 passthrough
-PCI
-PCIe
-PeerDirect
 perfcounter
-Perfetto
 performant
 perl
-PIL
-PILImage
-PowerShell
-PnP
 pragma
 pre
 prebuilt
@@ -351,125 +542,69 @@ precompiled
 prefetch
 prefetchable
 preprocess
+preprocessed
 preprocessing
-preq
 prequantized
 prerequisites
-PRNG
 profiler
 protobuf
-PRs
 pseudorandom
 py
-PyPi
-PyTorch
-Qcycles
 quasirandom
 queueing
-Radeon
-RadeonOpenCompute
-RCCL
 rccl
-RDC
 rdc
-RDMA
-RDNA
+reStructuredText
 reformats
-RelWithDebInfo
 repos
-Req
+representativeness
 req
 resampling
-RST
-reStructuredText
-RHEL
-Rickle
+rescaling
+reusability
 roadmap
 roc
-ROC
-RoCE
 rocAL
 rocALUTION
-rocalution
 rocBLAS
+rocFFT
+rocLIB
+rocMLIR
+rocPRIM
+rocRAND
+rocSOLVER
+rocSPARSE
+rocThrust
+rocWMMA
+rocalution
 rocblas
 rocclr
-ROCdbgapi
-rocFFT
 rocfft
-ROCgdb
-ROCk
-rocLIB
 rocm
-ROCm
-ROCmCC
 rocminfo
-rocMLIR
-ROCmSoftwarePlatform
-ROCmValidationSuite
-rocPRIM
 rocprim
 rocprof
-ROCProfiler
 rocprofiler
-ROCr
 rocr
-rocRAND
 rocrand
-rocSOLVER
 rocsolver
-rocSPARSE
 rocsparse
-roct
-rocThrust
 rocthrust
-ROCTracer
 roctracer
-rocWMMA
-RST
 runtime
 runtimes
-RW
-Ryzen
-SALU
-SBIOS
-SCA
-scalability
-SDK
-SDMA
-SDRAM
-SENDMSG
-sendmsg
-SENDMSG
-sendmsg
-SerDes
-serializers
-SGPR
-SGPRs
-SHA
-shader
-Shlens
-sigmoid
-SIGQUIT
-SIMD
-SIMDs
-SKU
-SKUs
-skylake
 sL
-SLES
+scalability
+scalable
+sendmsg
+serializers
+shader
+sharding
+sigmoid
 sm
-SMEM
-SMI
 smi
-SMT
 softmax
-Spack
 spack
-SPI
-SQs
-SRAM
-SRAMECC
 src
 stochastically
 strided
@@ -478,49 +613,23 @@ subexpression
 subfolder
 subfolders
 supercomputing
-Supermicro
-SWE
-Szegedy
-tagram
-TCA
-TCC
-TCI
-TCIU
-TCP
-TCR
-TensorBoard
-TensorFlow
-TFLOPS
-tg
 th
-tmp
-ToC
+tokenization
 tokenize
+tokenized
+tokenizer
+tokenizes
 toolchain
 toolchains
 toolset
 toolsets
-TorchAudio
-TorchMIGraphX
-TorchScript
-TorchServe
-TorchVision
 torchvision
+tqdm
 tracebacks
-TransferBench
-TrapStatus
 txt
-UAC
 uarch
-ubuntu
-UC
-UCC
-UCX
-UIF
-Uncached
 uncached
 uncorrectable
-Unhandled
 uninstallation
 unsqueeze
 unstacking
@@ -532,9 +641,8 @@ USM
 UTCL
 UTIL
 utils
-VALU
-Vanhoucke
-VBIOS
+vL
+variational
 vdi
 vectorizable
 vectorization
@@ -542,44 +650,20 @@ vectorize
 vectorized
 vectorizer
 vectorizes
-VGPR
-VGPRs
 vjxb
-vL
-VM
-VMEM
-VMWare
-VRAM
-VSIX
-VSkipped
-Vulkan
 walkthrough
 walkthroughs
 wavefront
 wavefronts
-WGP
 whitespaces
-Wojna
 workgroup
-Workgroups
 workgroups
 writeback
-Writebacks
 writebacks
 wrreq
-WX
 wzo
-Xeon
-XGMI
-Xnack
-XT
-Xteam
-XTX
+xargs
 xz
-YAML
 yaml
-YML
-YModel
 ysvmadyb
-ZenDNN
 zypper
--- a/docs/conceptual/More-about-how-ROCm-uses-PCIe-Atomics.rst
+++ b/docs/conceptual/More-about-how-ROCm-uses-PCIe-Atomics.rst
@@ -63,15 +63,14 @@ There are also a number of papers which talk about these new capabilities:

  * `Atomic Read Modify Write Primitives by Intel <https://www.intel.es/content/dam/doc/white-paper/atomic-read-modify-write-primitives-i-o-devices-paper.pdf>`_
  * `PCI express 3 Accelerator White paper by Intel <https://www.intel.sg/content/dam/doc/white-paper/pci-express3-accelerator-white-paper.pdf>`_
-  * `Intel PCIe Generation 3 Hotchips Paper <https://www.hotchips.org/wp-content/uploads/hc_archives/hc21/1_sun/HC21.23.1.SystemInterconnectTutorial-Epub/HC21.23.131.Ajanovic-Intel-PCIeGen3.pdf>`_
  * `PCIe Generation 4 Base Specification includes atomic operations <https://astralvx.com/storage/2020/11/PCI_Express_Base_4.0_Rev0.3_February19-2014.pdf>`_
-
-Other I/O devices with PCIe atomics support
-
-  * `Mellanox ConnectX-5 InfiniBand Card <http://www.mellanox.com/related-docs/prod_adapter_cards/PB_ConnectX-5_VPI_Card.pdf>`_
-  * `Cray Aries Interconnect <http://www.hoti.org/hoti20/slides/Bob_Alverson.pdf>`_
  * `Xilinx PCIe Ultrascale White paper <https://docs.xilinx.com/v/u/8OZSA2V1b1LLU2rRCDVGQw>`_
-  * `Xilinx 7 Series Devices <https://docs.xilinx.com/v/u/1nfXeFNnGpA0ywyykvWHWQ>`_
+
+Other I/O devices with PCIe atomics support:
+
+  * Mellanox ConnectX-5 InfiniBand Card
+  * Cray Aries Interconnect
+  * Xilinx 7 Series Devices

 Future bus technology with richer I/O atomics operation Support

@@ -80,8 +79,8 @@ Future bus technology with richer I/O atomics operation Support
 New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPUs
 with PCIe Generation 3.0 support.

-  * `Mellanox Bluefield SOC <https://docs.nvidia.com/networking/display/BlueFieldSWv25111213/BlueField+Software+Overview>`_
-  * `Cavium Thunder X2 <https://en.wikichip.org/wiki/cavium/thunderx2>`_
+  * Mellanox Bluefield SOC
+  * Cavium Thunder X2

 In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU
 originates two writes to two different targets:
--- a/docs/conceptual/ai-pytorch-inception.md
+++ b/docs/conceptual/ai-pytorch-inception.md
@@ -22,6 +22,7 @@ Training occurs in multiple phases for every batch of training data. the followi
 :::{table} Types of Training Phases
 :name: training-phases
 :widths: auto
+
 | Types of Phases   |     |
 | ----------------- | --- |
 | Forward Pass      | The input features are fed into the model, whose parameters may be randomly initialized initially. Activations (outputs) of each layer are retained during this pass to help in the loss gradient computation during the backward pass. |
@@ -35,6 +36,7 @@ Training is different from inference, particularly from the hardware perspective
 :::{table} Training vs. Inference
 :name: training-inference
 :widths: auto
+
 | Training | Inference |
 | ----------- | ----------- |
 | Training is measured in hours/days. | The inference is measured in minutes. |
@@ -876,7 +878,7 @@ To understand the code step by step, follow these steps:
        thisplot[true_label].set_color('blue')
        ```

-    9. With the model trained, you can use it to make predictions about some images. Review the 0-th image predictions and the prediction array. Correct prediction labels are blue, and incorrect prediction labels are red. The number gives the percentage (out of 100) for the predicted label.
+    9. With the model trained, you can use it to make predictions about some images. Review the 0<sup>th</sup> image predictions and the prediction array. Correct prediction labels are blue, and incorrect prediction labels are red. The number gives the percentage (out of 100) for the predicted label.

        ```py
        i = 0
--- a/docs/conceptual/gpu-arch/mi200-performance-counters.md
+++ b/docs/conceptual/gpu-arch/mi200-performance-counters.md
@@ -288,9 +288,9 @@ The vector L1 cache subsystem counters are further classified into Texture Addre
 | `TCP_GATE_EN2[n]`                        | Cycles | Number of cycles vL1D core clocks are turned on. Value range for n: [0-15].  |
 | `TCP_TD_TCP_STALL_CYCLES[n]`             | Cycles | Number of cycles TD stalls vL1D. Value range for n: [0-15].                           |
 | `TCP_TCR_TCP_STALL_CYCLES[n]`            | Cycles | Number of cycles TCR stalls vL1D. Value range for n: [0-15].                           |
-| `TCP_READ_TAGCONFLICT_STALL_CYCLES[n]`   | Cycles | Number of cycles tagram conflict stalls on a read. Value range for n: [0-15].          |
-| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES[n]`  | Cycles | Number of cycles tagram conflict stalls on a write. Value range for n: [0-15].         |
-| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tagram conflict stalls on an atomic. Value range for n: [0-15].       |
+| `TCP_READ_TAGCONFLICT_STALL_CYCLES[n]`   | Cycles | Number of cycles tag RAM conflict stalls on a read. Value range for n: [0-15].          |
+| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES[n]`  | Cycles | Number of cycles tag RAM conflict stalls on a write. Value range for n: [0-15].         |
+| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tag RAM conflict stalls on an atomic. Value range for n: [0-15].       |
 | `TCP_PENDING_STALL_CYCLES[n]`            | Cycles | Number of cycles vL1D cache is stalled due to data pending from L2 Cache. Value range for n: [0-15]. |
 | `TCP_TCP_TA_DATA_STALL_CYCLES` | Cycles | Number of cycles TCP stalls TA data interface. |
 | `TCP_TA_TCP_STATE_READ[n]`               | Req    | Number of state reads. Value range for n: [0-15].    |
@@ -454,7 +454,7 @@ L2 Cache is also known as Texture Cache per Channel (TCC).
 | `TCC_NORMAL_WRITEBACK_sum` | Total number of writebacks due to requests that are not writeback requests, over all TCC instances. |
 | `TCC_NORMAL_EVICT_sum` | Total number of evictions due to requests that are not invalidate or probe requests, over all TCC instances. |
 | `TCC_PROBE_sum` | Total number of probe requests over all TCC instances. |
-| `TCC_PROBE_ALL_sum` | Total number of external probe requests with EA_TCC_preq_all== 1, over all TCC instances. |
+| `TCC_PROBE_ALL_sum` | Total number of external probe requests with `EA_TCC_preq_all== 1`, over all TCC instances. |
 | `TCC_READ_sum` | Total number of L2 cache read requests (including compressed reads but not metadata reads) over all TCC instances. |
 | `TCC_REQ_sum` | Total number of all types of L2 cache requests over all TCC instances. |
 | `TCC_RW_REQ_sum` | Total number of RW requests over all TCC instances. |
@@ -465,11 +465,11 @@ L2 Cache is also known as Texture Cache per Channel (TCC).
 | `TCC_WRITE_sum` | Total number of L2 cache write requests over all TCC instances. |
 | `TCC_WRITEBACK_sum` | Total number of lines written back to the main memory including writebacks of dirty lines and uncached write/atomic requests, over all TCC instances. |
 | `TCC_WRREQ_STALL_max` | Maximum number of cycles a write request is stalled, over all TCC instances. |
-| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on an atomic, over all TCP instances. |
+| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tag RAM conflict stalls on an atomic, over all TCP instances. |
 | `TCP_GATE_EN1_sum` | Total number of cycles vL1D interface clocks are turned on, over all TCP instances. |
 | `TCP_GATE_EN2_sum` | Total number of cycles vL1D core clocks are turned on, over all TCP instances. |
 | `TCP_PENDING_STALL_CYCLES_sum` | Total number of cycles vL1D cache is stalled due to data pending from L2 Cache, over all TCP instances. |
-| `TCP_READ_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on a read, over all TCP instances. |
+| `TCP_READ_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tag RAM conflict stalls on a read, over all TCP instances. |
 | `TCP_TA_TCP_STATE_READ_sum` | Total number of state reads by all TCP instances. |
 | `TCP_TCC_ATOMIC_WITH_RET_REQ_sum` | Total number of atomic requests to L2 cache with return, over all TCP instances. |
 | `TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum` | Total number of atomic requests to L2 cache without return, over all TCP instances. |
@@ -504,7 +504,7 @@ L2 Cache is also known as Texture Cache per Channel (TCC).
 | `TCP_UTCL1_TRANSLATION_MISS_sum` | Total number of UTCL1 translation misses by all TCP instances. |
 | `TCP_UTCL1_TRANSLATION_HIT_sum` | Total number of UTCL1 translation hits by all TCP instances. |
 | `TCP_VOLATILE_sum` | Total number of L1 volatile pixels/buffers from TA, over all TCP instances. |
-| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on a write, over all TCP instances. |
+| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tag RAM conflict stalls on a write, over all TCP instances. |
 | `TD_ATOMIC_WAVEFRONT_sum` | Total number of atomic wavefront instructions, over all TD instances. |
 | `TD_COALESCABLE_WAVEFRONT_sum` | Total number of coalescable wavefronts according to TA, over all TD instances. |
 | `TD_LOAD_WAVEFRONT_sum` | Total number of wavefront instructions (read/write/atomic), over all TD instances. |
--- a/docs/data/banner-text.xcf
+++ b/docs/data/banner-text.xcf
--- a/docs/data/how-to/magma005.png
+++ b/docs/data/how-to/magma005.png
--- a/docs/how-to/deep-learning-rocm.md
+++ b/docs/how-to/deep-learning-rocm.md
@@ -13,7 +13,7 @@ the sequential flow for the use of each framework. Refer to the ROCm Compatible
 Frameworks Release Notes for each framework's most current release notes at
 {doc}`Third-party support<rocm-install-on-linux:reference/3rd-party-support-matrix>`.

-![ROCm Compatible Frameworks Flowchart](../data/install/magma-install/magma005.png "ROCm Compatible Frameworks")
+![ROCm Compatible Frameworks Flowchart](../data/how-to/magma005.png "ROCm Compatible Frameworks")

 ## Frameworks installation

--- a/docs/how-to/tuning-guides/w6000-v620.md
+++ b/docs/how-to/tuning-guides/w6000-v620.md
@@ -122,8 +122,7 @@ sudo reboot
 ```

 Install the GPU-IOV Module (GIM, where IOV is I/O Virtualization) driver and
-follow the steps below. To obtain the GIM driver, write to us
-[here](mailto:CloudGPUsupport@amd.com):
+follow the steps below.

 ```shell
 sudo dpkg -i <gim_driver>
@@ -167,6 +166,4 @@ First, assign GPU virtual function (VF) to VM using the following steps.
 Then start the VM.

 Finally install ROCm on the virtual machine (VM). For detailed instructions,
-refer to the {doc}`Linux install guide<rocm-install-on-linux:how-to/native-install/index>`. For any
-issue encountered during installation, write to us
-[here](mailto:CloudGPUsupport@amd.com).
+refer to the {doc}`Linux install guide<rocm-install-on-linux:how-to/native-install/index>`.
--- a/docs/what-is-rocm.md
+++ b/docs/what-is-rocm.md
@@ -37,55 +37,58 @@ ROCm consists of the following drivers, development tools, and APIs.
 | Project | Description |
 | :---------------- | :------------ |
 | [AMD Compute Language Runtimes (CLR)](https://github.com/ROCm-Developer-Tools/clr) | Contains source code for AMD's compute languages runtimes: {doc}`HIP <hip:index>` and OpenCL |
+| {doc}`AMD SMI <amdsmi:index>` | A C library for Linux that provides a user space interface for applications to monitor and control AMD devices |
 | [AOMP](https://github.com/ROCm-Developer-Tools/aomp/) | A scripted build of [LLVM](https://github.com/RadeonOpenCompute/llvm-project) and supporting software |
 | [Asynchronous Task and Memory Interface (ATMI)](https://github.com/RadeonOpenCompute/atmi/) | A runtime framework for efficient task management in heterogeneous CPU-GPU systems |
-| [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) | A library that aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures |
+| {doc}`Composable Kernel <composable_kernel:index>` | A library that aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures |
 | [Flang](https://github.com/ROCm-Developer-Tools/flang/) | An out-of-tree Fortran compiler targeting LLVM |
 | [Half-precision floating point library (half)](https://github.com/ROCmSoftwarePlatform/half/) | A C++ header-only library that provides an IEEE 754 conformant, 16-bit half-precision floating-point type along with corresponding arithmetic operators, type conversions, and common mathematical functions |
 | {doc}`HIP <hip:index>` | AMD’s GPU programming language extension and the GPU runtime |
-| [hipBLAS](https://rocm.docs.amd.com/projects/hipBLAS/en/latest/) | A BLAS-marshaling library that supports [rocBLAS](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/) and cuBLAS backends |
-| [HIPCC](https://rocm.docs.amd.com/projects/HIPCC/en/latest/) | A compiler driver utility that calls Clang or NVCC and passes the appropriate include and library options for the target compiler and HIP infrastructure |
-| [hipCUB](https://rocm.docs.amd.com/projects/hipCUB/en/latest/) | A thin header-only wrapper library on top of [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) or CUB that allows project porting using the CUB library to the HIP layer |
-| [hipFFT](https://rocm.docs.amd.com/projects/hipFFT/en/latest/) | A fast Fourier transforms (FFT)-marshalling library that supports rocFFT or cuFFT backends |
-| [hipfort](https://rocm.docs.amd.com/projects/hipfort/en/latest/) | A Fortran interface library for accessing GPU Kernels |
+| {doc}`hipBLAS <hipblas:index>` | A BLAS-marshaling library that supports [rocBLAS](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/) and cuBLAS backends |
+| {doc}`hipBLASLt <hipblaslt:index>` | A library that provides general matrix-matrix operations with a flexible API and extends functionalities beyond traditional BLAS library |
+| [HIPCC](https://github.com/ROCm/HIPCC) | A compiler driver utility that calls Clang or NVCC and passes the appropriate include and library options for the target compiler and HIP infrastructure |
+| {doc}`hipCUB <hipcub:index>` | A thin header-only wrapper library on top of [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) or CUB that allows project porting using the CUB library to the HIP layer |
+| {doc}`hipFFT <hipfft:index>` | A fast Fourier transforms (FFT)-marshalling library that supports rocFFT or cuFFT backends |
+| {doc}`hipfort <hipfort:index>` | A Fortran interface library for accessing GPU Kernels |
 | {doc}`HIPIFY <hipify:index>` | A set of tools for translating CUDA source code into portable HIP C++ |
-| [hipify-clang](https://rocm.docs.amd.com/projects/HIPIFY/en/latest/hipify-clang.html) | A Clang-based tool for translating CUDA sources into HIP sources |
-| [hipify-perl](https://rocm.docs.amd.com/projects/HIPIFY/en/latest/hipify-perl.html) | An autogenerated, perl-based script that translates CUDA source code into portable HIP C++ |
-| [hipSOLVER](https://rocm.docs.amd.com/projects/hipSOLVER/en/latest/) | An LAPACK-marshalling library that supports [rocSOLVER](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/) and cuSOLVER backends |
-| [hipSPARSE](https://rocm.docs.amd.com/projects/hipSPARSE/en/latest/)  | A SPARSE-marshalling library that supports [rocSPARSE](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/) and cuSPARSE backends |
-| [hipTensor](https://rocm.docs.amd.com/projects/hipTensor/en/latest/index.html) | AMD's C++ library for accelerating tensor primitives based on the composable kernel library |
+| {doc}`hipRAND <hiprand:index>` | A wrapper library to easily port CUDA applications that use the cuRAND library into the HIP layer |
+| {doc}`hipSOLVER <hipsolver:index>` | An LAPACK-marshalling library that supports [rocSOLVER](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/) and cuSOLVER backends |
+| {doc}`hipSPARSE <hipsparse:index>` | A SPARSE-marshalling library that supports [rocSPARSE](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/) and cuSPARSE backends |
+| {doc}`hipSPARSELt <hipsparselt:index>` | A SPARSE-marshalling library with multiple supported backends |
+| {doc}`hipTensor <hiptensor:index>` | AMD's C++ library for accelerating tensor primitives based on the composable kernel library |
 | [LLVM](https://github.com/RadeonOpenCompute/llvm-project) | A toolkit for the construction of highly optimized compilers, optimizers, and run-time environments |
-| [MIGraphX](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/) | A graph inference engine that accelerates machine learning model inference |
-| [MIOpen](https://rocm.docs.amd.com/projects/MIOpen/en/latest/) | An open source deep-learning library |
+| {doc}`MIGraphX <amdmigraphx:index>` | A graph inference engine that accelerates machine learning model inference |
+| {doc}`MIOpen <miopen:index>` | An open source deep-learning library |
 | [MIOpenGEMM](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM) | An OpenCL general matrix multiplication (GEMM) API and kernel generator |
-| [MIOpenTensile](https://github.com/ROCmSoftwarePlatform/MIOpenTensile) | Provides host-callable interfaces to Tensile library |
-| [MIVisionX](https://rocm.docs.amd.com/projects/MIVisionX/en/latest/doxygen/html/index.html) | A set of comprehensive computer vision and machine learning libraries, utilities, and applications |
+| {doc}`MIVisionX <mivisionx:doxygen/html/index>` | A set of comprehensive computer vision and machine learning libraries, utilities, and applications |
 | [Radeon Compute Profiler (RCP)](https://github.com/GPUOpen-Tools/radeon_compute_profiler/) | A performance analysis tool that gathers data from the API run-time and GPU for OpenCL and ROCm/HSA applications |
-| [RCCL](https://rocm.docs.amd.com/projects/rccl/en/latest/) | A standalone library that provides multi-GPU and multi-node collective communication primitives |
-| [rocAL](https://rocm.docs.amd.com/projects/rocAL/en/latest/doxygen/html/index.html) | An augmentation library designed to decode and process images and videos |
-| [rocALUTION](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/) | A sparse linear algebra library for exploring fine-grained parallelism on ROCm runtime and toolchains |
+| {doc}`RCCL <rccl:index>` | A standalone library that provides multi-GPU and multi-node collective communication primitives |
+| {doc}`rocAL <rocal:index>` | An augmentation library designed to decode and process images and videos |
+| {doc}`rocALUTION <rocalution:index>` | A sparse linear algebra library for exploring fine-grained parallelism on ROCm runtime and toolchains |
 | [RocBandwidthTest](https://github.com/RadeonOpenCompute/rocm_bandwidth_test/) | Captures the performance characteristics of buffer copying and kernel read/write operations |
-| [rocBLAS](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/)| A BLAS implementation (in the HIP programming language) on the ROCm runtime and toolchains |
-| [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/) | A software library for computing fast Fourier transforms (FFTs) written in HIP |
+| {doc}`rocBLAS <rocblas:index>` | A BLAS implementation (in the HIP programming language) on the ROCm runtime and toolchains |
+| {doc}`rocDecode <rocdecode:index>` | A high performance video decode SDK for AMD GPUs |
+| {doc}`rocFFT <rocfft:index>` | A software library for computing fast Fourier transforms (FFTs) written in HIP |
 | [ROCK-Kernel-Driver](https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/) | An AMDGPU Driver with KFD that is used by ROCm |
-| [ROCmCC](https://rocm.docs.amd.com/en/latest/reference/rocmcc/rocmcc.html) | A Clang/LLVM-based compiler |
+| [ROCmCC](./reference/rocmcc.md) | A Clang/LLVM-based compiler |
 | [ROCm cmake](https://github.com/RadeonOpenCompute/rocm-cmake) | A collection of CMake modules for common build and development tasks |
-| [ROCm Data Center Tool](https://rocm.docs.amd.com/projects/rdc/en/latest/) | Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments |
+| {doc}`ROCm Data Center Tool <rdc:index>` | Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments |
 | [ROCm Debug Agent Library (ROCdebug-agent)](https://github.com/ROCm-Developer-Tools/rocr_debug_agent/) | A library that can print the state of all AMD GPU wavefronts that caused a queue error by sending a SIGQUIT signal to the process while the program is running |
-| [ROCm Debugger (ROCgdb)](https://rocm.docs.amd.com/projects/ROCgdb/en/latest/) | A source-level debugger for Linux, based on the GNU Debugger (GDB) |
-| [ROCdbgapi](https://rocm.docs.amd.com/projects/ROCdbgapi/en/latest/) | The ROCm debugger API library |
+| {doc}`ROCm debugger (ROCgdb) <rocgdb:index>` | A source-level debugger for Linux, based on the GNU Debugger (GDB) |
+| {doc}`ROCdbgapi <rocdbgapi:index>` | The ROCm debugger API library |
 | [rocminfo](https://github.com/RadeonOpenCompute/rocminfo/) | Reports system information |
-| [ROCm SMI](https://github.com/RadeonOpenCompute/rocm_smi_lib/) | A C library for Linux that provides a user space interface for applications to monitor and control GPU applications |
-| [ROCm Validation Suite](https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/) | A tool for detecting and troubleshooting common problems affecting AMD GPUs running in a high-performance computing environment |
-| [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) | A header-only library for HIP parallel primitives |
-| [ROCProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/profiler_home_page.html) | A profiling tool for HIP applications |
-| [rocRAND](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) | Provides functions that generate pseudorandom and quasirandom numbers |
+| {doc}`ROCm Performance Primitives Library <rpp:index>` | A comprehensive high-performance computer vision library for AMD processors with HIP/OpenCL/CPU back-ends |
+| {doc}`ROCm SMI <rocm_smi_lib:index>` | A C library for Linux that provides a user space interface for applications to monitor and control GPU applications |
+| {doc}`ROCm Validation Suite <rocmvalidationsuite:index>` | A tool for detecting and troubleshooting common problems affecting AMD GPUs running in a high-performance computing environment |
+| {doc}`rocPRIM <rocprim:index>` | A header-only library for HIP parallel primitives |
+| {doc}`ROCProfiler <rocprofiler:profiler_home_page>` | A profiling tool for HIP applications |
+| {doc}`rocRAND <rocrand:index>` | Provides functions that generate pseudorandom and quasirandom numbers |
 | [ROCR-Runtime](https://github.com/RadeonOpenCompute/ROCR-Runtime/) | User-mode API interfaces and libraries necessary for host applications to launch compute kernels on available HSA ROCm kernel agents |
-| [rocSOLVER](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/) | An implementation of LAPACK routines on ROCm software, implemented in the HIP programming language and optimized for AMD’s latest discrete GPUs |
-| [rocSPARSE](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/) | Exposes a common interface that provides BLAS for sparse computation implemented on ROCm runtime and toolchains (in the HIP programming language) |
-| [rocThrust](https://rocm.docs.amd.com/projects/rocThrust/en/latest/) | A parallel algorithm library |
+| {doc}`rocSOLVER <rocsolver:index>` | An implementation of LAPACK routines on ROCm software, implemented in the HIP programming language and optimized for AMD’s latest discrete GPUs |
+| {doc}`rocSPARSE <rocsparse:index>` | Exposes a common interface that provides BLAS for sparse computation implemented on ROCm runtime and toolchains (in the HIP programming language) |
+| {doc}`rocThrust <rocthrust:index>` | A parallel algorithm library |
 | [ROCT-Thunk-Interface](https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/) | User-mode API interfaces used to interact with the ROCk driver |
-| [ROCTracer](https://rocm.docs.amd.com/projects/roctracer/en/latest/) | Intercepts runtime API calls and traces asynchronous activity |
-| [rocWMMA](https://rocm.docs.amd.com/projects/rocWMMA/en/latest/index.html) | A C++ library for accelerating mixed-precision matrix multiply-accumulate (MMA) operations |
+| {doc}`ROCTracer <roctracer:index>` | Intercepts runtime API calls and traces asynchronous activity |
+| {doc}`rocWMMA <rocwmma:index>` | A C++ library for accelerating mixed-precision matrix multiply-accumulate (MMA) operations |
 | [Tensile](https://github.com/ROCmSoftwarePlatform/Tensile) | A tool for creating benchmark-driven backend libraries for GEMMs, GEMM-like problems, and general N-dimensional tensor contractions |
-| [TransferBench](https://rocm.docs.amd.com/projects/TransferBench/en/latest/) | A utility to benchmark simultaneous transfers between user-specified devices (CPUs/GPUs) |
+| {doc}`TransferBench <transferbench:index>` | A utility to benchmark simultaneous transfers between user-specified devices (CPUs/GPUs) |
--- a/tools/autotag/util/mivisionx.py
+++ b/tools/autotag/util/mivisionx.py
@@ -34,7 +34,7 @@ def mivisionx_processor(data: ReleaseLib, template: str, _) -> bool:
 </p>

 ## Online Documentation
-[MIVisionX Documentation](https://gpuopen-professionalcompute-libraries.github.io/MIVisionX)
+[MIVisionX Documentation](https://rocm.docs.amd.com/projects/MIVisionX/en/latest/doxygen/html/index.html)
 ## MIVisionX {match['lib_version']}
 {match["body"]}
 {dependency_map}