link updates (#2861)

This commit is contained in:
Lisa
2024-02-08 17:24:12 -07:00
committed by GitHub
parent 82ac21fac5
commit a44f6d1efc
10 changed files with 477 additions and 392 deletions

View File

@@ -1,29 +1,378 @@
AAC
ABI
activations
addr
AddressSanitizer
AlexNet
alloc
allocator
allocators
ALU
AMD
AMDGPU
amdgpu
AMDGPUs
AMDMIGraphX
AMI
AOCC
AOMP
api
APIC
APIs
Arb
ASan
ASIC
ASICs
ASan
ASm
ATI
AddressSanitizer
AlexNet
Arb
BLAS
BMC
BitCode
Blit
Bluefield
CCD
CDNA
CIFAR
CLI
CLion
CMake
CMakeLists
CMakePackage
CP
CPC
CPF
CPP
CPU
CPUs
CSC
CSE
CSV
CSn
CTests
CU
CUDA
CUs
CXX
Cavium
CentOS
ChatGPT
CoRR
Codespaces
Commitizen
CommonMark
Concretized
Conda
ConnectX
DGEMM
DKMS
DL
DMA
DNN
DNNL
DPM
DRI
DW
DWORD
Dask
DataFrame
DataLoader
DataParallel
DeepSpeed
Dependabot
DevCap
Dockerfile
Doxygen
ELMo
ENDPGM
EPYC
ESXi
FFT
FFTs
FFmpeg
FHS
FMA
FP
Filesystem
Flang
Fortran
Fuyu
GALB
GCD
GCDs
GCN
GDB
GDDR
GDR
GDS
GEMM
GEMMs
GFortran
GIM
GL
GLXT
GMI
GPG
GPR
GPT
GPU
GPU's
GPUs
GRBM
GenAI
GenZ
GitHub
Gitpod
HBM
HCA
HIPCC
HIPExtension
HIPIFY
HPC
HPCG
HPE
HPL
HSA
HWE
Haswell
Higgs
Hyperparameters
ICV
IDE
IDEs
IMDb
IOMMU
IOP
IOPM
IOV
IRQ
ISA
ISV
ISVs
ImageNet
InfiniBand
Inlines
IntelliSense
Intersphinx
Intra
Ioffe
JSON
Jupyter
KFD
KVM
Keras
Khronos
LAPACK
LCLK
LDS
LLM
LLMs
LLVM
LM
LSAN
LTS
LoRA
MEM
MERCHANTABILITY
MFMA
MIGraphX
MIOpen
MIOpenGEMM
MIVisionX
MLM
MMA
MMIO
MMIOH
MNIST
MPI
MSVC
MVAPICH
MVFFR
Makefile
Makefiles
Matplotlib
Megatron
Mellanox
Mellanox's
Meta's
MirroredStrategy
Multicore
Multithreaded
MyEnvironment
MyST
NBIO
NBIOs
NIC
NICs
NLI
NLP
NPS
NSP
NUMA
NVCC
NVIDIA
NVPTX
Nano
Navi
Noncoherently
NousResearch's
NumPy
OAM
OAMs
OCP
OEM
OFED
OMP
OMPI
OMPT
OMPX
ONNX
OSS
OSU
OpenCL
OpenCV
OpenFabrics
OpenGL
OpenMP
OpenSSL
OpenVX
PCI
PCIe
PEFT
PIL
PILImage
PRNG
PRs
PaLM
Pageable
PeerDirect
Perfetto
PipelineParallel
PnP
PowerShell
PyPi
PyTorch
Qcycles
RAII
RCCL
RDC
RDMA
RDNA
RHEL
ROC
ROCProfiler
ROCTracer
ROCclr
ROCdbgapi
ROCgdb
ROCk
ROCm
ROCmCC
ROCmSoftwarePlatform
ROCmValidationSuite
ROCr
RST
RW
Radeon
RelWithDebInfo
Req
Rickle
RoCE
Ryzen
SALU
SBIOS
SCA
SDK
SDMA
SDRAM
SENDMSG
SGPR
SGPRs
SHA
SIGQUIT
SIMD
SIMDs
SKU
SKUs
SLES
SMEM
SMI
SMT
SPI
SQs
SRAM
SRAMECC
SVD
SWE
SerDes
Shlens
Skylake
Softmax
Spack
Supermicro
Szegedy
TCA
TCC
TCI
TCIU
TCP
TCR
TFLOPS
TPU
TPUs
TensorBoard
TensorFlow
TensorParallel
ToC
TorchAudio
TorchMIGraphX
TorchScript
TorchServe
TorchVision
TransferBench
TrapStatus
UAC
UC
UCC
UCX
UIF
USM
UTCL
UTIL
Uncached
Unhandled
VALU
VBIOS
VGPR
VGPRs
VM
VMEM
VMWare
VRAM
VSIX
VSkipped
Vanhoucke
Vulkan
WGP
WX
WikiText
Wojna
Workgroups
Writebacks
XGBoost
XGBoost's
XGMI
XT
XTX
Xeon
Xilinx
Xnack
Xteam
YAML
YML
YModel
ZeRO
ZenDNN
accuracies
activations
addr
alloc
allocator
allocators
amdgpu
api
atmi
atomics
autogenerated
@@ -33,68 +382,40 @@ backend
backends
benchmarking
bilinear
BitCode
BLAS
Blit
bitsandbytes
blit
BMC
boson
bosons
buildable
bursty
bzip
cacheable
CCD
cd
CDNA
CentOS
centos
centric
changelog
chiplet
CIFAR
CLI
CLion
CMake
cmake
CMakeLists
CMakePackage
cmd
coalescable
codename
Codespaces
collater
comgr
Commitizen
CommonMark
completers
composable
concretization
Concretized
Conda
config
conformant
convolutional
convolves
CoRR
CP
CPC
CPF
CPP
CPU
CPUs
CSC
CSE
CSn
cpp
csn
CSV
CTests
CU
cuBLAS
CUDA
cuFFT
cuLIB
cuRAND
CUs
cuSOLVER
cuSPARSE
CXX
dataset
datasets
dataspace
@@ -103,150 +424,79 @@ datatypes
dbgapi
de
deallocation
denoise
denoised
denoises
denormalize
Dependabot
deserializers
detections
dev
DevCap
devicelibs
devsel
DGEMM
dimensionality
disambiguates
distro
DL
DMA
DNN
DNNL
Dockerfile
Doxygen
DPM
DRI
DW
DWORD
el
embeddings
enablement
endpgm
env
epilog
EPYC
ESXi
etcetera
ethernet
exascale
executables
ffmpeg
FFT
FFTs
FHS
filesystem
Filesystem
Flang
FMA
Fortran
fortran
FP
galb
gcc
GCD
GCDs
GCN
GDB
gdb
GDDR
GDR
GDS
GEMM
GEMMs
GenZ
gfortran
gfx
GIM
githooks
github
Gitpod
GL
GLXT
GMI
gnupg
GPG
GPR
GPU
GPUs
grayscale
GRBM
gzip
Haswell
HBM
HCA
heterogenous
hipamd
hipBLAS
hipblas
hipBLASLt
HIPCC
hipCUB
hipcub
HIPExtension
hipFFT
hipfft
hipfort
HIPIFY
hipify
hipLIB
hipRAND
hipSOLVER
hipsolver
hipSPARSE
hipsparse
hipSPARSELt
hipTensor
HPC
HPCG
HPE
HPL
HSA
hipamd
hipblas
hipcub
hipfft
hipfort
hipify
hipsolver
hipsparse
hpp
hsa
hsakmt
HWE
hyperparameter
ib_core
ICV
IDE
IDEs
ImageNet
IMDB
inband
incrementing
inferencing
InfiniBand
inflight
init
Inlines
initializer
inlining
installable
IntelliSense
interprocedural
Intersphinx
intra
invariants
invocating
Ioffe
IOMMU
IOP
IOPM
IOV
ipo
IRQ
ISA
ISV
ISVs
JSON
Jupyter
kdb
KFD
Khronos
KVM
LAPACK
LCLK
LDS
libfabric
libjpeg
libs
@@ -254,96 +504,37 @@ linearized
linter
linux
llvm
LLVM
localscratch
logits
lossy
LSAN
LTS
Makefile
Makefiles
macOS
matchers
Matplotlib
Mellanox's
MEM
MERCHANTABILITY
MFMA
microarchitecture
MIGraphX
migraphx
MIOpen
miopen
MIOpenGEMM
miopengemm
MIVisionX
mivisionx
mkdir
mlirmiopen
MMA
MMIO
MMIOH
MNIST
MPI
MSVC
mtypes
Multicore
Multithreaded
MVAPICH
mvffr
MyEnvironment
MyST
namespace
namespaces
Nano
Navi
NBIO
NBIOs
NIC
NICs
Noncoherently
NPS
NUMA
NumPy
numref
NVCC
NVPTX
OAM
OAMs
ocl
OCP
OEM
OFED
OMP
OMPT
OMPX
ONNX
OpenCL
opencl
opencv
OpenFabrics
OpenGL
OpenMP
openmp
openssl
OpenVX
optimizers
os
OSS
OSU
Pageable
pageable
parallelization
parameterization
passthrough
PCI
PCIe
PeerDirect
perfcounter
Perfetto
performant
perl
PIL
PILImage
PowerShell
PnP
pragma
pre
prebuilt
@@ -351,125 +542,69 @@ precompiled
prefetch
prefetchable
preprocess
preprocessed
preprocessing
preq
prequantized
prerequisites
PRNG
profiler
protobuf
PRs
pseudorandom
py
PyPi
PyTorch
Qcycles
quasirandom
queueing
Radeon
RadeonOpenCompute
RCCL
rccl
RDC
rdc
RDMA
RDNA
reStructuredText
reformats
RelWithDebInfo
repos
Req
representativeness
req
resampling
RST
reStructuredText
RHEL
Rickle
rescaling
reusability
roadmap
roc
ROC
RoCE
rocAL
rocALUTION
rocalution
rocBLAS
rocFFT
rocLIB
rocMLIR
rocPRIM
rocRAND
rocSOLVER
rocSPARSE
rocThrust
rocWMMA
rocalution
rocblas
rocclr
ROCdbgapi
rocFFT
rocfft
ROCgdb
ROCk
rocLIB
rocm
ROCm
ROCmCC
rocminfo
rocMLIR
ROCmSoftwarePlatform
ROCmValidationSuite
rocPRIM
rocprim
rocprof
ROCProfiler
rocprofiler
ROCr
rocr
rocRAND
rocrand
rocSOLVER
rocsolver
rocSPARSE
rocsparse
roct
rocThrust
rocthrust
ROCTracer
roctracer
rocWMMA
RST
runtime
runtimes
RW
Ryzen
SALU
SBIOS
SCA
scalability
SDK
SDMA
SDRAM
SENDMSG
sendmsg
SENDMSG
sendmsg
SerDes
serializers
SGPR
SGPRs
SHA
shader
Shlens
sigmoid
SIGQUIT
SIMD
SIMDs
SKU
SKUs
skylake
sL
SLES
scalability
scalable
sendmsg
serializers
shader
sharding
sigmoid
sm
SMEM
SMI
smi
SMT
softmax
Spack
spack
SPI
SQs
SRAM
SRAMECC
src
stochastically
strided
@@ -478,49 +613,23 @@ subexpression
subfolder
subfolders
supercomputing
Supermicro
SWE
Szegedy
tagram
TCA
TCC
TCI
TCIU
TCP
TCR
TensorBoard
TensorFlow
TFLOPS
tg
th
tmp
ToC
tokenization
tokenize
tokenized
tokenizer
tokenizes
toolchain
toolchains
toolset
toolsets
TorchAudio
TorchMIGraphX
TorchScript
TorchServe
TorchVision
torchvision
tqdm
tracebacks
TransferBench
TrapStatus
txt
UAC
uarch
ubuntu
UC
UCC
UCX
UIF
Uncached
uncached
uncorrectable
Unhandled
uninstallation
unsqueeze
unstacking
@@ -532,9 +641,8 @@ USM
UTCL
UTIL
utils
VALU
Vanhoucke
VBIOS
vL
variational
vdi
vectorizable
vectorization
@@ -542,44 +650,20 @@ vectorize
vectorized
vectorizer
vectorizes
VGPR
VGPRs
vjxb
vL
VM
VMEM
VMWare
VRAM
VSIX
VSkipped
Vulkan
walkthrough
walkthroughs
wavefront
wavefronts
WGP
whitespaces
Wojna
workgroup
Workgroups
workgroups
writeback
Writebacks
writebacks
wrreq
WX
wzo
Xeon
XGMI
Xnack
XT
Xteam
XTX
xargs
xz
YAML
yaml
YML
YModel
ysvmadyb
ZenDNN
zypper

View File

@@ -63,15 +63,14 @@ There are also a number of papers which talk about these new capabilities:
* `Atomic Read Modify Write Primitives by Intel <https://www.intel.es/content/dam/doc/white-paper/atomic-read-modify-write-primitives-i-o-devices-paper.pdf>`_
* `PCI express 3 Accelerator White paper by Intel <https://www.intel.sg/content/dam/doc/white-paper/pci-express3-accelerator-white-paper.pdf>`_
* `Intel PCIe Generation 3 Hotchips Paper <https://www.hotchips.org/wp-content/uploads/hc_archives/hc21/1_sun/HC21.23.1.SystemInterconnectTutorial-Epub/HC21.23.131.Ajanovic-Intel-PCIeGen3.pdf>`_
* `PCIe Generation 4 Base Specification includes atomic operations <https://astralvx.com/storage/2020/11/PCI_Express_Base_4.0_Rev0.3_February19-2014.pdf>`_
Other I/O devices with PCIe atomics support
* `Mellanox ConnectX-5 InfiniBand Card <http://www.mellanox.com/related-docs/prod_adapter_cards/PB_ConnectX-5_VPI_Card.pdf>`_
* `Cray Aries Interconnect <http://www.hoti.org/hoti20/slides/Bob_Alverson.pdf>`_
* `Xilinx PCIe Ultrascale White paper <https://docs.xilinx.com/v/u/8OZSA2V1b1LLU2rRCDVGQw>`_
* `Xilinx 7 Series Devices <https://docs.xilinx.com/v/u/1nfXeFNnGpA0ywyykvWHWQ>`_
Other I/O devices with PCIe atomics support:
* Mellanox ConnectX-5 InfiniBand Card
* Cray Aries Interconnect
* Xilinx 7 Series Devices
Future bus technology with richer I/O atomics operation Support
@@ -80,8 +79,8 @@ Future bus technology with richer I/O atomics operation Support
New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPUs
with PCIe Generation 3.0 support.
* `Mellanox Bluefield SOC <https://docs.nvidia.com/networking/display/BlueFieldSWv25111213/BlueField+Software+Overview>`_
* `Cavium Thunder X2 <https://en.wikichip.org/wiki/cavium/thunderx2>`_
* Mellanox Bluefield SOC
* Cavium Thunder X2
In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU
originates two writes to two different targets:

View File

@@ -22,6 +22,7 @@ Training occurs in multiple phases for every batch of training data. the followi
:::{table} Types of Training Phases
:name: training-phases
:widths: auto
| Types of Phases | |
| ----------------- | --- |
| Forward Pass | The input features are fed into the model, whose parameters may initially be randomly initialized. Activations (outputs) of each layer are retained during this pass to help in the loss gradient computation during the backward pass. |
@@ -35,6 +36,7 @@ Training is different from inference, particularly from the hardware perspective
:::{table} Training vs. Inference
:name: training-inference
:widths: auto
| Training | Inference |
| ----------- | ----------- |
| Training is measured in hours/days. | Inference is measured in minutes. |
@@ -876,7 +878,7 @@ To understand the code step by step, follow these steps:
thisplot[true_label].set_color('blue')
```
9. With the model trained, you can use it to make predictions about some images. Review the 0-th image predictions and the prediction array. Correct prediction labels are blue, and incorrect prediction labels are red. The number gives the percentage (out of 100) for the predicted label.
9. With the model trained, you can use it to make predictions about some images. Review the 0<sup>th</sup> image predictions and the prediction array. Correct prediction labels are blue, and incorrect prediction labels are red. The number gives the percentage (out of 100) for the predicted label.
```py
i = 0

View File

@@ -288,9 +288,9 @@ The vector L1 cache subsystem counters are further classified into Texture Addre
| `TCP_GATE_EN2[n]` | Cycles | Number of cycles vL1D core clocks are turned on. Value range for n: [0-15]. |
| `TCP_TD_TCP_STALL_CYCLES[n]` | Cycles | Number of cycles TD stalls vL1D. Value range for n: [0-15]. |
| `TCP_TCR_TCP_STALL_CYCLES[n]` | Cycles | Number of cycles TCR stalls vL1D. Value range for n: [0-15]. |
| `TCP_READ_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tagram conflict stalls on a read. Value range for n: [0-15]. |
| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tagram conflict stalls on a write. Value range for n: [0-15]. |
| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tagram conflict stalls on an atomic. Value range for n: [0-15]. |
| `TCP_READ_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tag RAM conflict stalls on a read. Value range for n: [0-15]. |
| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tag RAM conflict stalls on a write. Value range for n: [0-15]. |
| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES[n]` | Cycles | Number of cycles tag RAM conflict stalls on an atomic. Value range for n: [0-15]. |
| `TCP_PENDING_STALL_CYCLES[n]` | Cycles | Number of cycles vL1D cache is stalled due to data pending from L2 Cache. Value range for n: [0-15]. |
| `TCP_TCP_TA_DATA_STALL_CYCLES` | Cycles | Number of cycles TCP stalls TA data interface. |
| `TCP_TA_TCP_STATE_READ[n]` | Req | Number of state reads. Value range for n: [0-15]. |
@@ -454,7 +454,7 @@ L2 Cache is also known as Texture Cache per Channel (TCC).
| `TCC_NORMAL_WRITEBACK_sum` | Total number of writebacks due to requests that are not writeback requests, over all TCC instances. |
| `TCC_NORMAL_EVICT_sum` | Total number of evictions due to requests that are not invalidate or probe requests, over all TCC instances. |
| `TCC_PROBE_sum` | Total number of probe requests over all TCC instances. |
| `TCC_PROBE_ALL_sum` | Total number of external probe requests with EA_TCC_preq_all== 1, over all TCC instances. |
| `TCC_PROBE_ALL_sum` | Total number of external probe requests with `EA_TCC_preq_all == 1`, over all TCC instances. |
| `TCC_READ_sum` | Total number of L2 cache read requests (including compressed reads but not metadata reads) over all TCC instances. |
| `TCC_REQ_sum` | Total number of all types of L2 cache requests over all TCC instances. |
| `TCC_RW_REQ_sum` | Total number of RW requests over all TCC instances. |
@@ -465,11 +465,11 @@ L2 Cache is also known as Texture Cache per Channel (TCC).
| `TCC_WRITE_sum` | Total number of L2 cache write requests over all TCC instances. |
| `TCC_WRITEBACK_sum` | Total number of lines written back to the main memory including writebacks of dirty lines and uncached write/atomic requests, over all TCC instances. |
| `TCC_WRREQ_STALL_max` | Maximum number of cycles a write request is stalled, over all TCC instances. |
| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on an atomic, over all TCP instances. |
| `TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tag RAM conflict stalls on an atomic, over all TCP instances. |
| `TCP_GATE_EN1_sum` | Total number of cycles vL1D interface clocks are turned on, over all TCP instances. |
| `TCP_GATE_EN2_sum` | Total number of cycles vL1D core clocks are turned on, over all TCP instances. |
| `TCP_PENDING_STALL_CYCLES_sum` | Total number of cycles vL1D cache is stalled due to data pending from L2 Cache, over all TCP instances. |
| `TCP_READ_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on a read, over all TCP instances. |
| `TCP_READ_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tag RAM conflict stalls on a read, over all TCP instances. |
| `TCP_TA_TCP_STATE_READ_sum` | Total number of state reads by all TCP instances. |
| `TCP_TCC_ATOMIC_WITH_RET_REQ_sum` | Total number of atomic requests to L2 cache with return, over all TCP instances. |
| `TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum` | Total number of atomic requests to L2 cache without return, over all TCP instances. |
@@ -504,7 +504,7 @@ L2 Cache is also known as Texture Cache per Channel (TCC).
| `TCP_UTCL1_TRANSLATION_MISS_sum` | Total number of UTCL1 translation misses by all TCP instances. |
| `TCP_UTCL1_TRANSLATION_HIT_sum` | Total number of UTCL1 translation hits by all TCP instances. |
| `TCP_VOLATILE_sum` | Total number of L1 volatile pixels/buffers from TA, over all TCP instances. |
| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tagram conflict stalls on a write, over all TCP instances. |
| `TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum` | Total number of cycles tag RAM conflict stalls on a write, over all TCP instances. |
| `TD_ATOMIC_WAVEFRONT_sum` | Total number of atomic wavefront instructions, over all TD instances. |
| `TD_COALESCABLE_WAVEFRONT_sum` | Total number of coalescable wavefronts according to TA, over all TD instances. |
| `TD_LOAD_WAVEFRONT_sum` | Total number of wavefront instructions (read/write/atomic), over all TD instances. |

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 88 KiB

View File

@@ -13,7 +13,7 @@ the sequential flow for the use of each framework. Refer to the ROCm Compatible
Frameworks Release Notes for each framework's most current release notes at
{doc}`Third-party support<rocm-install-on-linux:reference/3rd-party-support-matrix>`.
![ROCm Compatible Frameworks Flowchart](../data/install/magma-install/magma005.png "ROCm Compatible Frameworks")
![ROCm Compatible Frameworks Flowchart](../data/how-to/magma005.png "ROCm Compatible Frameworks")
## Frameworks installation

View File

@@ -122,8 +122,7 @@ sudo reboot
```
Install the GPU-IOV Module (GIM, where IOV is I/O Virtualization) driver and
follow the steps below. To obtain the GIM driver, write to us
[here](mailto:CloudGPUsupport@amd.com):
follow the steps below.
```shell
sudo dpkg -i <gim_driver>
@@ -167,6 +166,4 @@ First, assign GPU virtual function (VF) to VM using the following steps.
Then start the VM.
Finally install ROCm on the virtual machine (VM). For detailed instructions,
refer to the {doc}`Linux install guide<rocm-install-on-linux:how-to/native-install/index>`. For any
issue encountered during installation, write to us
[here](mailto:CloudGPUsupport@amd.com).
refer to the {doc}`Linux install guide<rocm-install-on-linux:how-to/native-install/index>`.

View File

@@ -37,55 +37,58 @@ ROCm consists of the following drivers, development tools, and APIs.
| Project | Description |
| :---------------- | :------------ |
| [AMD Compute Language Runtimes (CLR)](https://github.com/ROCm-Developer-Tools/clr) | Contains source code for AMD's compute languages runtimes: {doc}`HIP <hip:index>` and OpenCL |
| {doc}`AMD SMI <amdsmi:index>` | A C library for Linux that provides a user space interface for applications to monitor and control AMD devices |
| [AOMP](https://github.com/ROCm-Developer-Tools/aomp/) | A scripted build of [LLVM](https://github.com/RadeonOpenCompute/llvm-project) and supporting software |
| [Asynchronous Task and Memory Interface (ATMI)](https://github.com/RadeonOpenCompute/atmi/) | A runtime framework for efficient task management in heterogeneous CPU-GPU systems |
| [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) | A library that aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures |
| {doc}`Composable Kernel <composable_kernel:index>` | A library that aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures |
| [Flang](https://github.com/ROCm-Developer-Tools/flang/) | An out-of-tree Fortran compiler targeting LLVM |
| [Half-precision floating point library (half)](https://github.com/ROCmSoftwarePlatform/half/) | A C++ header-only library that provides an IEEE 754 conformant, 16-bit half-precision floating-point type along with corresponding arithmetic operators, type conversions, and common mathematical functions |
| {doc}`HIP <hip:index>` | AMD's GPU programming language extension and the GPU runtime |
| [hipBLAS](https://rocm.docs.amd.com/projects/hipBLAS/en/latest/) | A BLAS-marshaling library that supports [rocBLAS](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/) and cuBLAS backends |
| [HIPCC](https://rocm.docs.amd.com/projects/HIPCC/en/latest/) | A compiler driver utility that calls Clang or NVCC and passes the appropriate include and library options for the target compiler and HIP infrastructure |
| [hipCUB](https://rocm.docs.amd.com/projects/hipCUB/en/latest/) | A thin header-only wrapper library on top of [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) or CUB that allows project porting using the CUB library to the HIP layer |
| [hipFFT](https://rocm.docs.amd.com/projects/hipFFT/en/latest/) | A fast Fourier transforms (FFT)-marshalling library that supports rocFFT or cuFFT backends |
| [hipfort](https://rocm.docs.amd.com/projects/hipfort/en/latest/) | A Fortran interface library for accessing GPU Kernels |
| {doc}`hipBLAS <hipblas:index>` | A BLAS-marshaling library that supports [rocBLAS](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/) and cuBLAS backends |
| {doc}`hipBLASLt <hipblaslt:index>` | A library that provides general matrix-matrix operations with a flexible API and extends functionalities beyond traditional BLAS library |
| [hipCC](https://github.com/ROCm/HIPCC) | A compiler driver utility that calls Clang or NVCC and passes the appropriate include and library options for the target compiler and HIP infrastructure |
| {doc}`hipCUB <hipcub:index>` | A thin header-only wrapper library on top of [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) or CUB that allows project porting using the CUB library to the HIP layer |
| {doc}`hipFFT <hipfft:index>` | A fast Fourier transforms (FFT)-marshalling library that supports rocFFT or cuFFT backends |
| {doc}`hipfort <hipfort:index>` | A Fortran interface library for accessing GPU Kernels |
| {doc}`HIPIFY <hipify:index>` | A set of tools for translating CUDA source code into portable HIP C++ |
| [hipify-clang](https://rocm.docs.amd.com/projects/HIPIFY/en/latest/hipify-clang.html) | A Clang-based tool for translating CUDA sources into HIP sources |
| [hipify-perl](https://rocm.docs.amd.com/projects/HIPIFY/en/latest/hipify-perl.html) | An autogenerated, perl-based script that translates CUDA source code into portable HIP C++ |
| [hipSOLVER](https://rocm.docs.amd.com/projects/hipSOLVER/en/latest/) | An LAPACK-marshalling library that supports [rocSOLVER](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/) and cuSOLVER backends |
| [hipSPARSE](https://rocm.docs.amd.com/projects/hipSPARSE/en/latest/) | A SPARSE-marshalling library that supports [rocSPARSE](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/) and cuSPARSE backends |
| [hipTensor](https://rocm.docs.amd.com/projects/hipTensor/en/latest/index.html) | AMD's C++ library for accelerating tensor primitives based on the composable kernel library |
| {doc}`hipRAND <hiprand:index>` | A wrapper library to easily port CUDA applications that use the cuRAND library into the HIP layer |
| {doc}`hipSOLVER <hipsolver:index>` | An LAPACK-marshalling library that supports [rocSOLVER](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/) and cuSOLVER backends |
| {doc}`hipSPARSE <hipsparse:index>` | A SPARSE-marshalling library that supports [rocSPARSE](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/) and cuSPARSE backends |
| {doc}`hipSPARSELt <hipsparselt:index>` | A SPARSE-marshalling library with multiple supported backends |
| {doc}`hipTensor <hiptensor:index>` | AMD's C++ library for accelerating tensor primitives based on the composable kernel library |
| [LLVM](https://github.com/RadeonOpenCompute/llvm-project) | A toolkit for the construction of highly optimized compilers, optimizers, and run-time environments |
| [MIGraphX](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/) | A graph inference engine that accelerates machine learning model inference |
| [MIOpen](https://rocm.docs.amd.com/projects/MIOpen/en/latest/) | An open source deep-learning library |
| {doc}`MIGraphX <amdmigraphx:index>` | A graph inference engine that accelerates machine learning model inference |
| {doc}`MIOpen <miopen:index>` | An open source deep-learning library |
| [MIOpenGEMM](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM) | An OpenCL general matrix multiplication (GEMM) API and kernel generator |
| [MIOpenTensile](https://github.com/ROCmSoftwarePlatform/MIOpenTensile) | Provides host-callable interfaces to Tensile library |
| [MIVisionX](https://rocm.docs.amd.com/projects/MIVisionX/en/latest/doxygen/html/index.html) | A set of comprehensive computer vision and machine learning libraries, utilities, and applications |
| {doc}`MIVisionX <mivisionx:doxygen/html/index>` | A set of comprehensive computer vision and machine learning libraries, utilities, and applications |
| [Radeon Compute Profiler (RCP)](https://github.com/GPUOpen-Tools/radeon_compute_profiler/) | A performance analysis tool that gathers data from the API run-time and GPU for OpenCL and ROCm/HSA applications |
| [RCCL](https://rocm.docs.amd.com/projects/rccl/en/latest/) | A standalone library that provides multi-GPU and multi-node collective communication primitives |
| [rocAL](https://rocm.docs.amd.com/projects/rocAL/en/latest/doxygen/html/index.html) | An augmentation library designed to decode and process images and videos |
| [rocALUTION](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/) | A sparse linear algebra library for exploring fine-grained parallelism on ROCm runtime and toolchains |
| {doc}`RCCL <rccl:index>` | A standalone library that provides multi-GPU and multi-node collective communication primitives |
| {doc}`rocAL <rocal:index>` | An augmentation library designed to decode and process images and videos |
| {doc}`rocALUTION <rocalution:index>` | A sparse linear algebra library for exploring fine-grained parallelism on ROCm runtime and toolchains |
| [RocBandwidthTest](https://github.com/RadeonOpenCompute/rocm_bandwidth_test/) | Captures the performance characteristics of buffer copying and kernel read/write operations |
| [rocBLAS](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/)| A BLAS implementation (in the HIP programming language) on the ROCm runtime and toolchains |
| [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/) | A software library for computing fast Fourier transforms (FFTs) written in HIP |
| {doc}`rocBLAS <rocblas:index>` | A BLAS implementation (in the HIP programming language) on the ROCm runtime and toolchains |
| {doc}`rocDecode <rocdecode:index>` | A high performance video decode SDK for AMD GPUs |
| {doc}`rocFFT <rocfft:index>` | A software library for computing fast Fourier transforms (FFTs) written in HIP |
| [ROCK-Kernel-Driver](https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/) | An AMDGPU Driver with KFD that is used by ROCm |
| [ROCmCC](https://rocm.docs.amd.com/en/latest/reference/rocmcc/rocmcc.html) | A Clang/LLVM-based compiler |
| [ROCmCC](./reference/rocmcc.md) | A Clang/LLVM-based compiler |
| [ROCm cmake](https://github.com/RadeonOpenCompute/rocm-cmake) | A collection of CMake modules for common build and development tasks |
| [ROCm Data Center Tool](https://rocm.docs.amd.com/projects/rdc/en/latest/) | Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments |
| {doc}`ROCm Data Center Tool <rdc:index>` | Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments |
| [ROCm Debug Agent Library (ROCdebug-agent)](https://github.com/ROCm-Developer-Tools/rocr_debug_agent/) | A library that can print the state of all AMD GPU wavefronts that caused a queue error by sending a SIGQUIT signal to the process while the program is running |
| [ROCm Debugger (ROCgdb)](https://rocm.docs.amd.com/projects/ROCgdb/en/latest/) | A source-level debugger for Linux, based on the GNU Debugger (GDB) |
| [ROCdbgapi](https://rocm.docs.amd.com/projects/ROCdbgapi/en/latest/) | The ROCm debugger API library |
| {doc}`ROCm debugger (ROCgdb) <rocgdb:index>` | A source-level debugger for Linux, based on the GNU Debugger (GDB) |
| {doc}`ROCdbgapi <rocdbgapi:index>` | The ROCm debugger API library |
| [rocminfo](https://github.com/RadeonOpenCompute/rocminfo/) | Reports system information |
| [ROCm SMI](https://github.com/RadeonOpenCompute/rocm_smi_lib/) | A C library for Linux that provides a user space interface for applications to monitor and control GPU applications |
| [ROCm Validation Suite](https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/) | A tool for detecting and troubleshooting common problems affecting AMD GPUs running in a high-performance computing environment |
| [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) | A header-only library for HIP parallel primitives |
| [ROCProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/profiler_home_page.html) | A profiling tool for HIP applications |
| [rocRAND](https://rocm.docs.amd.com/projects/rocRAND/en/latest/) | Provides functions that generate pseudorandom and quasirandom numbers |
| {doc}`ROCm Performance Primitives Library <rpp:index>` | A comprehensive high-performance computer vision library for AMD processors with HIP/OpenCL/CPU back-ends |
| {doc}`ROCm SMI <rocm_smi_lib:index>` | A C library for Linux that provides a user space interface for applications to monitor and control GPU applications |
| {doc}`ROCm Validation Suite <rocmvalidationsuite:index>` | A tool for detecting and troubleshooting common problems affecting AMD GPUs running in a high-performance computing environment |
| {doc}`rocPRIM <rocprim:index>` | A header-only library for HIP parallel primitives |
| {doc}`ROCProfiler <rocprofiler:profiler_home_page>` | A profiling tool for HIP applications |
| {doc}`rocRAND <rocrand:index>` | Provides functions that generate pseudorandom and quasirandom numbers |
| [ROCR-Runtime](https://github.com/RadeonOpenCompute/ROCR-Runtime/) | User-mode API interfaces and libraries necessary for host applications to launch compute kernels on available HSA ROCm kernel agents |
| [rocSOLVER](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/) | An implementation of LAPACK routines on ROCm software, implemented in the HIP programming language and optimized for AMDs latest discrete GPUs |
| [rocSPARSE](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/) | Exposes a common interface that provides BLAS for sparse computation implemented on ROCm runtime and toolchains (in the HIP programming language) |
| [rocThrust](https://rocm.docs.amd.com/projects/rocThrust/en/latest/) | A parallel algorithm library |
| {doc}`rocSOLVER <rocsolver:index>` | An implementation of LAPACK routines on ROCm software, implemented in the HIP programming language and optimized for AMD's latest discrete GPUs |
| {doc}`rocSPARSE <rocsparse:index>` | Exposes a common interface that provides BLAS for sparse computation implemented on ROCm runtime and toolchains (in the HIP programming language) |
| {doc}`rocThrust <rocthrust:index>` | A parallel algorithm library |
| [ROCT-Thunk-Interface](https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/) | User-mode API interfaces used to interact with the ROCk driver |
| [ROCTracer](https://rocm.docs.amd.com/projects/roctracer/en/latest/) | Intercepts runtime API calls and traces asynchronous activity |
| [rocWMMA](https://rocm.docs.amd.com/projects/rocWMMA/en/latest/index.html) | A C++ library for accelerating mixed-precision matrix multiply-accumulate (MMA) operations |
| {doc}`ROCTracer <roctracer:index>` | Intercepts runtime API calls and traces asynchronous activity |
| {doc}`rocWMMA <rocwmma:index>` | A C++ library for accelerating mixed-precision matrix multiply-accumulate (MMA) operations |
| [Tensile](https://github.com/ROCmSoftwarePlatform/Tensile) | A tool for creating benchmark-driven backend libraries for GEMMs, GEMM-like problems, and general N-dimensional tensor contractions |
| [TransferBench](https://rocm.docs.amd.com/projects/TransferBench/en/latest/) | A utility to benchmark simultaneous transfers between user-specified devices (CPUs/GPUs) |
| {doc}`TransferBench <transferbench:index>` | A utility to benchmark simultaneous transfers between user-specified devices (CPUs/GPUs) |

View File

@@ -34,7 +34,7 @@ def mivisionx_processor(data: ReleaseLib, template: str, _) -> bool:
</p>
## Online Documentation
[MIVisionX Documentation](https://gpuopen-professionalcompute-libraries.github.io/MIVisionX)
[MIVisionX Documentation](https://rocm.docs.amd.com/projects/MIVisionX/en/latest/doxygen/html/index.html)
## MIVisionX {match['lib_version']}
{match["body"]}
{dependency_map}