mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-09 22:58:17 -05:00
Compare commits
82 Commits
docs/6.0.2
...
cu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a0ffa933c | ||
|
|
85a6a18930 | ||
|
|
2d002ff907 | ||
|
|
13a91044f6 | ||
|
|
84c12ac1ce | ||
|
|
42849e92a6 | ||
|
|
a5c7a9d01f | ||
|
|
dc3124c1dd | ||
|
|
7f9e31d6d9 | ||
|
|
8f3a3b88aa | ||
|
|
cf76b40b79 | ||
|
|
41dd38168a | ||
|
|
24168eb2c7 | ||
|
|
628cd37aa4 | ||
|
|
76185653cd | ||
|
|
80bb3d6c6b | ||
|
|
35937f7682 | ||
|
|
26afbaa469 | ||
|
|
e73293381b | ||
|
|
90753fa29f | ||
|
|
f7f09f0013 | ||
|
|
2d4a3037ef | ||
|
|
85bc06f697 | ||
|
|
0791f2cbec | ||
|
|
6865f279b4 | ||
|
|
aa47a075b8 | ||
|
|
f7a1915e45 | ||
|
|
2dd253a54c | ||
|
|
3ac6f3b2cc | ||
|
|
5aa5106b99 | ||
|
|
b3867a44bc | ||
|
|
ccce331ad4 | ||
|
|
209038da06 | ||
|
|
e9314a418c | ||
|
|
ffc6c3349f | ||
|
|
e977b783da | ||
|
|
8873de5363 | ||
|
|
e4055682fe | ||
|
|
e84f95f96c | ||
|
|
2db2dac10d | ||
|
|
24cbb957d3 | ||
|
|
9dbb5d578a | ||
|
|
5fde4c2ff7 | ||
|
|
e915b6a741 | ||
|
|
9241c40166 | ||
|
|
10d29ca45b | ||
|
|
678ccdddb9 | ||
|
|
af1623a146 | ||
|
|
0c14b861d2 | ||
|
|
de6b23da83 | ||
|
|
04a314180f | ||
|
|
46e34bef8d | ||
|
|
6d7daee9af | ||
|
|
2ea7ac694e | ||
|
|
3ffd2f78e9 | ||
|
|
4b1574cbe2 | ||
|
|
df6dcac677 | ||
|
|
a29b54a453 | ||
|
|
5ea5d1d3f1 | ||
|
|
da18980f63 | ||
|
|
a6cffe5963 | ||
|
|
18c4cb3ab5 | ||
|
|
01c91ac2ff | ||
|
|
13ad427c8e | ||
|
|
00907151a2 | ||
|
|
75da6927fc | ||
|
|
5bb25f62ed | ||
|
|
645e7a26aa | ||
|
|
2cc67e9a4c | ||
|
|
6ee6dd32f5 | ||
|
|
40c69baf30 | ||
|
|
f298d60976 | ||
|
|
1425bd269c | ||
|
|
22121a9511 | ||
|
|
f39af205f0 | ||
|
|
e7865ebe89 | ||
|
|
cac5df504c | ||
|
|
e6b4715b4f | ||
|
|
a9e4678d8b | ||
|
|
75baa9fd18 | ||
|
|
c84e22937f | ||
|
|
47192a92ba |
2
.github/CODEOWNERS
vendored
2
.github/CODEOWNERS
vendored
@@ -1,4 +1,4 @@
|
||||
* @saadrahim @Rmalavally @amd-aakash @zhang2amd @jlgreathouse @samjwu @MathiasMagnus @LisaDelaney
|
||||
* @amd-aakash @jlgreathouse @samjwu @ROCm/rocm-documentation
|
||||
# Documentation files
|
||||
docs/* @ROCm/rocm-documentation
|
||||
*.md @ROCm/rocm-documentation
|
||||
|
||||
@@ -13,6 +13,6 @@ python:
|
||||
- requirements: docs/sphinx/requirements.txt
|
||||
|
||||
build:
|
||||
os: ubuntu-20.04
|
||||
os: ubuntu-22.04
|
||||
tools:
|
||||
python: "3.8"
|
||||
python: "3.10"
|
||||
|
||||
@@ -356,6 +356,7 @@ VSkipped
|
||||
Vanhoucke
|
||||
Vulkan
|
||||
WGP
|
||||
WGPs
|
||||
WX
|
||||
WikiText
|
||||
Wojna
|
||||
|
||||
1840
CHANGELOG.md
1840
CHANGELOG.md
File diff suppressed because it is too large
Load Diff
48
README.md
48
README.md
@@ -19,6 +19,49 @@ ROCm supports programming models, such as OpenMP and OpenCL, and includes all ne
|
||||
source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
|
||||
(ML) frameworks, such as PyTorch and TensorFlow.
|
||||
|
||||
## Getting the ROCm Source Code
|
||||
|
||||
AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.
|
||||
|
||||
### Installing the repo tool
|
||||
|
||||
The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/bin/
|
||||
curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
|
||||
chmod a+x ~/bin/repo
|
||||
```
|
||||
|
||||
**Note:** The ```~/bin/``` folder is used as an example. You can specify a different folder to install the repo tool into if you desire.
|
||||
|
||||
### Installing git-lfs
|
||||
|
||||
Some ROCm projects use the Git Large File Storage (LFS) format that may require you to install git-lfs. Refer to [Git Large File Storage](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md) for more information. For example, to install git-lfs for Ubuntu, use the following command:
|
||||
|
||||
```bash
|
||||
sudo apt-get install git-lfs
|
||||
```
|
||||
|
||||
### Downloading the ROCm source code
|
||||
|
||||
The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:
|
||||
|
||||
```bash
|
||||
mkdir -p ~/ROCm/
|
||||
cd ~/ROCm/
|
||||
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.0.x
|
||||
~/bin/repo sync
|
||||
```
|
||||
|
||||
**Note:** Using this sample code will cause the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have ssh-keys configured on your machine for your GitHub ID prior to the download as explained at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).
|
||||
|
||||
### Building the ROCm source code
|
||||
|
||||
Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.
|
||||
|
||||
Each release of the ROCm software supports specific hardware and software configurations. Refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for the current supported hardware and OS.
|
||||
|
||||
## ROCm documentation
|
||||
|
||||
This repository contains the [manifest file](https://gerrit.googlesource.com/git-repo/+/HEAD/docs/manifest-format.md)
|
||||
@@ -32,16 +75,14 @@ Source code for our documentation is located in the `/docs` folder of most ROCm
|
||||
|
||||
The ROCm documentation homepage is [rocm.docs.amd.com](https://rocm.docs.amd.com).
|
||||
|
||||
### Building our documentation
|
||||
### Building the documentation
|
||||
|
||||
For a quick-start build, use the following code. For more options and detail, refer to
|
||||
[Building documentation](./docs/contribute/building.md).
|
||||
|
||||
```bash
|
||||
cd docs
|
||||
|
||||
pip3 install -r sphinx/requirements.txt
|
||||
|
||||
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
|
||||
```
|
||||
|
||||
@@ -49,7 +90,6 @@ Alternatively, CMake build is supported.
|
||||
|
||||
```bash
|
||||
cmake -B build
|
||||
|
||||
cmake --build build --target=doc
|
||||
```
|
||||
|
||||
|
||||
270
RELEASE.md
270
RELEASE.md
@@ -1,4 +1,4 @@
|
||||
# Release notes
|
||||
# ROCm 6.1 release highlights
|
||||
<!-- Disable lints since this is an auto-generated file. -->
|
||||
<!-- markdownlint-disable blanks-around-headers -->
|
||||
<!-- markdownlint-disable no-duplicate-header -->
|
||||
@@ -8,47 +8,245 @@
|
||||
|
||||
<!-- spellcheck-disable -->
|
||||
|
||||
This page contains the release notes for AMD ROCm Software.
|
||||
The ROCm™ 6.1 release consists of new features and fixes to improve the stability and
|
||||
performance of AMD Instinct™ MI300 GPU applications. Notably, we've added:
|
||||
|
||||
-------------------
|
||||
* Full support for Ubuntu 22.04.4.
|
||||
|
||||
## ROCm 6.0.2
|
||||
* **rocDecode**, a new ROCm component that provides high-performance video decode support for
|
||||
AMD GPUs. With rocDecode, you can decode compressed video streams while keeping the resulting
|
||||
YUV frames in video memory. With decoded frames in video memory, you can run video
|
||||
post-processing using ROCm HIP, avoiding unnecessary data copies via the PCIe bus.
|
||||
|
||||
The ROCm 6.0.2 point release consists of minor bug fixes to improve the stability of MI300 GPU applications. This release introduces several new driver features for system qualification on our partner server offerings.
|
||||
To learn more, refer to the rocDecode
|
||||
[documentation](https://rocm.docs.amd.com/projects/rocDecode/en/latest/).
|
||||
|
||||
### Library changes in ROCm 6.0.2
|
||||
## OS and GPU support changes
|
||||
|
||||
| Library | Version |
|
||||
|---------|---------|
|
||||
| AMDMIGraphX | ⇒ [2.8](https://github.com/ROCm/AMDMIGraphX/releases/tag/rocm-6.0.2) |
|
||||
| hipBLAS | ⇒ [2.0.0](https://github.com/ROCm/hipBLAS/releases/tag/rocm-6.0.2) |
|
||||
| hipBLASLt | ⇒ [0.6.0](https://github.com/ROCm/hipBLASLt/releases/tag/rocm-6.0.2) |
|
||||
| hipCUB | ⇒ [3.0.0](https://github.com/ROCm/hipCUB/releases/tag/rocm-6.0.2) |
|
||||
| hipFFT | ⇒ [1.0.13](https://github.com/ROCm/hipFFT/releases/tag/rocm-6.0.2) |
|
||||
| hipRAND | ⇒ [2.10.17](https://github.com/ROCm/hipRAND/releases/tag/rocm-6.0.2) |
|
||||
| hipSOLVER | ⇒ [2.0.0](https://github.com/ROCm/hipSOLVER/releases/tag/rocm-6.0.2) |
|
||||
| hipSPARSE | ⇒ [3.0.0](https://github.com/ROCm/hipSPARSE/releases/tag/rocm-6.0.2) |
|
||||
| hipSPARSELt | ⇒ [0.1.0](https://github.com/ROCm/hipSPARSELt/releases/tag/rocm-6.0.2) |
|
||||
| hipTensor | ⇒ [1.1.0](https://github.com/ROCm/hipTensor/releases/tag/rocm-6.0.2) |
|
||||
| MIOpen | ⇒ [2.19.0](https://github.com/ROCm/MIOpen/releases/tag/rocm-6.0.2) |
|
||||
| rccl | ⇒ [2.15.5](https://github.com/ROCm/rccl/releases/tag/rocm-6.0.2) |
|
||||
| rocALUTION | ⇒ [3.0.3](https://github.com/ROCm/rocALUTION/releases/tag/rocm-6.0.2) |
|
||||
| rocBLAS | ⇒ [4.0.0](https://github.com/ROCm/rocBLAS/releases/tag/rocm-6.0.2) |
|
||||
| rocFFT | ⇒ [1.0.25](https://github.com/ROCm/rocFFT/releases/tag/rocm-6.0.2) |
|
||||
| rocm-cmake | ⇒ [0.11.0](https://github.com/ROCm/rocm-cmake/releases/tag/rocm-6.0.2) |
|
||||
| rocPRIM | ⇒ [3.0.0](https://github.com/ROCm/rocPRIM/releases/tag/rocm-6.0.2) |
|
||||
| rocRAND | ⇒ [3.0.0](https://github.com/ROCm/rocRAND/releases/tag/rocm-6.0.2) |
|
||||
| rocSOLVER | ⇒ [3.24.0](https://github.com/ROCm/rocSOLVER/releases/tag/rocm-6.0.2) |
|
||||
| rocSPARSE | ⇒ [3.0.2](https://github.com/ROCm/rocSPARSE/releases/tag/rocm-6.0.2) |
|
||||
| rocThrust | ⇒ [3.0.0](https://github.com/ROCm/rocThrust/releases/tag/rocm-6.0.2) |
|
||||
| rocWMMA | ⇒ [1.3.0](https://github.com/ROCm/rocWMMA/releases/tag/rocm-6.0.2) |
|
||||
| Tensile | ⇒ [4.39.0](https://github.com/ROCm/Tensile/releases/tag/rocm-6.0.2) |
|
||||
ROCm 6.1 adds the following operating system support:
|
||||
|
||||
#### hipFFT 1.0.13
|
||||
* MI300A: Ubuntu 22.04.4 and RHEL 9.3
|
||||
* MI300X: Ubuntu 22.04.4
|
||||
|
||||
hipFFT 1.0.13 for ROCm 6.0.2
|
||||
Future releases will add additional operating systems to match the general offering. For older
|
||||
generations of supported AMD Instinct products, we’ve added Ubuntu 22.04.4 support.
|
||||
|
||||
##### Changes
|
||||
```{tip}
|
||||
To view the complete list of supported GPUs and operating systems, refer to the system requirements
|
||||
page for
|
||||
[Linux](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html)
|
||||
and
|
||||
[Windows](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html).
|
||||
```
|
||||
|
||||
* Removed the Git submodule for shared files between rocFFT and hipFFT; instead, just copy the files
|
||||
over (this should help simplify downstream builds and packaging)
|
||||
## Installation packages
|
||||
|
||||
This release includes a new set of packages for every module (all libraries and binaries default to
|
||||
`DT_RPATH`). Package names have the suffix `rpath`; for example, the `rpath` variant of `rocminfo` is
|
||||
`rocminfo-rpath`.
|
||||
|
||||
```{warning}
|
||||
The new `rpath` packages will conflict with the default packages; they are meant to be used only in
|
||||
environments where legacy `DT_RPATH` is the preferred form of linking (instead of `DT_RUNPATH`). We
|
||||
do **not** recommend installing both sets of packages.
|
||||
```
|
||||
|
||||
## ROCm components
|
||||
|
||||
The following sections highlight select component-specific changes. For additional details, refer to the
|
||||
[Changelog](https://rocm.docs.amd.com/en/develop/about/CHANGELOG.html).
|
||||
|
||||
### AMD System Management Interface (SMI) Tool
|
||||
|
||||
* **New monitor command for GPU metrics**.
|
||||
Use the monitor command to customize, capture, collect, and observe GPU metrics on
|
||||
target devices.
|
||||
|
||||
* **Integration with E-SMI**.
|
||||
The EPYC™ System Management Interface In-band Library is a Linux C-library that provides in-band
|
||||
user space software APIs to monitor and control your CPU’s power, energy, performance, and other
|
||||
system management functionality. This integration enables access to CPU metrics and telemetry
|
||||
through the AMD SMI API and CLI tools.
|
||||
|
||||
### Composable Kernel (CK)
|
||||
|
||||
* **New architecture support**.
|
||||
CK now supports the following architectures to enable efficient image denoising on the following
|
||||
AMD GPUs: gfx1030, gfx1100, gfx1031, gfx1101, gfx1032, gfx1102, gfx1034, gfx1103, gfx1035,
|
||||
gfx1036
|
||||
|
||||
* **FP8 rounding logic is replaced with stochastic rounding**.
|
||||
Stochastic rounding mimics a more realistic data behavior and improves model convergence.
|
||||
|
||||
### HIP
|
||||
|
||||
* **New environment variable to enable kernel run serialization**.
|
||||
The default `HIP_LAUNCH_BLOCKING` value is `0` (disable), which causes kernels to run as defined in
|
||||
the queue. When set to `1` (enable), the HIP runtime serializes the kernel queue, which behaves the
|
||||
same as `AMD_SERIALIZE_KERNEL`.
|
||||
|
||||
### hipBLASLt
|
||||
|
||||
* **New GemmTuning extension parameter**. GemmTuning allows you to set a split-k value for each solution, which is more feasible for
|
||||
performance tuning.
|
||||
|
||||
### hipFFT
|
||||
|
||||
* **New multi-GPU support for single-process transforms**. Multiple GPUs can be used to perform a transform in a single process. Note that this initial
|
||||
implementation is a functional preview.
|
||||
|
||||
### HIPIFY
|
||||
|
||||
* **Skipped code blocks**: Code blocks that are skipped by the preprocessor are no longer hipified under the
|
||||
`--default-preprocessor` option. To hipify everything, despite conditional preprocessor directives
|
||||
(`#if`, `#ifdef`, `#ifndef`, `#elif`, or `#else`), don't use the `--default-preprocessor` or `--amap` options.
|
||||
|
||||
### hipSPARSELt
|
||||
|
||||
* **Structured sparsity matrix support extensions**
|
||||
Structured sparsity matrices help speed up deep-learning workloads. We now support `B` as the
|
||||
sparse matrix and `A` as the dense matrix in Sparse Matrix-Matrix Multiplication (SPMM). Prior to this
|
||||
release, we only supported sparse (matrix A) x dense (matrix B) matrix multiplication. Structured
|
||||
sparsity matrices help speed up deep learning workloads.
|
||||
|
||||
### hipTensor
|
||||
|
||||
* **4D tensor permutation and contraction support**.
|
||||
You can now perform tensor permutation on 4D tensors and 4D contractions for F16, BF16, and
|
||||
Complex F32/F64 datatypes.
|
||||
|
||||
### MIGraphX
|
||||
|
||||
* **Improved performance for transformer-based models**.
|
||||
We added support for FlashAttention, which benefits models like BERT, GPT, and Stable Diffusion.
|
||||
|
||||
* **New Torch-MIGraphX driver**.
|
||||
This driver calls MIGraphX directly from PyTorch. It provides an `mgx_module` object that you can
|
||||
invoke like any other Torch module, but which utilizes the MIGraphX inference engine internally.
|
||||
Torch-MIGraphX supports FP32, FP16, and INT8 datatypes.
|
||||
|
||||
* **FP8 support**. We now offer functional support for inference in the FP8E4M3FNUZ datatype. You
|
||||
can load an ONNX model in FP8E4M3FNUZ using C++ or Python APIs, or `migraphx-driver`.
|
||||
You can quantize a floating point model to FP8 format by using the `--fp8` flag with `migraphx-driver`.
|
||||
To accelerate inference, MIGraphX uses hardware acceleration on MI300 for FP8 by leveraging FP8
|
||||
support in various backend kernel libraries.
|
||||
|
||||
### MIOpen
|
||||
|
||||
* **Improved performance for inference and convolutions**.
|
||||
Inference support is now provided for Find 2.0 fusion plans. Additionally, we've enhanced the Number of
|
||||
samples, Height, Width, and Channels (NHWC) convolution kernels for heuristics. NHWC stores data
|
||||
in a format where the height and width dimensions come first, followed by channels.
|
||||
|
||||
### OpenMP
|
||||
|
||||
* **Implicit Zero-copy is triggered automatically in XNACK-enabled MI300A systems**.
|
||||
Implicit Zero-copy behavior in `non unified_shared_memory` programs is triggered automatically in
|
||||
XNACK-enabled MI300A systems (for example, when using the `HSA_XNACK=1` environment
|
||||
variable). OpenMP supports the 'requires `unified_shared_memory`' directive to support programs
|
||||
that don’t want to copy data explicitly between the CPU and GPU. However, this requires that you add
|
||||
these directives to every translation unit of the program.
|
||||
|
||||
* **New MI300 FP atomics**. Application performance can now improve by leveraging fast floating-point atomics on MI300 (gfx942).
|
||||
|
||||
|
||||
### RCCL
|
||||
|
||||
* **NCCL 2.18.6 compatibility**.
|
||||
RCCL is now compatible with NCCL 2.18.6, which includes increasing the maximum IB network interfaces to 32 and fixing network device ordering when creating communicators with only one GPU
|
||||
per node.
|
||||
|
||||
* **Doubled simultaneous communication channels**.
|
||||
We improved MI300X performance by increasing the maximum number of simultaneous
|
||||
communication channels from 32 to 64.
|
||||
|
||||
### rocALUTION
|
||||
|
||||
* **New multiple node and GPU support**.
|
||||
Unsmoothed and smoothed aggregations and Ruge-Stueben AMG now work with multiple nodes
|
||||
and GPUs. For more information, refer to the
|
||||
[API documentation](https://rocm.docs.amd.com/projects/rocALUTION/en/latest/usermanual/solvers.html#unsmoothed-aggregation-amg).
|
||||
|
||||
### rocDecode
|
||||
|
||||
* **New ROCm component**.
|
||||
rocDecode is ROCm's newest component, providing high-performance video decode support for AMD
|
||||
GPUs. To learn more, refer to the
|
||||
[documentation](https://rocm.docs.amd.com/projects/rocDecode/en/latest/).
|
||||
|
||||
### ROCm Compiler
|
||||
|
||||
* **Combined projects**. ROCm Device-Libs, ROCm Compiler Support, and hipCC are now located in
|
||||
the `llvm-project/amd` subdirectory of AMD's fork of the LLVM project. Previously, these projects
|
||||
were maintained in separate repositories. Note that the projects themselves will continue to be
|
||||
packaged separately.
|
||||
|
||||
* **Split the 'rocm-llvm' package**. This package has been split into a required and an optional package:
|
||||
|
||||
* **rocm-llvm (required)**: A package containing the essential binaries needed for compilation.
|
||||
|
||||
* **rocm-llvm-dev (optional)**: A package containing binaries for compiler and application developers.
|
||||
|
||||
|
||||
### ROCm Data Center Tool (RDC)
|
||||
|
||||
* **C++ upgrades**.
|
||||
RDC was upgraded from C++11 to C++17 to enable a more modern C++ standard when writing RDC plugins.
|
||||
|
||||
### ROCm Performance Primitives (RPP)
|
||||
|
||||
* **New backend support**.
|
||||
Added audio processing support for the `HOST` backend and 3D Voxel kernels support
|
||||
for the `HOST` and `HIP` backends.
|
||||
|
||||
### ROCm Validation Suite
|
||||
|
||||
* **New datatype support**.
|
||||
Added BF16 and FP8 datatypes based on General Matrix Multiply (GEMM) operations in the GPU Stress Test (GST) module. This provides additional performance benchmarking and stress testing based on the newly supported datatypes.
|
||||
|
||||
### rocSOLVER
|
||||
|
||||
* **New EigenSolver routine**.
|
||||
Based on the Jacobi algorithm, a new EigenSolver routine was added to the library. This routine computes the eigenvalues and eigenvectors of a matrix with improved performance.
|
||||
|
||||
### ROCTracer
|
||||
|
||||
* **New versioning and callback enhancements**.
|
||||
ROCTracer was improved to match versioning changes in HIP Runtime and now supports runtime API callbacks and activity record logging. The APIs of different runtimes at different levels are considered different API domains with assigned domain IDs.
|
||||
|
||||
## Upcoming changes
|
||||
|
||||
* ROCm SMI will be deprecated in a future release. We advise **migrating to AMD SMI** now to
|
||||
prevent future workflow disruptions.
|
||||
|
||||
* hipCC supports, by default, the following compiler invocation flags:
|
||||
|
||||
* `-mllvm -amdgpu-early-inline-all=true`
|
||||
* `-mllvm -amdgpu-function-calls=false`
|
||||
|
||||
In a future ROCm release, hipCC will no longer support these flags. It will, instead, use the Clang
|
||||
defaults:
|
||||
|
||||
* `-mllvm -amdgpu-early-inline-all=false`
|
||||
* `-mllvm -amdgpu-function-calls=true`
|
||||
|
||||
To evaluate the impact of this change, include `--hipcc-func-supp` in your hipCC invocation.
|
||||
|
||||
For information on these flags, and the differences between hipCC and Clang, refer to
|
||||
[ROCm Compiler Interfaces](https://rocm.docs.amd.com/en/latest/reference/rocmcc.html#rocm-compiler-interfaces).
|
||||
|
||||
* Future ROCm releases will not provide `clang-ocl`. For more information, refer to the
|
||||
[`clang-ocl` README](https://github.com/ROCm/clang-ocl).
|
||||
|
||||
* The following operating systems will be supported in a future ROCm release. They are currently
|
||||
only available in beta.
|
||||
|
||||
* RHEL 9.4
|
||||
* RHEL 8.10
|
||||
* SLES 15 SP6
|
||||
|
||||
* As of ROCm 6.2, we’ve planned for **end-of-support** for:
|
||||
|
||||
* Ubuntu 20.04.5
|
||||
* SLES 15 SP4
|
||||
* RHEL/CentOS 7.9
|
||||
|
||||
69
default.xml
69
default.xml
@@ -2,69 +2,68 @@
|
||||
<manifest>
|
||||
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
|
||||
<remote name="KhronosGroup" fetch="https://github.com/KhronosGroup/" />
|
||||
<default revision="refs/tags/rocm-6.0.2"
|
||||
<default revision="refs/tags/rocm-6.1.0"
|
||||
remote="rocm-org"
|
||||
sync-c="true"
|
||||
sync-j="4" />
|
||||
<!--list of projects for ROCm-->
|
||||
<project path="ROCm-OpenCL-Runtime/api/opencl/khronos/icd" name="OpenCL-ICD-Loader" remote="KhronosGroup" />
|
||||
<project name="ROCK-Kernel-Driver" />
|
||||
<project name="ROCT-Thunk-Interface" />
|
||||
<project name="ROCR-Runtime" />
|
||||
<project name="ROCT-Thunk-Interface" />
|
||||
<project name="amdsmi" />
|
||||
<project name="rocm_smi_lib" />
|
||||
<project name="rocm-core" />
|
||||
<project name="rocm-cmake" />
|
||||
<project name="rocminfo" />
|
||||
<project name="rocm_bandwidth_test" />
|
||||
<project name="rocprofiler" />
|
||||
<project name="roctracer" />
|
||||
<project path="ROCm-OpenCL-Runtime/api/opencl/khronos/icd" name="OpenCL-ICD-Loader" remote="KhronosGroup" revision="6c03f8b58fafd9dd693eaac826749a5cfad515f8" />
|
||||
<project name="clang-ocl" />
|
||||
<project name="rdc" />
|
||||
<project name="rocm_bandwidth_test" />
|
||||
<project name="rocm_smi_lib" />
|
||||
<project name="rocm-core" />
|
||||
<project name="rocminfo" />
|
||||
<project name="rocprofiler" />
|
||||
<project name="rocprofiler-register" />
|
||||
<project name="roctracer" />
|
||||
<!--HIP Projects-->
|
||||
<project name="HIP" />
|
||||
<project name="HIP-Examples" />
|
||||
<project name="HIPIFY" />
|
||||
<project name="clr" />
|
||||
<project name="hipother" />
|
||||
<project name="HIPIFY" />
|
||||
<project name="HIPCC" />
|
||||
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
|
||||
<project name="half" />
|
||||
<project name="llvm-project" />
|
||||
<project name="ROCm-Device-Libs" />
|
||||
<project name="ROCm-CompilerSupport" />
|
||||
<project name="half" revision="37742ce15b76b44e4b271c1e66d13d2fa7bd003e" />
|
||||
<!-- gdb projects -->
|
||||
<project name="ROCgdb" />
|
||||
<project name="ROCdbgapi" />
|
||||
<project name="ROCgdb" />
|
||||
<project name="rocr_debug_agent" />
|
||||
<!-- ROCm Libraries -->
|
||||
<project groups="mathlibs" name="rocBLAS" />
|
||||
<project groups="mathlibs" name="AMDMIGraphX" />
|
||||
<project groups="mathlibs" name="MIOpen" />
|
||||
<project groups="mathlibs" name="MIVisionX" />
|
||||
<project groups="mathlibs" name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="Tensile" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="composable_kernel" />
|
||||
<project groups="mathlibs" name="hipBLAS" />
|
||||
<project groups="mathlibs" name="hipBLASLt" />
|
||||
<project groups="mathlibs" name="rocFFT" />
|
||||
<project groups="mathlibs" name="hipCUB" />
|
||||
<project groups="mathlibs" name="hipFFT" />
|
||||
<project groups="mathlibs" name="rocRAND" />
|
||||
<project groups="mathlibs" name="hipRAND" />
|
||||
<project groups="mathlibs" name="rocSPARSE" />
|
||||
<project groups="mathlibs" name="hipSPARSELt" />
|
||||
<project groups="mathlibs" name="rocSOLVER" />
|
||||
<project groups="mathlibs" name="hipSOLVER" />
|
||||
<project groups="mathlibs" name="hipSPARSE" />
|
||||
<project groups="mathlibs" name="rocALUTION" />
|
||||
<project groups="mathlibs" name="rocThrust" />
|
||||
<project groups="mathlibs" name="hipCUB" />
|
||||
<project groups="mathlibs" name="rocPRIM" />
|
||||
<project groups="mathlibs" name="rocWMMA" />
|
||||
<project groups="mathlibs" name="hipSPARSELt" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipfort" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
<project name="MIOpen" />
|
||||
<project name="composable_kernel" />
|
||||
<project name="MIVisionX" />
|
||||
<project name="rpp" />
|
||||
<project name="hipfort" />
|
||||
<project name="AMDMIGraphX" />
|
||||
<project name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="rocALUTION" />
|
||||
<project groups="mathlibs" name="rocBLAS" />
|
||||
<project groups="mathlibs" name="rocDecode" />
|
||||
<project groups="mathlibs" name="rocFFT" />
|
||||
<project groups="mathlibs" name="rocPRIM" />
|
||||
<project groups="mathlibs" name="rocRAND" />
|
||||
<project groups="mathlibs" name="rocSOLVER" />
|
||||
<project groups="mathlibs" name="rocSPARSE" />
|
||||
<project groups="mathlibs" name="rocThrust" />
|
||||
<project groups="mathlibs" name="rocWMMA" />
|
||||
<project groups="mathlibs" name="rocm-cmake" />
|
||||
<project groups="mathlibs" name="rpp" />
|
||||
<!-- Projects for OpenMP-Extras -->
|
||||
<project name="aomp" path="openmp-extras/aomp" />
|
||||
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
.. meta::
|
||||
:description: Supported data types in ROCm
|
||||
:keywords: int8, float8, float8 (E4M3), float8 (E5M2), bfloat8, float16, half, bfloat16, tensorfloat32, float, float32, float64, double, AMD, ROCm, AMDGPU
|
||||
|
||||
.. _rocm-supported-data-types:
|
||||
:keywords: int8, float8, float8 (E4M3), float8 (E5M2), bfloat8, float16, half, bfloat16, tensorfloat32, float,
|
||||
float32, float64, double, AMD, ROCm, AMDGPU
|
||||
|
||||
*************************************************************
|
||||
ROCm data type specifications
|
||||
Precision support
|
||||
*************************************************************
|
||||
|
||||
Use the following sections to identify data types and HIP types ROCm™ supports.
|
||||
|
||||
Integral types
|
||||
==========================================
|
||||
|
||||
The signed and unsigned integral types that are supported by ROCm™ are listed in the following table,
|
||||
The signed and unsigned integral types that are supported by ROCm are listed in the following table,
|
||||
together with their corresponding HIP type and a short description.
|
||||
|
||||
|
||||
@@ -403,37 +404,37 @@ description, refer to the corresponding library data type support page.
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
|
||||
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
|
||||
- ✅/✅
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
*
|
||||
- rocRAND (:doc:`details<rocrand:data-type-support>`)
|
||||
- rocRAND (:doc:`details <rocrand:data-type-support>`)
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- hipRAND (:doc:`details<hiprand:data-type-support>`)
|
||||
- hipRAND (:doc:`details <hiprand:data-type-support>`)
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- rocPRIM (:doc:`details<rocprim:data-type-support>`)
|
||||
- rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- hipCUB (:doc:`details<hipcub:data-type-support>`)
|
||||
- hipCUB (:doc:`details <hipcub:data-type-support>`)
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- rocThrust (:doc:`details<rocthrust:data-type-support>`)
|
||||
- rocThrust (:doc:`details <rocthrust:data-type-support>`)
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
@@ -455,7 +456,7 @@ description, refer to the corresponding library data type support page.
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
|
||||
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
@@ -464,7 +465,7 @@ description, refer to the corresponding library data type support page.
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
*
|
||||
- rocRAND (:doc:`details<rocrand:data-type-support>`)
|
||||
- rocRAND (:doc:`details <rocrand:data-type-support>`)
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
@@ -473,7 +474,7 @@ description, refer to the corresponding library data type support page.
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- hipRAND (:doc:`details<hiprand:data-type-support>`)
|
||||
- hipRAND (:doc:`details <hiprand:data-type-support>`)
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
@@ -482,7 +483,7 @@ description, refer to the corresponding library data type support page.
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- rocPRIM (:doc:`details<rocprim:data-type-support>`)
|
||||
- rocPRIM (:doc:`details <rocprim:reference/data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
@@ -491,7 +492,7 @@ description, refer to the corresponding library data type support page.
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- hipCUB (:doc:`details<hipcub:data-type-support>`)
|
||||
- hipCUB (:doc:`details <hipcub:data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
@@ -500,7 +501,7 @@ description, refer to the corresponding library data type support page.
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- rocThrust (:doc:`details<rocthrust:data-type-support>`)
|
||||
- rocThrust (:doc:`details <rocthrust:data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ⚠️/⚠️
|
||||
@@ -531,7 +532,7 @@ description, refer to the corresponding library data type support page.
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
|
||||
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
@@ -554,7 +555,7 @@ description, refer to the corresponding library data type support page.
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
|
||||
- hipSPARSELt (:doc:`details <hipsparselt:reference/data-type-support>`)
|
||||
- ❌
|
||||
- ❌
|
||||
- ❌
|
||||
47
docs/conceptual/setting-cus.rst
Normal file
47
docs/conceptual/setting-cus.rst
Normal file
@@ -0,0 +1,47 @@
|
||||
.. meta::
|
||||
:description: Setting the number of CUs
|
||||
:keywords: AMD, ROCm, cu, number of cus
|
||||
|
||||
.. _env-variables-reference:
|
||||
|
||||
*************************************************************
|
||||
Setting the number of CUs
|
||||
*************************************************************
|
||||
|
||||
When using GPUs to accelerate compute workloads, it sometimes becomes necessary
|
||||
to configure the usage of Compute Units (CU) of the hardware. This is a more advanced
|
||||
option, so please read this page before experimentation.
|
||||
|
||||
The GPU driver provides two environment variables to set the number of CUs used. The
|
||||
first one is ``HSA_CU_MASK`` and the second one is ``ROC_GLOBAL_CU_MASK``. The main
|
||||
difference is that ``ROC_GLOBAL_CU_MASK`` sets the CU mask on queues created by
|
||||
the HIP or the OpenCL runtimes. While ``HSA_CU_MASK`` sets the mask on a lower level of
|
||||
queue creation in the driver, this mask will also be set for queues being profiled.
|
||||
|
||||
The environment variables have the following syntax:
|
||||
|
||||
::
|
||||
|
||||
ID = [0-9][0-9]* ex. base 10 numbers
|
||||
ID_list = (ID | ID-ID)[, (ID | ID-ID)]* ex. 0,2-4,7
|
||||
GPU_list = ID_list ex. 0,2-4,7
|
||||
CU_list = 0x[0-F]* | ID_list ex. 0x337F OR 0,2-4,7
|
||||
CU_Set = GPU_list : CU_list ex. 0,2-4,7:0-15,32-47 OR 0,2-4,7:0x337F
|
||||
HSA_CU_MASK = CU_Set [; CU_Set]* ex. 0,2-4,7:0-15,32-47; 3-9:0x337F
|
||||
|
||||
The GPU indices are taken post ``ROCR_VISIBLE_DEVICES`` reordering. For GPUs listed,
|
||||
the listed or masked CUs will be enabled, the rest disabled. Unlisted GPUs will not
|
||||
be affected; their CUs will all be enabled.
|
||||
|
||||
The parsing of the variable is stopped when a syntax error occurs. The erroneous set
|
||||
and the ones following will be ignored. Repeating GPU or CU IDs are a syntax error.
|
||||
Specifying a mask with no usable CUs (CU_list is 0x0) is a syntax error. For excluding
|
||||
GPU devices use ``ROCR_VISIBLE_DEVICES``.
|
||||
|
||||
These environment variables only affect ROCm software, not graphics applications.
|
||||
|
||||
It's important to know that not all CU configurations are valid on all devices. For
|
||||
instance, on devices where two CUs can be combined into a WGP (for kernels running in
|
||||
WGP mode), it is not valid to disable only a single CU in a WGP. `This paper
|
||||
<https://www.cs.unc.edu/~otternes/papers/rtsj2022.pdf>`_ can provide more information
|
||||
about what to expect when disabling CUs.
|
||||
@@ -8,15 +8,12 @@
|
||||
# Using the LLVM ASan on a GPU (beta release)
|
||||
|
||||
The LLVM AddressSanitizer (ASan) provides a process that allows developers to detect runtime addressing errors in applications and libraries. The detection is achieved using a combination of compiler-added instrumentation and runtime techniques, including function interception and replacement.
|
||||
|
||||
Until now, the LLVM ASan process was only available for traditional purely CPU applications. However, ROCm has extended this mechanism to additionally allow the detection of some addressing errors on the GPU in heterogeneous applications. Ideally, developers should treat heterogeneous HIP and OpenMP applications exactly like pure CPU applications. However, this simplicity has not been achieved yet.
|
||||
|
||||
This document provides documentation on using ROCm ASan.
|
||||
|
||||
For information about LLVM ASan, see the [LLVM documentation](https://clang.llvm.org/docs/AddressSanitizer.html).
|
||||
|
||||
:::{note}
|
||||
The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
|
||||
:::
|
||||
**Note:** The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
|
||||
|
||||
## Compiling for ASan
|
||||
|
||||
@@ -26,13 +23,20 @@ Recommendations for doing this are:
|
||||
|
||||
* Compile as many application and dependent library sources as possible using an AMD-built clang-based compiler such as `amdclang++`.
|
||||
* Add the following options to the existing compiler and linker options:
|
||||
|
||||
* `-fsanitize=address` - enables instrumentation
|
||||
|
||||
* `-shared-libsan` - use shared version of runtime
|
||||
|
||||
* `-g` - add debug info for improved reporting
|
||||
|
||||
* Explicitly use `xnack+` in the offload architecture option. For example, `--offload-arch=gfx90a:xnack+`
|
||||
|
||||
Other architectures are allowed, but their device code will not be instrumented and a warning will be emitted.
|
||||
|
||||
It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
|
||||
**Note:** It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
|
||||
|
||||
**Note:** When compiling OpenMP programs with ASan instrumentation, it is currently necessary to set the environment variable `LIBRARY_PATH` to `/opt/rocm-<version>/lib/llvm/lib/asan:/opt/rocm-<version>/lib/asan`. At runtime, it may be necessary to add `/opt/rocm-<version>/lib/llvm/lib/asan` to `LD_LIBRARY_PATH`.
|
||||
|
||||
### About compilation time
|
||||
|
||||
@@ -56,7 +60,7 @@ For a complete ROCm GPU Sanitizer installation, including packages, instrumented
|
||||
## Using AMD-supplied ASan instrumented libraries
|
||||
|
||||
ROCm releases have optional packages that contain additional ASan instrumented builds of the ROCm libraries (usually found in `/opt/rocm-<version>/lib`). The instrumented libraries have identical names to the regular uninstrumented libraries, and are located in `/opt/rocm-<version>/lib/asan`.
|
||||
These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with g++. The preexisting build options are used but, as described above, additional options are used: `-fsanitize=address`, `-shared-libsan` and `-g`.
|
||||
These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with `g++`. The preexisting build options are used but, as described above, additional options are used: `-fsanitize=address`, `-shared-libsan` and `-g`.
|
||||
|
||||
These additional libraries avoid additional developer effort to locate repositories, identify the correct branch, check out the correct tags, and other efforts needed to build the libraries from the source. And they extend the ability of the process to detect addressing errors into the ROCm libraries themselves.
|
||||
|
||||
@@ -92,9 +96,10 @@ There are two `ASAN_OPTION` flags of particular note.
|
||||
|
||||
* `halt_on_error=0/1 default 1`.
|
||||
|
||||
This tells the ASAN runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option and if an error is detected within one of them, but halt_on_error is set to 0, more undefined behavior will occur.
|
||||
This tells the ASan runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option and if an error is detected within one of them, but halt_on_error is set to 0, more undefined behavior will occur.
|
||||
|
||||
* `detect_leaks=0/1 default 1`.
|
||||
|
||||
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSAN). Unfortunately, for heterogeneous applications, this default will result in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSAN suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
|
||||
|
||||
## Runtime overhead
|
||||
@@ -235,9 +240,167 @@ $ rocgdb <path to application>
|
||||
|
||||
### Using ASan with a short HIP application
|
||||
|
||||
Refer to the following example to use ASan with a short HIP application,
|
||||
Consider the following simple and short demo of using the Address Sanitizer with a HIP application:
|
||||
|
||||
https://github.com/Rmalavally/rocm-examples/blob/Rmalavally-patch-1/LLVM_ASAN/Using-Address-Sanitizer-with-a-Short-HIP-Application.md
|
||||
```C++
|
||||
|
||||
#include <cstdlib>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
__global__ void
|
||||
set1(int *p)
|
||||
{
|
||||
int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
p[i] = 1;
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int m = std::atoi(argv[1]);
|
||||
int n1 = std::atoi(argv[2]);
|
||||
int n2 = std::atoi(argv[3]);
|
||||
int c = std::atoi(argv[4]);
|
||||
int *dp;
|
||||
hipMalloc(&dp, m*sizeof(int));
|
||||
hipLaunchKernelGGL(set1, dim3(n1), dim3(n2), 0, 0, dp);
|
||||
int *hp = (int*)malloc(c * sizeof(int));
|
||||
hipMemcpy(hp, dp, m*sizeof(int), hipMemcpyDeviceToHost);
|
||||
hipDeviceSynchronize();
|
||||
hipFree(dp);
|
||||
free(hp);
|
||||
std::puts("Done.");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
This application will attempt to access invalid addresses for certain command line arguments. In particular, if `m < n1 * n2`, some device threads will attempt to access
|
||||
unallocated device memory.
|
||||
|
||||
Or, if `c < m`, the `hipMemcpy` function will copy past the end of the `malloc` allocated memory.
|
||||
|
||||
**Note**: The `hipcc` compiler is used here for simplicity.
|
||||
|
||||
Compiling without XNACK results in a warning.
|
||||
|
||||
```bash
|
||||
$ hipcc -g --offload-arch=gfx90a:xnack- -fsanitize=address -shared-libsan mini.hip -o mini
|
||||
clang++: warning: ignoring '-fsanitize=address' option for offload arch 'gfx90a:xnack-', as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead [-Woption-ignored]
|
||||
```
|
||||
|
||||
The binary compiled above will run, but the GPU code will not be instrumented and the `m < n1 * n2` error will not be detected. Switching to `--offload-arch=gfx90a:xnack+` in the command above results in a warning-free compilation and an instrumented application. After setting `PATH`, `LD_LIBRARY_PATH` and `HSA_XNACK` as described earlier, a check of the binary with `ldd` yields the following,
|
||||
|
||||
```bash
|
||||
$ ldd mini
|
||||
linux-vdso.so.1 (0x00007ffd1a5ae000)
|
||||
libclang_rt.asan-x86_64.so => /opt/rocm-6.1.0-99999/llvm/lib/clang/17.0.0/lib/linux/libclang_rt.asan-x86_64.so (0x00007fb9c14b6000)
|
||||
libamdhip64.so.5 => /opt/rocm-6.1.0-99999/lib/asan/libamdhip64.so.5 (0x00007fb9bedd3000)
|
||||
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fb9beba8000)
|
||||
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fb9bea59000)
|
||||
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fb9bea3e000)
|
||||
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fb9be84a000)
|
||||
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fb9be844000)
|
||||
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fb9be821000)
|
||||
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fb9be817000)
|
||||
libamd_comgr.so.2 => /opt/rocm-6.1.0-99999/lib/asan/libamd_comgr.so.2 (0x00007fb9b4382000)
|
||||
libhsa-runtime64.so.1 => /opt/rocm-6.1.0-99999/lib/asan/libhsa-runtime64.so.1 (0x00007fb9b3b00000)
|
||||
libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007fb9b3af3000)
|
||||
/lib64/ld-linux-x86-64.so.2 (0x00007fb9c2027000)
|
||||
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fb9b3ad7000)
|
||||
libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fb9b3aa7000)
|
||||
libelf.so.1 => /lib/x86_64-linux-gnu/libelf.so.1 (0x00007fb9b3a89000)
|
||||
libdrm.so.2 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm.so.2 (0x00007fb9b3a70000)
|
||||
libdrm_amdgpu.so.1 => /opt/amdgpu/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1 (0x00007fb9b3a62000)
|
||||
|
||||
```
|
||||
|
||||
This confirms that the address sanitizer runtime is linked in, and the ASan instrumented versions of the runtime libraries are used.
|
||||
Checking the `PATH` yields
|
||||
|
||||
```bash
|
||||
$ which llvm-symbolizer
|
||||
/opt/rocm-6.1.0-99999/llvm/bin/llvm-symbolizer
|
||||
```
|
||||
|
||||
Lastly, a check of the OS kernel version yields
|
||||
|
||||
```bash
|
||||
$ uname -rv
|
||||
5.15.0-73-generic #80~20.04.1-Ubuntu SMP Wed May 17 14:58:14 UTC 2023
|
||||
```
|
||||
|
||||
which indicates that the required HMM support (kernel version > 5.6) is available. This completes the necessary setup. Running with `m = 100`, `n1 = 11`, `n2 = 10` and `c = 100` should produce
|
||||
a report for an invalid access by the last 10 threads.
|
||||
|
||||
```bash
|
||||
=================================================================
|
||||
==3141==ERROR: AddressSanitizer: heap-buffer-overflow on amdgpu device 0 at pc 0x7fb1410d2cc4
|
||||
WRITE of size 4 in workgroup id (10,0,0)
|
||||
#0 0x7fb1410d2cc4 in set1(int*) at /home/dave/mini/mini.cpp:0:10
|
||||
|
||||
Thread ids and accessed addresses:
|
||||
00 : 0x7fb14371d190 01 : 0x7fb14371d194 02 : 0x7fb14371d198 03 : 0x7fb14371d19c 04 : 0x7fb14371d1a0 05 : 0x7fb14371d1a4 06 : 0x7fb14371d1a8 07 : 0x7fb14371d1ac
|
||||
08 : 0x7fb14371d1b0 09 : 0x7fb14371d1b4
|
||||
|
||||
0x7fb14371d190 is located 0 bytes after 400-byte region [0x7fb14371d000,0x7fb14371d190)
|
||||
allocated by thread T0 here:
|
||||
#0 0x7fb151c76828 in hsa_amd_memory_pool_allocate /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:692:3
|
||||
#1 ...
|
||||
|
||||
#12 0x7fb14fb99ec4 in hipMalloc /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:568:3
|
||||
#13 0x226630 in hipError_t hipMalloc<int>(int**, unsigned long) /opt/rocm-6.1.0-99999/include/hip/hip_runtime_api.h:8367:12
|
||||
#14 0x226630 in main /home/dave/mini/mini.cpp:19:5
|
||||
#15 0x7fb14ef02082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
|
||||
|
||||
Shadow bytes around the buggy address:
|
||||
0x7fb14371cf00: ...
|
||||
|
||||
=>0x7fb14371d180: 00 00[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa
|
||||
0x7fb14371d200: ...
|
||||
|
||||
Shadow byte legend (one shadow byte represents 8 application bytes):
|
||||
Addressable: 00
|
||||
Partially addressable: 01 02 03 04 05 06 07
|
||||
Heap left redzone: fa
|
||||
...
|
||||
==3141==ABORTING
|
||||
```
|
||||
|
||||
Running with `m = 100`, `n1 = 10`, `n2 = 10` and `c = 99` should produce a report for an invalid copy.
|
||||
|
||||
```shell
|
||||
=================================================================
|
||||
==2817==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x514000150dcc at pc 0x7f5509551aca bp 0x7ffc90a7ae50 sp 0x7ffc90a7a610
|
||||
WRITE of size 400 at 0x514000150dcc thread T0
|
||||
#0 0x7f5509551ac9 in __asan_memcpy /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3
|
||||
#1 ...
|
||||
|
||||
#9 0x7f5507462a28 in hipMemcpy_common(void*, void const*, unsigned long, hipMemcpyKind, ihipStream_t*) /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:637:10
|
||||
#10 0x7f5507464205 in hipMemcpy /work/dave/git/compute/external/clr/hipamd/src/hip_memory.cpp:642:3
|
||||
#11 0x226844 in main /home/dave/mini/mini.cpp:22:5
|
||||
#12 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
|
||||
#13 0x22605d in _start (/home/dave/mini/mini+0x22605d)
|
||||
|
||||
0x514000150dcc is located 0 bytes after 396-byte region [0x514000150c40,0x514000150dcc)
|
||||
allocated by thread T0 here:
|
||||
#0 0x7f5509553dcf in malloc /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
|
||||
#1 0x226817 in main /home/dave/mini/mini.cpp:21:21
|
||||
#2 0x7f55067c3082 in __libc_start_main /build/glibc-SzIz7B/glibc-2.31/csu/../csu/libc-start.c:308:16
|
||||
|
||||
SUMMARY: AddressSanitizer: heap-buffer-overflow /work/dave/git/compute/external/llvm-project/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp:61:3 in __asan_memcpy
|
||||
Shadow bytes around the buggy address:
|
||||
0x514000150b00: ...
|
||||
|
||||
=>0x514000150d80: 00 00 00 00 00 00 00 00 00[04]fa fa fa fa fa fa
|
||||
0x514000150e00: ...
|
||||
|
||||
Shadow byte legend (one shadow byte represents 8 application bytes):
|
||||
Addressable: 00
|
||||
Partially addressable: 01 02 03 04 05 06 07
|
||||
Heap left redzone: fa
|
||||
...
|
||||
==2817==ABORTING
|
||||
```
|
||||
|
||||
### Known issues with using GPU sanitizer
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ exclude_patterns = ['temp']
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
|
||||
extensions = ["rocm_docs"]
|
||||
extensions = ["rocm_docs", "sphinx_reredirects"]
|
||||
|
||||
external_projects_current_project = "rocm"
|
||||
|
||||
@@ -103,3 +103,7 @@ html_title = "ROCm Documentation"
|
||||
html_theme_options = {
|
||||
"link_main_doc": False
|
||||
}
|
||||
|
||||
redirects = {
|
||||
"reference/openmp/openmp": "../../about/compatibility/openmp.html"
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ To make edits to our documentation via PR, follow these steps:
|
||||
git clone git@github.com:ROCm/ROCm.git
|
||||
```
|
||||
|
||||
* Add your fork to this local copy of the repository. Run:
|
||||
* Optionally add your fork to this local copy of the repository by running:
|
||||
|
||||
```bash
|
||||
git remote add <name-of-my-fork> <git@github.com:my-username/ROCm.git>
|
||||
@@ -45,43 +45,33 @@ To make edits to our documentation via PR, follow these steps:
|
||||
To get the URL of your fork, go to your GitHub profile, select the fork and click the green 'Code'
|
||||
button (the same process you followed to get the main GitHub repository URL).
|
||||
|
||||
4. Check out the **develop** branch and run 'git pull' (and/or 'git pull origin develop' to ensure your
|
||||
local version has the most recent content.
|
||||
4. Change directory into your local copy of the repository, and run ``git pull`` (or ``git pull origin develop``) to ensure your local copy has the most recent content.
|
||||
|
||||
5. Create a new branch.
|
||||
5. Create and check out a new branch using the following command:
|
||||
|
||||
```bash
|
||||
git checkout -b my-new-branch
|
||||
git checkout -b <branch_name>
|
||||
```
|
||||
|
||||
6. Make your changes locally using your preferred code editor. Follow the guidelines listed on the
|
||||
6. Change directory into the `./docs` folder and make any documentation changes locally using your preferred code editor. Follow the guidelines listed on the
|
||||
[documentation structure](./doc-structure.md) page.
|
||||
|
||||
7. (optional) We recommend running a local test build to ensure the content looks the way you expect.
|
||||
|
||||
In your terminal, run the following commands from within your cloned repository:
|
||||
7. Optionally run a local test build of the documentation to ensure the content builds and looks as expected. In your terminal, run the following commands from within the `./docs` folder of your cloned repository:
|
||||
|
||||
```bash
|
||||
cd docs/ # The other commands are run from within the ./docs folder
|
||||
|
||||
pip3 install -r sphinx/requirements.txt # You only need to run this command once
|
||||
|
||||
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
|
||||
```
|
||||
|
||||
The build files are located in the `docs/_build` folder. To preview your build, open the index file
|
||||
(`docs/_build/html/index.html`) file. For more information, see
|
||||
[Building documentation](building.md). To learn
|
||||
more about our build tools, see
|
||||
[Documentation toolchain](toolchain.md).
|
||||
The build output files are located in the `docs/_build` folder. To preview your build, open the index file
|
||||
(`docs/_build/html/index.html`). For more information, see [Building documentation](building.md). To learn
|
||||
more about our build tools, see [Documentation toolchain](toolchain.md).
|
||||
|
||||
8. Commit your changes and push them to GitHub. Run:
|
||||
8. Commit your changes and push them to GitHub by running:
|
||||
|
||||
```bash
|
||||
git add <path-to-my-modified-file> # To add all modified files, you can use: git add .
|
||||
|
||||
git commit -m "my-updates"
|
||||
|
||||
git push <name-of-my-fork>
|
||||
```
|
||||
|
||||
|
||||
BIN
docs/data/rocm-software-stack-6_1_0.jpg
Normal file
BIN
docs/data/rocm-software-stack-6_1_0.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 250 KiB |
@@ -260,5 +260,5 @@ To run an OSU benchmark using multiple nodes, use the following code:
|
||||
.. code-block:: shell
|
||||
|
||||
export LD_LIBRARY_PATH=$OMPI_DIR/lib:$OFI_DIR/lib64:/opt/rocm/lib
|
||||
$OMPI_DIR/bin/mpirun -np 2 \
|
||||
$OMPI_DIR/bin/mpirun --mca pml ob1 --mca btl_ofi_mode 2 -np 2 \
|
||||
./c/mpi/pt2pt/standard/osu_bw D D
|
||||
|
||||
@@ -31,7 +31,7 @@ Our documentation is organized into the following categories:
|
||||
:padding: 2
|
||||
|
||||
* Linux
|
||||
* {doc}`Quick-start (Linux)<rocm-install-on-linux:tutorial/quick-start>`
|
||||
* {doc}`Quick start guide<rocm-install-on-linux:tutorial/quick-start>`
|
||||
* {doc}`Linux install guide<rocm-install-on-linux:how-to/native-install/index>`
|
||||
* {doc}`Package manager integration<rocm-install-on-linux:how-to/native-install/package-manager-integration>`
|
||||
* Windows
|
||||
@@ -40,6 +40,7 @@ Our documentation is organized into the following categories:
|
||||
* {doc}`Install Docker containers<rocm-install-on-linux:how-to/docker>`
|
||||
* {doc}`PyTorch for ROCm<rocm-install-on-linux:how-to/3rd-party/pytorch-install>`
|
||||
* {doc}`TensorFlow for ROCm<rocm-install-on-linux:how-to/3rd-party/tensorflow-install>`
|
||||
* {doc}`JAX for ROCm<rocm-install-on-linux:how-to/3rd-party/jax-install>`
|
||||
* {doc}`MAGMA for ROCm<rocm-install-on-linux:how-to/3rd-party/magma-install>`
|
||||
* {doc}`ROCm & Spack<rocm-install-on-linux:how-to/spack>`
|
||||
:::
|
||||
@@ -56,10 +57,11 @@ Our documentation is organized into the following categories:
|
||||
* {doc}`User/kernel space<rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`
|
||||
* {doc}`Docker<rocm-install-on-linux:reference/docker-image-support-matrix>`
|
||||
* [OpenMP](./about/compatibility/openmp.md)
|
||||
* [Precision support](./about/compatibility/data-type-support.rst)
|
||||
* [Precision support](./about/compatibility/precision-support.rst)
|
||||
* {doc}`ROCm on Radeon GPUs<radeon:index>`
|
||||
:::
|
||||
|
||||
<!-- markdownlint-disable MD051 -->
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ./data/banner-reference.jpg
|
||||
@@ -67,20 +69,19 @@ Our documentation is organized into the following categories:
|
||||
:padding: 2
|
||||
|
||||
* [API libraries](./reference/api-libraries.md)
|
||||
* Artificial intelligence
|
||||
* C++ primitives
|
||||
* Communication
|
||||
* Fast Fourier transforms
|
||||
* HIP
|
||||
* Linear algebra
|
||||
* Random number generators
|
||||
* [Artificial intelligence](#artificial-intelligence-apis)
|
||||
* [C++ primitives](#cpp-primitives)
|
||||
* [Communication](#communication-libraries)
|
||||
* [Math](#math-apis)
|
||||
* [Random number generators](#random-number-apis)
|
||||
* [HIP runtime](#hip-runtime)
|
||||
* [Tools](./reference/rocm-tools.md)
|
||||
* Development
|
||||
* Performance analysis
|
||||
* System
|
||||
* [GPU architectures](./reference/gpu-arch.rst)
|
||||
* [GPU architecture hardware specification overview](./reference/gpu-arch/gpu-arch-spec-overview.rst)
|
||||
* [Development](#development-tools)
|
||||
* [Performance analysis](#performance-analysis)
|
||||
* [System](#system-tools)
|
||||
* [Hardware specifications](./reference/gpu-arch-specs.rst)
|
||||
:::
|
||||
<!-- markdownlint-enable MD051 -->
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
@@ -109,6 +110,7 @@ Our documentation is organized into the following categories:
|
||||
* [MI250](./conceptual/gpu-arch/mi250.md)
|
||||
* [MI300](./conceptual/gpu-arch/mi300.md)
|
||||
* [GPU memory](./conceptual/gpu-memory.md)
|
||||
* [Setting the number of CUs](./conceptual/setting-cus.rst)
|
||||
* [Compiler disambiguation](./conceptual/compiler-disambiguation.md)
|
||||
* [File structure (Linux FHS)](./conceptual/file-reorg.md)
|
||||
* [GPU isolation techniques](./conceptual/gpu-isolation.md)
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
::::{grid} 1 2 2 2
|
||||
:class-container: rocm-doc-grid
|
||||
|
||||
(artificial-intelligence-apis)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-ai.jpg
|
||||
@@ -21,9 +23,13 @@
|
||||
* {doc}`MIGraphX <amdmigraphx:index>`
|
||||
* {doc}`MIOpen <miopen:index>`
|
||||
* {doc}`MIVisionX <mivisionx:doxygen/html/index>`
|
||||
* [ROCm Performance Primitives (RPP)](https://rocm.docs.amd.com/projects/rpp/en/latest/)
|
||||
* {doc}`rocAL <rocal:index>`
|
||||
* {doc}`rocDecode <rocdecode:index>`
|
||||
* {doc}`ROCm Performance Primitives (RPP) <rpp:index>`
|
||||
:::
|
||||
|
||||
(cpp-primitives)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-cpp-primitives.jpg
|
||||
@@ -36,6 +42,8 @@
|
||||
* {doc}`rocThrust <rocthrust:index>`
|
||||
:::
|
||||
|
||||
(communication-libraries)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-communication.jpg
|
||||
@@ -45,6 +53,8 @@
|
||||
* {doc}`RCCL <rccl:index>`
|
||||
:::
|
||||
|
||||
(hip-runtime)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-hip.jpg
|
||||
@@ -55,6 +65,8 @@
|
||||
* {doc}`HIPIFY <hipify:index>`
|
||||
:::
|
||||
|
||||
(math-apis)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-math.jpg
|
||||
@@ -65,7 +77,7 @@
|
||||
* {doc}`hipBLAS <hipblas:index>` / {doc}`rocBLAS <rocblas:index>`
|
||||
* {doc}`hipBLASLt <hipblaslt:index>`
|
||||
* {doc}`hipFFT <hipfft:index>` / {doc}`rocFFT <rocfft:index>`
|
||||
* [hipfort](https://rocm.docs.amd.com/projects/hipfort/en/latest/)
|
||||
* {doc}`hipfort <hipfort:index>`
|
||||
* {doc}`hipSOLVER <hipsolver:index>` / {doc}`rocSOLVER <rocsolver:index>`
|
||||
* {doc}`hipSPARSE <hipsparse:index>` / {doc}`rocSPARSE <rocsparse:index>`
|
||||
* {doc}`hipSPARSELt <hipsparselt:index>`
|
||||
@@ -74,6 +86,8 @@
|
||||
* [Tensile](https://github.com/ROCm/Tensile)
|
||||
:::
|
||||
|
||||
(random-number-apis)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-random-number.jpg
|
||||
|
||||
661
docs/reference/gpu-arch-specs.rst
Normal file
661
docs/reference/gpu-arch-specs.rst
Normal file
@@ -0,0 +1,661 @@
|
||||
.. meta::
|
||||
:description: AMD Instinct™ accelerator, AMD Radeon™ PRO, and AMD Radeon™ GPU architecture information
|
||||
:keywords: Instinct, Radeon, accelerator, CDNA, GPU, architecture, VRAM, Compute Units, Cache, Registers, LDS, Register File
|
||||
|
||||
Accelerator and GPU hardware specifications
|
||||
######################################################
|
||||
|
||||
The following tables provide an overview of the hardware specifications for AMD Instinct™ accelerators, and AMD Radeon™ PRO and Radeon™ GPUs.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: AMD Instinct accelerators
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:name: instinct-arch-spec-table
|
||||
|
||||
*
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
- LDS (KiB)
|
||||
- L3 Cache (MiB)
|
||||
- L2 Cache (MiB)
|
||||
- L1 Vector Cache (KiB)
|
||||
- L1 Scalar Cache (KiB)
|
||||
- L1 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
*
|
||||
- MI300X
|
||||
- CDNA3
|
||||
- gfx941 or gfx942
|
||||
- 192
|
||||
- 304
|
||||
- 64
|
||||
- 64
|
||||
- 256
|
||||
- 32
|
||||
- 32
|
||||
- 16 per 2 CUs
|
||||
- 64 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
*
|
||||
- MI300A
|
||||
- CDNA3
|
||||
- gfx940 or gfx942
|
||||
- 128
|
||||
- 228
|
||||
- 64
|
||||
- 64
|
||||
- 256
|
||||
- 24
|
||||
- 32
|
||||
- 16 per 2 CUs
|
||||
- 64 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
*
|
||||
- MI250X
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 128
|
||||
- 220 (110 per GCD)
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 16 (8 per GCD)
|
||||
- 16
|
||||
- 16 per 2 CUs
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
*
|
||||
- MI250
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 128
|
||||
- 208
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 16 (8 per GCD)
|
||||
- 16
|
||||
- 16 per 2 CUs
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
*
|
||||
- MI210
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 64
|
||||
- 104
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 8
|
||||
- 16
|
||||
- 16 per 2 CUs
|
||||
- 32 per 2 CUs
|
||||
- 512
|
||||
- 12.5
|
||||
*
|
||||
- MI100
|
||||
- CDNA
|
||||
- gfx908
|
||||
- 32
|
||||
- 120
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 8
|
||||
- 16
|
||||
- 16 per 3 CUs
|
||||
- 32 per 3 CUs
|
||||
- 256 VGPR and 256 AccVGPR
|
||||
- 12.5
|
||||
*
|
||||
- MI60
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 32
|
||||
- 64
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 4
|
||||
- 16
|
||||
- 16 per 3 CUs
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
*
|
||||
- MI50 (32GB)
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 32
|
||||
- 60
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 4
|
||||
- 16
|
||||
- 16 per 3 CUs
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
*
|
||||
- MI50 (16GB)
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 4
|
||||
- 16
|
||||
- 16 per 3 CUs
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
*
|
||||
- MI25
|
||||
- GCN5.0
|
||||
- gfx900
|
||||
- 16
|
||||
- 64
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 4
|
||||
- 16
|
||||
- 16 per 3 CUs
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
*
|
||||
- MI8
|
||||
- GCN3.0
|
||||
- gfx803
|
||||
- 4
|
||||
- 64
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 2
|
||||
- 16
|
||||
- 16 per 4 CUs
|
||||
- 32 per 4 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
*
|
||||
- MI6
|
||||
- GCN4.0
|
||||
- gfx803
|
||||
- 16
|
||||
- 36
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 2
|
||||
- 16
|
||||
- 16 per 4 CUs
|
||||
- 32 per 4 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
|
||||
.. tab-item:: AMD Radeon PRO GPUs
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:name: radeon-pro-arch-spec-table
|
||||
|
||||
*
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
- LDS (KiB)
|
||||
- Infinity Cache (MiB)
|
||||
- L2 Cache (MiB)
|
||||
- Graphics L1 Cache (KiB)
|
||||
- L0 Vector Cache (KiB)
|
||||
- L0 Scalar Cache (KiB)
|
||||
- L0 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
*
|
||||
- Radeon PRO W7900
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 48
|
||||
- 96
|
||||
- 32
|
||||
- 128
|
||||
- 96
|
||||
- 6
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 384
|
||||
- 20
|
||||
*
|
||||
- Radeon PRO W7800
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 32
|
||||
- 70
|
||||
- 32
|
||||
- 128
|
||||
- 64
|
||||
- 6
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 384
|
||||
- 20
|
||||
*
|
||||
- Radeon PRO W7700
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 16
|
||||
- 48
|
||||
- 32
|
||||
- 128
|
||||
- 64
|
||||
- 4
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 384
|
||||
- 20
|
||||
*
|
||||
- Radeon PRO W6800
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 32
|
||||
- 60
|
||||
- 32
|
||||
- 128
|
||||
- 128
|
||||
- 4
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon PRO W6600
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 8
|
||||
- 28
|
||||
- 32
|
||||
- 128
|
||||
- 32
|
||||
- 2
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon PRO V620
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 32
|
||||
- 72
|
||||
- 32
|
||||
- 128
|
||||
- 128
|
||||
- 4
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon Pro W5500
|
||||
- RDNA
|
||||
- gfx1012
|
||||
- 8
|
||||
- 22
|
||||
- 32
|
||||
- 128
|
||||
-
|
||||
- 4
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon Pro VII
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
- 64
|
||||
-
|
||||
- 4
|
||||
-
|
||||
- 16
|
||||
- 16 per 3 CUs
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
|
||||
.. tab-item:: AMD Radeon GPUs
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:name: radeon-arch-spec-table
|
||||
|
||||
*
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
- LDS (KiB)
|
||||
- Infinity Cache (MiB)
|
||||
- L2 Cache (MiB)
|
||||
- Graphics L1 Cache (KiB)
|
||||
- L0 Vector Cache (KiB)
|
||||
- L0 Scalar Cache (KiB)
|
||||
- L0 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
*
|
||||
- Radeon RX 7900 XTX
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 24
|
||||
- 96
|
||||
- 32
|
||||
- 128
|
||||
- 96
|
||||
- 6
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 384
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 7900 XT
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 20
|
||||
- 84
|
||||
- 32
|
||||
- 128
|
||||
- 80
|
||||
- 6
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 384
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 7900 GRE
|
||||
- RDNA3
|
||||
- gfx1100
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
- 128
|
||||
- 64
|
||||
- 6
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 384
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 7800 XT
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 16
|
||||
- 60
|
||||
- 32
|
||||
- 128
|
||||
- 64
|
||||
- 4
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 384
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 7700 XT
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 12
|
||||
- 54
|
||||
- 32
|
||||
- 128
|
||||
- 48
|
||||
- 4
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 384
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 7600
|
||||
- RDNA3
|
||||
- gfx1102
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
- 128
|
||||
- 32
|
||||
- 2
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6950 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
- 128
|
||||
- 128
|
||||
- 4
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6900 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 16
|
||||
- 80
|
||||
- 32
|
||||
- 128
|
||||
- 128
|
||||
- 4
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6800 XT
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 16
|
||||
- 72
|
||||
- 32
|
||||
- 128
|
||||
- 128
|
||||
- 4
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6800
|
||||
- RDNA2
|
||||
- gfx1030
|
||||
- 16
|
||||
- 60
|
||||
- 32
|
||||
- 128
|
||||
- 128
|
||||
- 4
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6750 XT
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 12
|
||||
- 40
|
||||
- 32
|
||||
- 128
|
||||
- 96
|
||||
- 3
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6700 XT
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 12
|
||||
- 40
|
||||
- 32
|
||||
- 128
|
||||
- 96
|
||||
- 3
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6700
|
||||
- RDNA2
|
||||
- gfx1031
|
||||
- 10
|
||||
- 36
|
||||
- 32
|
||||
- 128
|
||||
- 80
|
||||
- 3
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6650 XT
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
- 128
|
||||
- 32
|
||||
- 2
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6600 XT
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 8
|
||||
- 32
|
||||
- 32
|
||||
- 128
|
||||
- 32
|
||||
- 2
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon RX 6600
|
||||
- RDNA2
|
||||
- gfx1032
|
||||
- 8
|
||||
- 28
|
||||
- 32
|
||||
- 128
|
||||
- 32
|
||||
- 2
|
||||
- 128
|
||||
- 16
|
||||
- 16
|
||||
- 32
|
||||
- 256
|
||||
- 20
|
||||
*
|
||||
- Radeon VII
|
||||
- GCN5.1
|
||||
- gfx906
|
||||
- 16
|
||||
- 60
|
||||
- 64
|
||||
- 64 per CU
|
||||
-
|
||||
- 4
|
||||
-
|
||||
- 16
|
||||
- 16 per 3 CUs
|
||||
- 32 per 3 CUs
|
||||
- 256
|
||||
- 12.5
|
||||
|
||||
For more information on the terms used here, see the :ref:`specific documents and guides <gpu-arch-documentation>`, the :doc:`conceptual overview of the HIP programming model<hip:understand/programming_model>`, or the :doc:`HIP reference guide<hip:reference/programming_model>`.
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
.. meta::
|
||||
:description: GPU Architecture reference
|
||||
:keywords: AMD, GPU, architecture, hardware, CDNA, Instinct, reference
|
||||
|
||||
.. _gpu-arch-reference:
|
||||
|
||||
GPU architecture reference
|
||||
##########################
|
||||
|
||||
General overview
|
||||
""""""""""""""""
|
||||
|
||||
* :doc:`GPU architecture hardware specifications overview<gpu-arch/gpu-arch-spec-overview>`
|
||||
@@ -1,241 +0,0 @@
|
||||
.. meta::
|
||||
:description: AMD Instinct™ GPU architecture information
|
||||
:keywords: Instinct, CDNA, GPU, architecture, VRAM, Compute Units, Cache, Registers, LDS, Register File
|
||||
|
||||
GPU architecture hardware specifications
|
||||
########################################
|
||||
|
||||
The following table provides an overview of the hardware specifications for the AMD Instinct accelerators.
|
||||
|
||||
.. list-table:: AMD Instinct architecture specification table
|
||||
:header-rows: 1
|
||||
:name: instinct-arch-spec-table
|
||||
|
||||
*
|
||||
- Model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- VRAM
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
- LDS
|
||||
- L3 Cache
|
||||
- L2 Cache
|
||||
- L1 Vector Cache
|
||||
- L1 Scalar Cache
|
||||
- L1 Instruction Cache
|
||||
- VGPR File
|
||||
- SGPR File
|
||||
*
|
||||
- MI300X
|
||||
- CDNA3
|
||||
- gfx941 or gfx942
|
||||
- 192 GiB
|
||||
- 304
|
||||
- 64
|
||||
- 64 KiB
|
||||
- 256 MiB
|
||||
- 32 MiB
|
||||
- 32 KiB
|
||||
- 16 KiB per 2 CUs
|
||||
- 64 KiB per 2 CUs
|
||||
- 512 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI300A
|
||||
- CDNA3
|
||||
- gfx940 or gfx942
|
||||
- 128 GiB
|
||||
- 228
|
||||
- 64
|
||||
- 64 KiB
|
||||
- 256 MiB
|
||||
- 24 MiB
|
||||
- 32 KiB
|
||||
- 16 KiB per 2 CUs
|
||||
- 64 KiB per 2 CUs
|
||||
- 512 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI250X
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 128 GiB
|
||||
- 220 (110 per GCD)
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 16 MiB (8 MiB per GCD)
|
||||
- 16 KiB
|
||||
- 16 KiB per 2 CUs
|
||||
- 32 KiB per 2 CUs
|
||||
- 512 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI250
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 128 GiB
|
||||
- 208
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 16 MiB (8 MiB per GCD)
|
||||
- 16 KiB
|
||||
- 16 KiB per 2 CUs
|
||||
- 32 KiB per 2 CUs
|
||||
- 512 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI210
|
||||
- CDNA2
|
||||
- gfx90a
|
||||
- 64 GiB
|
||||
- 104
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 8 MiB
|
||||
- 16 KiB
|
||||
- 16 KiB per 2 CUs
|
||||
- 32 KiB per 2 CUs
|
||||
- 512 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI100
|
||||
- CDNA
|
||||
- gfx908
|
||||
- 32 GiB
|
||||
- 120
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 8 MiB
|
||||
- 16 KiB
|
||||
- 16 KiB per 3 CUs
|
||||
- 32 KiB per 3 CUs
|
||||
- 256 KiB VGPR and 256 KiB AccVGPR
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI60
|
||||
- GCN 5.1
|
||||
- gfx906
|
||||
- 32 GiB
|
||||
- 64
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 4 MiB
|
||||
- 16 KiB
|
||||
- 16 KiB per 3 CUs
|
||||
- 32 KiB per 3 CUs
|
||||
- 256 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI50 (32GB)
|
||||
- GCN 5.1
|
||||
- gfx906
|
||||
- 32 GiB
|
||||
- 60
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 4 MiB
|
||||
- 16 KiB
|
||||
- 16 KiB per 3 CUs
|
||||
- 32 KiB per 3 CUs
|
||||
- 256 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI50 (16GB)
|
||||
- GCN 5.1
|
||||
- gfx906
|
||||
- 16 GiB
|
||||
- 60
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 4 MiB
|
||||
- 16 KiB
|
||||
- 16 KiB per 3 CUs
|
||||
- 32 KiB per 3 CUs
|
||||
- 256 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI25
|
||||
- GCN 5.0
|
||||
- gfx900
|
||||
- 16 GiB
|
||||
- 64
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 4 MiB
|
||||
- 16 KiB
|
||||
- 16 KiB per 3 CUs
|
||||
- 32 KiB per 3 CUs
|
||||
- 256 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI8
|
||||
- GCN 3.0
|
||||
- gfx803
|
||||
- 4 GiB
|
||||
- 64
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 2 MiB
|
||||
- 16 KiB
|
||||
- 16 KiB per 4 CUs
|
||||
- 32 KiB per 4 CUs
|
||||
- 256 KiB
|
||||
- 12.5 KiB
|
||||
*
|
||||
- MI6
|
||||
- GCN 4.0
|
||||
- gfx803
|
||||
- 16 GiB
|
||||
- 36
|
||||
- 64
|
||||
- 64 KiB
|
||||
-
|
||||
- 2 MiB
|
||||
- 16 KiB
|
||||
- 16 KiB per 4 CUs
|
||||
- 32 KiB per 4 CUs
|
||||
- 256 KiB
|
||||
- 12.5 KiB
|
||||
|
||||
Glossary
|
||||
########
|
||||
|
||||
For a more detailed explanation, refer to the :ref:`specific documents and guides <gpu-arch-documentation>`.
|
||||
|
||||
LLVM target name
|
||||
Argument to pass to clang in `--offload-arch` to compile code for the given architecture.
|
||||
VRAM
|
||||
Amount of memory available on the GPU.
|
||||
Compute Units
|
||||
Number of compute units on the GPU.
|
||||
Wavefront Size
|
||||
Number of work-items that execute in parallel on a single compute unit. This is equivalent to the warp size in HIP.
|
||||
LDS
|
||||
The Local Data Share (LDS) is a low-latency, high-bandwidth scratch pad memory. It is local to the compute units, shared by all work-items in a work group. In HIP this is the shared memory, which is shared by all threads in a block.
|
||||
L3 Cache
|
||||
Size of the level 3 cache. Shared by all compute units on the same GPU. Caches vector and scalar data and instructions.
|
||||
L2 Cache
|
||||
Size of the level 2 cache. Shared by all compute units on the same GCD. Caches vector and scalar data and instructions.
|
||||
L1 Vector Cache
|
||||
Size of the level 1 vector data cache. Local to a compute unit. Caches vector data.
|
||||
L1 Scalar Cache
|
||||
Size of the level 1 scalar data cache. Usually shared by several compute units. Caches scalar data.
|
||||
L1 Instruction Cache
|
||||
Size of the level 1 instruction cache. Usually shared by several compute units.
|
||||
VGPR File
|
||||
Size of the Vector General Purpose Register (VGPR) file. Holds data used in vector instructions.
|
||||
GPUs with matrix cores also have AccVGPRs, which are Accumulation General Purpose Vector Registers, specifically used in matrix instructions.
|
||||
SGPR File
|
||||
Size of the Scalar General Purpose Register (SGPR) file. Holds data used in scalar instructions.
|
||||
GCD
|
||||
Graphics Compute Die.
|
||||
@@ -11,6 +11,8 @@
|
||||
::::{grid} 1 2 2 2
|
||||
:class-container: rocm-doc-grid
|
||||
|
||||
(development-tools)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-development.jpg
|
||||
@@ -23,6 +25,8 @@
|
||||
* {doc}`ROCm debugger (ROCgdb) <rocgdb:index>`
|
||||
:::
|
||||
|
||||
(performance-tools)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-performance.jpg
|
||||
@@ -34,6 +38,8 @@
|
||||
* {doc}`ROCTracer <roctracer:index>`
|
||||
:::
|
||||
|
||||
(system-tools)=
|
||||
|
||||
:::{grid-item-card}
|
||||
:class-card: sd-text-black
|
||||
:img-top: ../data/reference/banner-system.jpg
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
| Version | Release date |
|
||||
| ------- | ------------ |
|
||||
| [6.1.0](https://rocm.docs.amd.com/en/docs-6.1.0/) | Apr 16, 2024 |
|
||||
| [6.0.2](https://rocm.docs.amd.com/en/docs-6.0.2/) | Jan 31, 2024 |
|
||||
| [6.0.0](https://rocm.docs.amd.com/en/docs-6.0.0/) | Dec 15, 2023 |
|
||||
| [5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/) | Oct 13, 2023 |
|
||||
|
||||
@@ -8,7 +8,7 @@ subtrees:
|
||||
- entries:
|
||||
- file: what-is-rocm.rst
|
||||
- file: about/release-notes.md
|
||||
title: Release notes
|
||||
title: Release highlights
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: about/CHANGELOG.md
|
||||
@@ -29,7 +29,7 @@ subtrees:
|
||||
title: Linux
|
||||
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
|
||||
title: Windows
|
||||
- file: about/compatibility/data-type-support.rst
|
||||
- file: about/compatibility/precision-support.rst
|
||||
title: Precision support
|
||||
- url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/reference/3rd-party-support-matrix.html
|
||||
title: Third-party
|
||||
@@ -40,14 +40,10 @@ subtrees:
|
||||
title: API libraries
|
||||
- file: reference/rocm-tools.md
|
||||
title: Tools
|
||||
- file: reference/gpu-arch.rst
|
||||
title: GPU architectures
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: reference/gpu-arch/gpu-arch-spec-overview.rst
|
||||
title: Hardware specifications overview
|
||||
- file: reference/gpu-arch-specs.rst
|
||||
title: Hardware specifications
|
||||
|
||||
- caption: How-to
|
||||
- caption: How to
|
||||
entries:
|
||||
- file: how-to/deep-learning-rocm.md
|
||||
title: Deep learning
|
||||
@@ -71,7 +67,7 @@ subtrees:
|
||||
- caption: Conceptual
|
||||
entries:
|
||||
- file: conceptual/gpu-arch.md
|
||||
title: GPU architectures
|
||||
title: GPU architecture overview
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: conceptual/gpu-arch/mi300.md
|
||||
@@ -102,6 +98,8 @@ subtrees:
|
||||
title: White paper
|
||||
- file: conceptual/gpu-memory.md
|
||||
title: GPU memory
|
||||
- file: conceptual/setting-cus
|
||||
title: Setting the number of CUs
|
||||
- file: conceptual/compiler-disambiguation.md
|
||||
title: Compiler disambiguation
|
||||
- file: about/compatibility/openmp.md
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
rocm-docs-core==0.35.1
|
||||
rocm-docs-core==1.0.0
|
||||
sphinx-reredirects==0.1.3
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements.in
|
||||
@@ -42,14 +42,10 @@ gitdb==4.0.10
|
||||
# via gitpython
|
||||
gitpython==3.1.41
|
||||
# via rocm-docs-core
|
||||
idna==3.4
|
||||
idna==3.7
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==7.0.0
|
||||
# via sphinx
|
||||
importlib-resources==6.1.1
|
||||
# via rocm-docs-core
|
||||
jinja2==3.1.3
|
||||
# via
|
||||
# myst-parser
|
||||
@@ -84,9 +80,7 @@ pygments==2.15.0
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
pyjwt[crypto]==2.6.0
|
||||
# via
|
||||
# pygithub
|
||||
# pyjwt
|
||||
# via pygithub
|
||||
pynacl==1.5.0
|
||||
# via pygithub
|
||||
pytz==2022.7.1
|
||||
@@ -100,7 +94,7 @@ requests==2.31.0
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==0.35.1
|
||||
rocm-docs-core==1.0.0
|
||||
# via -r requirements.in
|
||||
smmap==5.0.0
|
||||
# via gitdb
|
||||
@@ -119,6 +113,7 @@ sphinx==5.3.0
|
||||
# sphinx-design
|
||||
# sphinx-external-toc
|
||||
# sphinx-notfound-page
|
||||
# sphinx-reredirects
|
||||
sphinx-book-theme==1.0.1
|
||||
# via rocm-docs-core
|
||||
sphinx-copybutton==0.5.1
|
||||
@@ -129,6 +124,8 @@ sphinx-external-toc==0.3.1
|
||||
# via rocm-docs-core
|
||||
sphinx-notfound-page==0.8.3
|
||||
# via rocm-docs-core
|
||||
sphinx-reredirects==0.1.3
|
||||
# via -r requirements.in
|
||||
sphinxcontrib-applehelp==1.0.4
|
||||
# via sphinx
|
||||
sphinxcontrib-devhelp==1.0.2
|
||||
@@ -147,7 +144,3 @@ urllib3==1.26.13
|
||||
# via requests
|
||||
wrapt==1.14.1
|
||||
# via deprecated
|
||||
zipp==3.17.0
|
||||
# via
|
||||
# importlib-metadata
|
||||
# importlib-resources
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
.. meta::
|
||||
:description: What is ROCm
|
||||
:keywords: ROCm projects, introduction, ROCm, AMD, runtimes, compilers, tools, libraries, API
|
||||
:keywords: ROCm components, ROCm projects, introduction, ROCm, AMD, runtimes, compilers, tools, libraries, API
|
||||
|
||||
***********************************************************
|
||||
What is ROCm?
|
||||
@@ -10,8 +10,13 @@ ROCm is an open-source stack, composed primarily of open-source software, design
|
||||
graphics processing unit (GPU) computation. ROCm consists of a collection of drivers, development
|
||||
tools, and APIs that enable GPU programming from low-level kernel to end-user applications.
|
||||
|
||||
.. image:: data/rocm-software-stack-6_1_0.jpg
|
||||
:width: 800
|
||||
:alt: AMD's ROCm software stack and neighboring technologies.
|
||||
:align: center
|
||||
|
||||
ROCm is powered by
|
||||
`Heterogeneous-computing Interface for Portability (HIP) <https://rocm.docs.amd.com/projects/HIP/en/latest/index.html>`_;
|
||||
:doc:`Heterogeneous-computing Interface for Portability (HIP) <hip:index>`;
|
||||
it supports programming models, such as OpenMP and OpenCL, and includes all necessary open
|
||||
source software compilers, debuggers, and libraries. It's fully integrated into machine learning (ML)
|
||||
frameworks, such as PyTorch and TensorFlow.
|
||||
@@ -20,63 +25,112 @@ frameworks, such as PyTorch and TensorFlow.
|
||||
If you're using Radeon GPUs, refer to the
|
||||
:doc:`Radeon-specific ROCm documentation <radeon:index>`.
|
||||
|
||||
ROCm project list
|
||||
ROCm components
|
||||
===============================================
|
||||
|
||||
ROCm consists of the following projects. For information on the license associated with each project,
|
||||
ROCm consists of the following components. For information on the license associated with each component,
|
||||
see :doc:`ROCm licensing <./about/license>`.
|
||||
|
||||
.. csv-table::
|
||||
:header: "Project", "Type", "Description"
|
||||
Libraries
|
||||
-----------------------------------------------
|
||||
|
||||
"`AMD Compute Language Runtimes (CLR) <https://github.com/ROCm/clr>`_", "Runtime", "Contains source code for AMD's compute languages runtimes: :doc:`HIP <hip:index>` and OpenCL"
|
||||
":doc:`AMD SMI <amdsmi:index>`", "Tool", "C library for Linux that provides a user space interface for applications to monitor and control AMD devices"
|
||||
"`AOMP <https://github.com/ROCm/aomp/>`_", "Compiler", "Scripted build of `LLVM <https://github.com/ROCm/llvm-project>`_ and supporting software"
|
||||
":doc:`Composable Kernel <composable_kernel:index>`", "Library (AI/ML)", "Provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures"
|
||||
"`FLANG <https://github.com/ROCm/flang/>`_", "Compiler", "An out-of-tree Fortran compiler targeting LLVM"
|
||||
"`half <https://github.com/ROCm/half/>`_", "Library (math)", "C++ header-only library that provides an IEEE 754 conformant, 16-bit half-precision floating-point type, along with corresponding arithmetic operators, type conversions, and common mathematical functions"
|
||||
":doc:`HIP <hip:index>`", "Runtime", AMD's GPU programming language extension and the GPU runtime"
|
||||
":doc:`hipBLAS <hipblas:index>`", "Library (math)", "BLAS-marshaling library that supports `rocBLAS <https://rocm.docs.amd.com/projects/rocBLAS/en/latest/>`_ and cuBLAS backends"
|
||||
":doc:`hipBLASLt <hipblaslt:index>`", "Library (math)", "Provides general matrix-matrix operations with a flexible API and extends functionalities beyond traditional BLAS library"
|
||||
"`hipCC <https://github.com/ROCm/HIPCC>`_ ", "Compiler", "Compiler driver utility that calls Clang or NVCC and passes the appropriate include and library options for the target compiler and HIP infrastructure"
|
||||
":doc:`hipCUB <hipcub:index>`", "Library (C++ primitive)", "Thin header-only wrapper library on top of `rocPRIM <https://rocm.docs.amd.com/projects/rocPRIM/en/latest/>`_ or CUB that allows project porting using the CUB library to the HIP layer"
|
||||
":doc:`hipFFT <hipfft:index>`", "Library (math)", "Fast Fourier transforms (FFT)-marshalling library that supports rocFFT or cuFFT backends"
|
||||
":doc:`hipfort <hipfort:index>`", "Library (math)", "Fortran interface library for accessing GPU Kernels"
|
||||
":doc:`HIPIFY <hipify:index>`", "Compiler", "Translates CUDA source code into portable HIP C++"
|
||||
":doc:`hipRAND <hiprand:index>`", "Library (math)", "Ports CUDA applications that use the cuRAND library into the HIP layer"
|
||||
":doc:`hipSOLVER <hipsolver:index>`", "Library (math)", "An LAPACK-marshalling library that supports `rocSOLVER <https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/>`_ and cuSOLVER backends"
|
||||
":doc:`hipSPARSE <hipsparse:index>`", "Library (math)", "SPARSE-marshalling library that supports `rocSPARSE <https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/>`_ and cuSPARSE backends"
|
||||
":doc:`hipSPARSELt <hipsparselt:index>`", "Library (math)", "SPARSE-marshalling library with multiple supported backends"
|
||||
":doc:`hipTensor <hiptensor:index>`", "Library (C++ primitive)", "AMD's C++ library for accelerating tensor primitives based on the composable kernel library"
|
||||
"`LLVM (amdclang) <https://github.com/ROCm/llvm-project>`_ ", "Compiler", "Toolkit for the construction of highly optimized compilers, optimizers, and run-time environments"
|
||||
":doc:`MIGraphX <amdmigraphx:index>`", "Library (AI/ML)", "Graph inference engine that accelerates machine learning model inference"
|
||||
":doc:`MIOpen <miopen:index>`", "Library (AI/ML)", "An open source deep-learning library"
|
||||
":doc:`MIVisionX <mivisionx:doxygen/html/index>`", "Library (AI/ML)", "Set of comprehensive computer vision and machine learning libraries, utilities, and applications"
|
||||
"`Radeon Compute Profiler (RCP) <https://github.com/GPUOpen-Tools/radeon_compute_profiler/>`_ ", "Tool", "Performance analysis tool that gathers data from the API run-time and GPU for OpenCL and ROCm/HSA applications"
|
||||
":doc:`RCCL <rccl:index>`", "Library (communication)", "Standalone library that provides multi-GPU and multi-node collective communication primitives"
|
||||
":doc:`rocAL <rocal:index>`", "Library (AI/ML)", "An augmentation library designed to decode and process images and videos"
|
||||
":doc:`rocALUTION <rocalution:index>`", "Library (math)", "Sparse linear algebra library for exploring fine-grained parallelism on ROCm runtime and toolchains"
|
||||
"`RocBandwidthTest <https://github.com/ROCm/rocm_bandwidth_test/>`_ ", "Tool", "Captures the performance characteristics of buffer copying and kernel read/write operations"
|
||||
":doc:`rocBLAS <rocblas:index>`", "Library (math)", "BLAS implementation (in the HIP programming language) on the ROCm runtime and toolchains"
|
||||
":doc:`rocFFT <rocfft:index>`", "Library (math)", "Software library for computing fast Fourier transforms (FFTs) written in HIP"
|
||||
":doc:`ROCmCC <./reference/rocmcc>`", "Tool", "Clang/LLVM-based compiler"
|
||||
"`ROCm CMake <https://github.com/ROCm/rocm-cmake>`_ ", "Tool", "Collection of CMake modules for common build and development tasks"
|
||||
":doc:`ROCm Data Center Tool <rdc:index>`", "Tool", "Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments"
|
||||
"`ROCm Debug Agent (ROCdebug-agent) <https://github.com/ROCm/rocr_debug_agent/>`_ ", "Tool", "Prints the state of all AMD GPU wavefronts that caused a queue error by sending a SIGQUIT signal to the process while the program is running"
|
||||
":doc:`ROCm debugger (ROCgdb) <rocgdb:index>`", "Tool", "Source-level debugger for Linux, based on the GNU Debugger (GDB)"
|
||||
":doc:`ROCdbgapi <rocdbgapi:index>`", "Tool", "ROCm debugger API library"
|
||||
"`rocminfo <https://github.com/ROCm/rocminfo/>`_ ", "Tool", "Reports system information"
|
||||
":doc:`ROCm Performance Primitives (RPP) <rpp:index>`", "Library (AI/ML)", "Comprehensive high-performance computer vision library for AMD processors with HIP/OpenCL/CPU back-ends"
|
||||
":doc:`ROCm SMI <rocm_smi_lib:index>`", "Tool", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"
|
||||
":doc:`ROCm Validation Suite <rocmvalidationsuite:index>`", "Tool", "Detects and troubleshoots common problems affecting AMD GPUs running in a high-performance computing environment"
|
||||
":doc:`rocPRIM <rocprim:index>`", "Library (C++ primitive)", "Header-only library for HIP parallel primitives"
|
||||
":doc:`ROCProfiler <rocprofiler:profiler_home_page>`", "Tool", "Profiling tool for HIP applications"
|
||||
":doc:`rocRAND <rocrand:index>`", "Library (math)", "Provides functions that generate pseudorandom and quasirandom numbers"
|
||||
"`ROCR-Runtime <https://github.com/ROCm/ROCR-Runtime/>`_ ", "Runtime", "User-mode API interfaces and libraries necessary for host applications to launch compute kernels on available HSA ROCm kernel agents"
|
||||
":doc:`rocSOLVER <rocsolver:index>`", "Library (math)", "An implementation of LAPACK routines on ROCm software, implemented in the HIP programming language and optimized for AMD's latest discrete GPUs"
|
||||
":doc:`rocSPARSE <rocsparse:index>`", "Library (math)", "Exposes a common interface that provides BLAS for sparse computation implemented on ROCm runtime and toolchains (in the HIP programming language)"
|
||||
":doc:`rocThrust <rocthrust:index>`", "Library (C++ primitive)", "Parallel algorithm library"
|
||||
":doc:`ROCTracer <roctracer:index>`", "Tool", "Intercepts runtime API calls and traces asynchronous activity"
|
||||
":doc:`rocWMMA <rocwmma:index>`", "Library (math)", "C++ library for accelerating mixed-precision matrix multiply-accumulate (MMA) operations"
|
||||
"`Tensile <https://github.com/ROCm/Tensile>`_ ", "Library (math)", "Creates benchmark-driven backend libraries for GEMMs, GEMM-like problems, and general N-dimensional tensor contractions"
|
||||
":doc:`TransferBench <transferbench:index>`", "Tool", "Utility to benchmark simultaneous transfers between user-specified devices (CPUs/GPUs)"
|
||||
Machine Learning & Computer Vision
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Component", "Description"
|
||||
|
||||
":doc:`Composable Kernel <composable_kernel:index>`", "Provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures"
|
||||
":doc:`MIGraphX <amdmigraphx:index>`", "Graph inference engine that accelerates machine learning model inference"
|
||||
":doc:`MIOpen <miopen:index>`", "An open source deep-learning library"
|
||||
":doc:`MIVisionX <mivisionx:doxygen/html/index>`", "Set of comprehensive computer vision and machine learning libraries, utilities, and applications"
|
||||
":doc:`rocAL <rocal:index>`", "An augmentation library designed to decode and process images and videos"
|
||||
":doc:`rocDecode <rocdecode:index>`", "High-performance SDK for access to video decoding features on AMD GPUs"
|
||||
":doc:`ROCm Performance Primitives (RPP) <rpp:index>`", "Comprehensive high-performance computer vision library for AMD processors with HIP/OpenCL/CPU back-ends"
|
||||
|
||||
Communication
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Component", "Description"
|
||||
|
||||
":doc:`RCCL <rccl:index>`", "Standalone library that provides multi-GPU and multi-node collective communication primitives"
|
||||
|
||||
Math
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Component", "Description"
|
||||
|
||||
"`half <https://github.com/ROCm/half/>`_", "C++ header-only library that provides an IEEE 754 conformant, 16-bit half-precision floating-point type, along with corresponding arithmetic operators, type conversions, and common mathematical functions"
|
||||
":doc:`hipBLAS <hipblas:index>`", "BLAS-marshaling library that supports :doc:`rocBLAS <rocblas:index>` and cuBLAS backends"
|
||||
":doc:`hipBLASLt <hipblaslt:index>`", "Provides general matrix-matrix operations with a flexible API and extends functionalities beyond traditional BLAS library"
|
||||
":doc:`hipFFT <hipfft:index>`", "Fast Fourier transforms (FFT)-marshalling library that supports rocFFT or cuFFT backends"
|
||||
":doc:`hipfort <hipfort:index>`", "Fortran interface library for accessing GPU Kernels"
|
||||
":doc:`hipRAND <hiprand:index>`", "Ports CUDA applications that use the cuRAND library into the HIP layer"
|
||||
":doc:`hipSOLVER <hipsolver:index>`", "An LAPACK-marshalling library that supports :doc:`rocSOLVER <rocsolver:index>` and cuSOLVER backends"
|
||||
":doc:`hipSPARSE <hipsparse:index>`", "SPARSE-marshalling library that supports :doc:`rocSPARSE <rocsparse:index>` and cuSPARSE backends"
|
||||
":doc:`hipSPARSELt <hipsparselt:index>`", "SPARSE-marshalling library with multiple supported backends"
|
||||
":doc:`rocALUTION <rocalution:index>`", "Sparse linear algebra library for exploring fine-grained parallelism on ROCm runtime and toolchains"
|
||||
":doc:`rocBLAS <rocblas:index>`", "BLAS implementation (in the HIP programming language) on the ROCm runtime and toolchains"
|
||||
":doc:`rocFFT <rocfft:index>`", "Software library for computing fast Fourier transforms (FFTs) written in HIP"
|
||||
":doc:`rocRAND <rocrand:index>`", "Provides functions that generate pseudorandom and quasirandom numbers"
|
||||
":doc:`rocSOLVER <rocsolver:index>`", "An implementation of LAPACK routines on ROCm software, implemented in the HIP programming language and optimized for AMD's latest discrete GPUs"
|
||||
":doc:`rocSPARSE <rocsparse:index>`", "Exposes a common interface that provides BLAS for sparse computation implemented on ROCm runtime and toolchains (in the HIP programming language)"
|
||||
":doc:`rocWMMA <rocwmma:index>`", "C++ library for accelerating mixed-precision matrix multiply-accumulate (MMA) operations"
|
||||
"`Tensile <https://github.com/ROCm/Tensile>`_ ", "Creates benchmark-driven backend libraries for GEMMs, GEMM-like problems, and general N-dimensional tensor contractions"
|
||||
|
||||
Primitives
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:header: "Component", "Description"
|
||||
|
||||
":doc:`hipCUB <hipcub:index>`", "Thin header-only wrapper library on top of :doc:`rocPRIM <rocprim:index>` or CUB that allows project porting using the CUB library to the HIP layer"
|
||||
":doc:`hipTensor <hiptensor:index>`", "AMD's C++ library for accelerating tensor primitives based on the composable kernel library"
|
||||
":doc:`rocPRIM <rocprim:index>`", "Header-only library for HIP parallel primitives"
|
||||
":doc:`rocThrust <rocthrust:index>`", "Parallel algorithm library"
|
||||
|
||||
Tools
|
||||
-----------------------------------------------
|
||||
|
||||
.. csv-table::
|
||||
:header: "Component", "Description"
|
||||
|
||||
":doc:`AMD SMI <amdsmi:index>`", "C library for Linux that provides a user space interface for applications to monitor and control AMD devices"
|
||||
":doc:`HIPIFY <hipify:index>`", "Translates CUDA source code into portable HIP C++"
|
||||
"`Radeon Compute Profiler (RCP) <https://github.com/GPUOpen-Tools/radeon_compute_profiler/>`_ ", "Performance analysis tool that gathers data from the API runtime and GPU for OpenCL and ROCm/HSA applications"
|
||||
"`RocBandwidthTest <https://github.com/ROCm/rocm_bandwidth_test/>`_ ", "Captures the performance characteristics of buffer copying and kernel read/write operations"
|
||||
":doc:`ROCmCC <./reference/rocmcc>`", "Clang/LLVM-based compiler"
|
||||
"`ROCm CMake <https://github.com/ROCm/rocm-cmake>`_ ", "Collection of CMake modules for common build and development tasks"
|
||||
":doc:`ROCm Data Center Tool <rdc:index>`", "Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments"
|
||||
"`ROCm Debug Agent (ROCdebug-agent) <https://github.com/ROCm/rocr_debug_agent/>`_ ", "Prints the state of all AMD GPU wavefronts that caused a queue error by sending a SIGQUIT signal to the process while the program is running"
|
||||
":doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`", "Source-level debugger for Linux, based on the GNU Debugger (GDB)"
|
||||
":doc:`ROCdbgapi <rocdbgapi:index>`", "ROCm debugger API library"
|
||||
"`rocminfo <https://github.com/ROCm/rocminfo/>`_ ", "Reports system information"
|
||||
":doc:`ROCm SMI <rocm_smi_lib:index>`", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"
|
||||
":doc:`ROCm Validation Suite <rocmvalidationsuite:index>`", "Detects and troubleshoots common problems affecting AMD GPUs running in a high-performance computing environment"
|
||||
":doc:`ROCProfiler <rocprofiler:profiler_home_page>`", "Profiling tool for HIP applications"
|
||||
":doc:`ROCTracer <roctracer:index>`", "Intercepts runtime API calls and traces asynchronous activity"
|
||||
":doc:`TransferBench <transferbench:index>`", "Utility to benchmark simultaneous transfers between user-specified devices (CPUs/GPUs)"
|
||||
|
||||
Compilers
|
||||
-----------------------------------------------
|
||||
|
||||
.. csv-table::
|
||||
:header: "Component", "Description"
|
||||
|
||||
"`AOMP <https://github.com/ROCm/aomp/>`_", "Scripted build of `LLVM <https://github.com/ROCm/llvm-project>`_ and supporting software"
|
||||
"`FLANG <https://github.com/ROCm/flang/>`_", "An out-of-tree Fortran compiler targeting LLVM"
|
||||
"`hipCC <https://github.com/ROCm/HIPCC>`_ ", "Compiler driver utility that calls Clang or NVCC and passes the appropriate include and library options for the target compiler and HIP infrastructure"
|
||||
"`LLVM (amdclang) <https://github.com/ROCm/llvm-project>`_ ", "Toolkit for the construction of highly optimized compilers, optimizers, and runtime environments"
|
||||
|
||||
Runtimes
|
||||
-----------------------------------------------
|
||||
|
||||
.. csv-table::
|
||||
:header: "Component", "Description"
|
||||
|
||||
"`AMD Common Language Runtime (CLR) <https://github.com/ROCm/clr>`_", "Contains source code for AMD's common language runtimes: :doc:`HIP <hip:index>` and OpenCL"
|
||||
":doc:`HIP <hip:index>`", "AMD's GPU programming language extension and the GPU runtime"
|
||||
"`ROCR-Runtime <https://github.com/ROCm/ROCR-Runtime/>`_ ", "User-mode API interfaces and libraries necessary for host applications to launch compute kernels on available HSA ROCm kernel agents"
|
||||
|
||||
@@ -13,21 +13,23 @@
|
||||
|
||||
## Updating the changelog
|
||||
|
||||
> IMPORTANT: It is key to update the template Markdown files in `tools/autotag/templates/rocm_changes` (eg: `5.6.0.md`) and not the `CHANGELOG.md` itself to ensure that updates are not overwritten by the autotag script. The template should only have content from changelogs that are not included by the script to avoid duplicating data.
|
||||
|
||||
* Add or update the release specific notes in `tools/autotag/templates/rocm_changes`
|
||||
* Ensure the all the repositories have their release specific branch with the updated changelogs.
|
||||
* Ensure all the repositories have their release-specific branch with the updated changelogs
|
||||
* Run this for 5.6.0 (change for whatever version you require)
|
||||
* `GITHUB_ACCESS_TOKEN=my_token_here`
|
||||
|
||||
To generate the changelog from 5.0.0 up to and including 6.0.1:
|
||||
To generate the changelog from 5.0.0 up to and including 6.1.0:
|
||||
|
||||
```sh
|
||||
python3 tag_script.py -t $GITHUB_ACCESS_TOKEN --no-release --no-pulls --do-previous --compile_file ../../CHANGELOG.md --branch release/rocm-rel-6.0 6.0.1
|
||||
python3 tag_script.py -t $GITHUB_ACCESS_TOKEN --no-release --no-pulls --do-previous --compile_file ../../CHANGELOG.md --branch release/rocm-rel-6.1 6.1.0
|
||||
```
|
||||
|
||||
To generate the changelog only for 6.0.1:
|
||||
To generate the changelog only for 6.1.0:
|
||||
|
||||
```sh
|
||||
python3 tag_script.py -t $GITHUB_ACCESS_TOKEN --no-release --no-pulls --compile_file ../../CHANGELOG.md --branch release/rocm-rel-6.0 6.0.1
|
||||
python3 tag_script.py -t $GITHUB_ACCESS_TOKEN --no-release --no-pulls --compile_file ../../CHANGELOG.md --branch release/rocm-rel-6.1 6.1.0
|
||||
```
|
||||
|
||||
### Notes
|
||||
|
||||
@@ -84,11 +84,9 @@ class TaggingArgs(argparse.Namespace):
|
||||
"MIOpenGEMM",
|
||||
"MIOpenKernels",
|
||||
"MIOpenTensile",
|
||||
"ROCmValidationSuite",
|
||||
"half",
|
||||
"hipFORT",
|
||||
"rccl-rdma-sharp-plugins",
|
||||
"MLSEQA_TestRepo",
|
||||
"half",
|
||||
"rccl-rdma-sharp-plugins",
|
||||
]
|
||||
return defaults + (self._exclude if self._exclude is not None else [])
|
||||
|
||||
@@ -236,10 +234,16 @@ def run_tagging():
|
||||
|
||||
# Find all the math libraries and their remotes.
|
||||
included_names = [
|
||||
"rocm-cmake",
|
||||
"MIOpen",
|
||||
"AMDMIGraphX",
|
||||
"rocprofiler"
|
||||
"HIPIFY", #
|
||||
"MIOpen",
|
||||
"MIVisionX",
|
||||
"ROCmValidationSuite", #
|
||||
"composable_kernel",
|
||||
"hipfort",
|
||||
"rocDecode",
|
||||
"rocm-cmake",
|
||||
"rpp",
|
||||
]
|
||||
included_groups = [
|
||||
"mathlibs"
|
||||
|
||||
@@ -27,12 +27,12 @@ This page contains the release notes for AMD ROCm Software.
|
||||
{%- set rocm_changes = "./rocm_changes/" ~ version ~ ".md" %}
|
||||
{% include rocm_changes ignore missing %}
|
||||
|
||||
### Library changes in ROCM {{version}}
|
||||
### Library changes in ROCm {{version}}
|
||||
|
||||
| Library | Version |
|
||||
|---------|---------|
|
||||
{%- for lib_name, lib in release.libraries | dictsort %}
|
||||
{%- if rocm_ver_by_lib_ver[lib_name][lib.lib_version] == version and lib.lib_version %}
|
||||
{%- if rocm_ver_by_lib_ver[lib_name][lib.lib_version] == version and (prev_lib_ver[lib_name][lib.lib_version] | default([]) | length > 0) and lib.lib_version %}
|
||||
| {{ lib_name }} | {{prev_lib_ver[lib_name][lib.lib_version]}} ⇒ [{{ lib.lib_version }}]({{ lib.release_url }}) |
|
||||
{%- elif lib.lib_version %}
|
||||
| {{ lib_name }} | [{{ lib.lib_version }}]({{ lib.release_url }}) |
|
||||
@@ -42,7 +42,7 @@ This page contains the release notes for AMD ROCm Software.
|
||||
{%- for lib_name, lib in release.libraries | dictsort %}
|
||||
{%- if rocm_ver_by_lib_ver[lib_name][lib.lib_version] == version and lib.lib_version%}
|
||||
|
||||
#### {{lib_name}} {{lib.lib_version}}
|
||||
#### {{lib_name}}
|
||||
|
||||
{{lib.message}}
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ kernels found by setting the environment variable ROCBLAS_TENSILE_GEMM_OVERRIDE_
|
||||
points to the stored file.
|
||||
|
||||
For more details, refer to the
|
||||
[rocBLAS Programmer's Guide](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/Programmers_Guide.html#rocblas-gemm-tune).
|
||||
[rocBLAS Programmer's Guide](https://rocm.docs.amd.com/projects/rocBLAS/en/docs-5.7.1/Programmers_Guide.html).
|
||||
|
||||
#### HIP 5.7.1 (for ROCm 5.7.1)
|
||||
|
||||
|
||||
319
tools/autotag/templates/rocm_changes/6.1.0.md
Normal file
319
tools/autotag/templates/rocm_changes/6.1.0.md
Normal file
@@ -0,0 +1,319 @@
|
||||
|
||||
The ROCm™ 6.1 release consists of new features and fixes to improve the stability and
|
||||
performance of AMD Instinct™ MI300 GPU applications. Notably, we've added:
|
||||
|
||||
* Full support for Ubuntu 22.04.4.
|
||||
|
||||
* **rocDecode**, a new ROCm component that provides high-performance video decode support for
|
||||
AMD GPUs. With rocDecode, you can decode compressed video streams while keeping the resulting
|
||||
YUV frames in video memory. With decoded frames in video memory, you can run video
|
||||
post-processing using ROCm HIP, avoiding unnecessary data copies via the PCIe bus.
|
||||
|
||||
To learn more, refer to the rocDecode
|
||||
[documentation](https://rocm.docs.amd.com/projects/rocDecode/en/latest/).
|
||||
|
||||
### OS and GPU support changes
|
||||
|
||||
ROCm 6.1 adds the following operating system support:
|
||||
|
||||
* MI300A: Ubuntu 22.04.4 and RHEL 9.3
|
||||
* MI300X: Ubuntu 22.04.4
|
||||
|
||||
Future releases will add additional operating systems to match our general offering. For older
|
||||
generations of supported AMD Instinct products, we’ve added Ubuntu 22.04.4 support.
|
||||
|
||||
```{tip}
|
||||
To view the complete list of supported GPUs and operating systems, refer to the system requirements
|
||||
page for
|
||||
[Linux](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html)
|
||||
and
|
||||
[Windows](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html).
|
||||
```
|
||||
|
||||
### Installation packages
|
||||
|
||||
This release includes a new set of packages for every module (all libraries and binaries default to
|
||||
`DT_RPATH`). Package names have the suffix `rpath`; for example, the `rpath` variant of `rocminfo` is
|
||||
`rocminfo-rpath`.
|
||||
|
||||
```{warning}
|
||||
The new `rpath` packages will conflict with the default packages; they are meant to be used only in
|
||||
environments where legacy `DT_RPATH` is the preferred form of linking (instead of `DT_RUNPATH`). We
|
||||
do **not** recommend trying to install both sets of packages.
|
||||
```
|
||||
|
||||
#### AMD SMI
|
||||
|
||||
AMD SMI for ROCm 6.1.0
|
||||
|
||||
##### Additions
|
||||
|
||||
* **Added Monitor command**. This provides users the ability to customize GPU metrics to capture,
|
||||
collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi`
|
||||
(no argument), and allows you to customize per the data that are helpful for your use-case.
|
||||
|
||||
* **Integrated ESMI Tool**. You can get CPU metrics and telemetry through our API and CLI tools.
|
||||
You can get this information using the `amd-smi static` and `amd-smi metric` commands. This is only
|
||||
available for limited target processors. As of ROCm 6.0.2, this is listed as:
|
||||
* AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh
|
||||
* AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0h-AFh
|
||||
|
||||
* **Added support for new metrics: VCN, JPEG engines, and PCIe errors**. Using the AMD SMI
|
||||
tool, you can retrieve VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or
|
||||
`amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs
|
||||
(with 4 separate VCN engine activities); for older ASICs, `MM_ACTIVITY` will update with UVD/VCN engine activity
|
||||
(average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where the device can support up
|
||||
to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new
|
||||
fields.
|
||||
|
||||
* **Added AMDSMI Tool version**. AMD SMI will report *three versions*: AMDSMI Tool, AMDSMI
|
||||
Library version, and ROCm version.
|
||||
|
||||
The AMDSMI Tool version is the CLI/tool version number with commit ID appended after the `+` sign.
|
||||
The AMDSMI Library version is the library package version number. The ROCm version is the system's
|
||||
installed ROCm version; if ROCm is not installed, it reports N/A.
|
||||
|
||||
* **Added XGMI table**. Displays XGMI information for AMD GPU devices in a table format. This is
|
||||
only available on supported ASICs (e.g., MI300). Here, users can view read/write data XGMI or PCIe
|
||||
accumulated data transfer size (in KiloBytes).
|
||||
|
||||
* **Added units of measure to JSON output**. We added unit of measure to JSON/CSV
|
||||
`amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands.
|
||||
|
||||
##### Changes
|
||||
|
||||
* **Topology is now left-aligned with the BDF of each device listed in each individual table's rows/columns**.
|
||||
We provided each device's BDF for every table's row/columns, then left-aligned data. We want AMD
|
||||
SMI Tool output to be easy to understand and digest. Having to scroll up to find this information
|
||||
made it difficult to follow, especially for devices that have many devices associated with one ASIC.
|
||||
|
||||
##### Fixes
|
||||
|
||||
* **Fix for RDNA3/RDNA2/MI100 'amdsmi_get_gpu_pci_bandwidth()' in 'frequencies_read' tests**.
|
||||
For devices that do not report (e.g., RDNA3/RDNA2/MI100), we have added checks to confirm that
|
||||
these devices return `AMDSMI_STATUS_NOT_SUPPORTED`. Otherwise, tests now display a return
|
||||
string.
|
||||
|
||||
* **Fix for devices that have an older PyYAML installed**. For platforms that are identified as having
|
||||
an older PyYAML version or pip, we now manually update both pip and PyYAML as needed. This
|
||||
fix impacts the following CLI commands:
|
||||
* `amd-smi list`
|
||||
* `amd-smi static`
|
||||
* `amd-smi firmware`
|
||||
* `amd-smi metric`
|
||||
* `amd-smi topology`
|
||||
|
||||
* **Fix for crash when user is not a member of video/render groups**. AMD SMI now uses the
|
||||
same mutex handler for devices as ROCm SMI. This helps avoid crashes when DRM/device data are
|
||||
inaccessible to the logged-in user.
|
||||
|
||||
##### Known issues
|
||||
|
||||
* There is an `AttributeError` while running `amd-smi process --csv`
|
||||
* GPU reset results in an "*Unable to reset non-amd GPU*" error
|
||||
* Bad pages results in a "ValueError: NULL pointer access" error
|
||||
* Some RDNA3 cards may enumerate to `Slot type = UNKNOWN`
|
||||
|
||||
#### HIP
|
||||
|
||||
HIP 6.1 for ROCm 6.1
|
||||
|
||||
##### Additions
|
||||
|
||||
* New environment variable, `HIP_LAUNCH_BLOCKING`, which is used for serialization on kernel
|
||||
execution.
|
||||
* The default value is 0 (disable): kernel runs normally, as defined in the queue
|
||||
* When set as 1 (enable): HIP runtime serializes the kernel enqueue and behaves the same as
|
||||
`AMD_SERIALIZE_KERNEL`
|
||||
* Added HIPRTC support for hip headers `driver_types`, `math_functions`, `library_types`,
|
||||
`hip_math_constants`, `channel_descriptor`, `device_functions`, `hip_complex`,
|
||||
`surface_types`, `texture_types`
|
||||
|
||||
##### Changes
|
||||
|
||||
* HIPRTC now assumes WGP mode for gfx10+. You can enable CU mode by passing `-mcumode` to the
|
||||
compile options from `hiprtcCompileProgram`.
|
||||
|
||||
##### Fixes
|
||||
|
||||
* HIP complex vector type multiplication and division operations.
|
||||
On an AMD platform, some duplicated complex operators are removed to avoid compilation failures.
|
||||
In HIP, `hipFloatComplex` and `hipDoubleComplex` are defined as complex datatypes:
|
||||
* `typedef float2 hipFloatComplex`
|
||||
* `typedef double2 hipDoubleComplex`
|
||||
|
||||
Any application that uses complex multiplication and division operations must replace `*` and `/`
|
||||
operators with the following:
|
||||
* `hipCmulf()` and `hipCdivf()` for `hipFloatComplex`
|
||||
* `hipCmul()` and `hipCdiv()` for `hipDoubleComplex`
|
||||
|
||||
Note that these complex operations are equivalent to corresponding types/functions on an NVIDIA
|
||||
platform.
|
||||
|
||||
#### HIPIFY
|
||||
|
||||
HIPIFY for ROCm 6.1.0
|
||||
|
||||
##### Additions
|
||||
|
||||
* CUDA 12.3.2 support
|
||||
* cuDNN 8.9.7 support
|
||||
* LLVM 17.0.6 support
|
||||
* Full `hipSOLVER` support
|
||||
* Full `rocSPARSE` support
|
||||
* New option: `--amap`, which will hipify as much as possible, ignoring `--default-preprocessor`
|
||||
behavior
|
||||
|
||||
##### Fixes
|
||||
|
||||
* Code blocks skipped by the preprocessor are no longer hipified under the `--default-preprocessor`
|
||||
option
|
||||
|
||||
#### ROCm Compiler
|
||||
|
||||
ROCm Compiler for ROCm 6.1.0
|
||||
|
||||
##### Additions
|
||||
|
||||
* Compiler now generates `.uniform_work_group_size` and records it in the metadata. It indicates if the
|
||||
kernel requires that each dimension of global size is a multiple of the corresponding dimension of
|
||||
work-group size. A value of 1 is true, and 0 is false. This metadata is only provided when the value is
|
||||
1.
|
||||
* Added the `rocm-llvm-docs` package.
|
||||
* Added ROCm Device-Libs, ROCm Compiler Support, and hipCC within the `llvm-project/amd`
|
||||
subdirectory to AMD’s fork of the LLVM project.
|
||||
* Added support for C++ Parallel Algorithm Offload via HIP (HIPSTDPAR), which allows parallel
|
||||
algorithms to run on the GPU.
|
||||
|
||||
##### Changes
|
||||
|
||||
* `rocm-clang-ocl` is now an optional package and will require manual installation.
|
||||
|
||||
##### Deprecations
|
||||
|
||||
* hipCC adds `-mllvm -amdgpu-early-inline-all=true` and `-mllvm -amdgpu-function-calls=false` by
|
||||
default to compiler invocations. These flags will be removed from hipCC in a future ROCm release.
|
||||
|
||||
##### Fixes
|
||||
|
||||
AddressSanitizer (ASan):
|
||||
* Added `sanitized_padded_global` LLVM IR attribute to identify sanitizer instrumented globals.
|
||||
* For ASan instrumented global, emit two symbols: one with actual size and the other with
|
||||
instrumented size.
|
||||
|
||||
[On GitHub](https://github.com/ROCm/ROCm/issues/2551)
|
||||
|
||||
##### Known issues
|
||||
|
||||
* Due to an issue within the `amd-llvm` compiler shipping with ROCm 6.1, HIPSTDPAR's interposition mode, which is enabled by `--hipstdpar-interpose-alloc`, is currently broken.
|
||||
|
||||
The temporary workaround is to use the upstream LLVM 18 (or newer) compiler. This issue will be addressed in a future ROCm release.
|
||||
|
||||
#### ROCm Data Center (RDC)
|
||||
|
||||
RDC for ROCm 6.1.0
|
||||
|
||||
##### Changes
|
||||
|
||||
* Added `--address` flag to rdcd
|
||||
* Upgraded from C++11 to C++17
|
||||
* Upgraded gRPC
|
||||
|
||||
#### ROCDebugger (ROCgdb)
|
||||
|
||||
ROCgdb for ROCm 6.1.0
|
||||
|
||||
##### Fixes
|
||||
|
||||
Previously, ROCDebugger encountered hangs and crashes when stepping over the `s_endpgm`
|
||||
instruction at the end of a HIP kernel entry function, which caused the stepped wave to exit. This issue
|
||||
is fixed in the ROCm 6.1 release. You can now step over the last instruction of any HIP kernel without
|
||||
debugger hangs or crashes.
|
||||
|
||||
#### ROCm SMI
|
||||
|
||||
ROCm SMI for ROCm 6.1.0
|
||||
|
||||
##### Additions
|
||||
|
||||
* **Added support to set max/min clock level for sclk ('RSMI_CLK_TYPE_SYS') or mclk ('RSMI_CLK_TYPE_MEM')**.
|
||||
You can now set a maximum or minimum `sclk` or `mclk` value through the
|
||||
`rsmi_dev_clk_extremum_set()` API, provided the ASIC supports it. Alternatively, you can use our Python CLI
|
||||
tool (`rocm-smi --setextremum max sclk 1500`).
|
||||
|
||||
* **Added `rsmi_dev_target_graphics_version_get()`**. You can now query through ROCm SMI API
|
||||
(`rsmi_dev_target_graphics_version_get()`) to retrieve the target graphics version for a GPU device.
|
||||
Currently, this output is not supplied through our ROCm SMI CLI.
|
||||
|
||||
##### Changes
|
||||
|
||||
* **Removed non-unified API headers: Individual GPU metric APIs are no longer supported**.
|
||||
The individual metric APIs (`rsmi_dev_metrics_*`) were removed in order to keep updates easier for
|
||||
new GPU metric support. By providing a simple API (`rsmi_dev_gpu_metrics_info_get()`) with its
|
||||
reported device metrics, it is worth noting there is a risk for ABI breakage using
|
||||
`rsmi_dev_gpu_metrics_info_get()`. It is vital to understand that ABI breaks are necessary (in some
|
||||
cases) in order to support newer ASICs and metrics for our customers. We will continue to support
|
||||
`rsmi_dev_gpu_metrics_info_get()` with these considerations and limitations in mind.
|
||||
|
||||
* **Deprecated 'rsmi_dev_power_ave_get()'; use the newer API, 'rsmi_dev_power_get()'**. As
|
||||
outlined in the change for 6.0.0 (*Added a generic power API: rsmi_dev_power_get*), `rsmi_dev_power_ave_get()` is now
|
||||
deprecated. You must update your ROCm SMI API calls accordingly.
|
||||
|
||||
##### Fixes
|
||||
|
||||
* Fixed `--showpids` reporting `[PID] [PROCESS NAME] 1 UNKNOWN UNKNOWN UNKNOWN`.
|
||||
Output was failing because `cu_occupancy debugfs` method is not provided on some graphics cards
|
||||
by design. `get_compute_process_info_by_pid` was updated to reflect this and returns with the output
|
||||
needed by the CLI.
|
||||
|
||||
* Fixed `rocm-smi --showpower` output, which was inconsistent on some RDNA3 devices.
|
||||
We updated this to use `rsmi_dev_power_get()` within the CLI to provide a consistent device power
|
||||
output. This was caused by using the now-deprecated `rsmi_dev_power_ave_get()` API.
|
||||
|
||||
* Fixed `rocm-smi --setcomputepartition` and `rocm-smi --resetcomputepartition` to notate if device is
|
||||
`EBUSY`
|
||||
|
||||
* Fixed `rocm-smi --setmemorypartition` and `rocm-smi --resetmemorypartition` read only SYSFS to
|
||||
return `RSMI_STATUS_NOT_SUPPORTED`
|
||||
The `rsmi_dev_memory_partition_set` API is updated to handle the read-only SYSFS check.
|
||||
Corresponding tests and CLI (`rocm-smi --setmemorypartition` and
|
||||
`rocm-smi --resetmemorypartition`) calls were updated accordingly.
|
||||
|
||||
* Fixed `rocm-smi --showclkvolt` and `rocm-smi --showvc`, which were displaying 0 for overdrive and
|
||||
that the voltage curve is not supported.
|
||||
|
||||
#### ROCProfiler
|
||||
|
||||
ROCProfiler for ROCm 6.1.0
|
||||
|
||||
##### Fixes
|
||||
|
||||
* Fixed ROCprofiler to match versioning changes in HIP Runtime
|
||||
* Fixed plugins race condition
|
||||
* Updated metrics to MI300
|
||||
|
||||
#### ROCm Validation Suite
|
||||
|
||||
##### Known issue
|
||||
|
||||
* In a future release, the ROCm Validation Suite P2P Benchmark and Qualification Tool (PBQT) tests will be optimized to meet the target bandwidth requirements for MI300X.
|
||||
|
||||
[On GitHub](https://github.com/ROCm/ROCm/issues/3027)
|
||||
|
||||
#### MI200 SR-IOV
|
||||
|
||||
##### Known issue
|
||||
|
||||
* Multimedia applications may encounter compilation errors in the MI200 Single Root Input/Output Virtualization (SR-IOV) environment. This is because MI200 SR-IOV does not currently support multimedia applications.
|
||||
|
||||
[On GitHub](https://github.com/ROCm/ROCm/issues/3028)
|
||||
|
||||
### AMD MI300A RAS
|
||||
|
||||
#### Fixed defect
|
||||
|
||||
##### GFX correctable and uncorrectable error inject failures
|
||||
|
||||
* Previously, the AMD CPU Reliability, Availability, and Serviceability (RAS) installation encountered correctable and uncorrectable failures while injecting an error.
|
||||
|
||||
This issue is resolved in the ROCm 6.1 release, and users will no longer encounter the GFX correctable error (CE) and uncorrectable error (UE) failures.
|
||||
@@ -1,2 +1,2 @@
|
||||
from .defaults import TEMPLATES, PROCESSORS
|
||||
from . import mivisionx
|
||||
from .custom_templates import hipfort, mivisionx, rpp, rvs
|
||||
|
||||
0
tools/autotag/util/custom_templates/__init__.py
Normal file
0
tools/autotag/util/custom_templates/__init__.py
Normal file
41
tools/autotag/util/custom_templates/ck.py
Normal file
41
tools/autotag/util/custom_templates/ck.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import re
|
||||
|
||||
from util.release_data import ReleaseLib
|
||||
from util.defaults import TEMPLATES, PROCESSORS
|
||||
|
||||
# Changelog-heading template for composable_kernel (CK). Named groups:
#   lib_version  - library version; as written, the optional group only
#                  matches a three-component version (e.g. 1.2.3)
#   for_rocm     - the literal " for ROCm " separator, when present
#   rocm_version - a ROCm version when for_rocm matched, otherwise the rest
#                  of the heading line
#   body         - the following text, up to the next line starting with "## "
# NOTE(review): the literal space before "CK" is required even when the
# optional "(Unreleased)" marker is absent, so a plain "## CK ..." heading
# (single space) does not match - confirm this is intended.
TEMPLATES['composable_kernel'] = (
    (
        r"## (\(Unreleased\))? CK (?P<lib_version>\d+\.\d+(?:\.\d+))?"
        r"(?P<for_rocm> for ROCm )?"
        r"(?P<rocm_version>(?(for_rocm)\d+\.\d+(?:\.\d+)?|.*))?"
        r"\n"
        r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
    )
)
|
||||
|
||||
|
||||
def composable_kernel_processor(data: ReleaseLib, template: str, _, __) -> bool:
    """Fill in release data for composable_kernel from its CHANGELOG.md.

    Fetches ``CHANGELOG.md`` at ``data.commit``, searches it with
    ``template`` and stores the tag message, library version, release notes
    and per-heading changes on ``data``.

    Args:
        data: Release record. ``repo``, ``commit`` and ``full_version`` are
            read; ``message``, ``lib_version``, ``notes`` and
            ``data.changes`` are written.
        template: Regular expression providing the ``lib_version`` and
            ``body`` named groups.
        _: Unused.
        __: Unused.

    Returns:
        True when the changelog matched ``template``. False when it did not;
        in that case ``data`` is left untouched (previously this raised
        ``TypeError`` from subscripting ``None``).
    """
    changelog_file = data.repo.get_contents("CHANGELOG.md", data.commit)
    changelog = changelog_file.decoded_content.decode()

    release_match = re.compile(template).search(changelog)
    if release_match is None:
        # search() returns None when no release heading matches; report the
        # failure to the caller instead of crashing on the subscript below.
        return False

    data.message = f"composable_kernel for ROCm {data.full_version}"
    data.lib_version = release_match["lib_version"]
    data.notes = f"{release_match['body']}"

    # Split the notes into sections: each Markdown heading becomes a key and
    # the non-heading lines that follow it become the value.
    change_pattern = re.compile(
        r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
        re.MULTILINE,
    )
    for change_match in change_pattern.finditer(data.notes):
        data.data.changes[change_match["type"]] = change_match["change"]

    return True
|
||||
|
||||
# Register the processor under the 'composable_kernel' key.
PROCESSORS['composable_kernel'] = composable_kernel_processor
|
||||
42
tools/autotag/util/custom_templates/hipfort.py
Normal file
42
tools/autotag/util/custom_templates/hipfort.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import re
|
||||
|
||||
from util.release_data import ReleaseLib
|
||||
from util.defaults import TEMPLATES, PROCESSORS
|
||||
|
||||
# Changelog-heading template for hipfort. Named groups:
#   lib_version  - library version; as written, the optional group only
#                  matches a three-component version (e.g. 1.2.3)
#   for_rocm     - the literal " for ROCm " separator, when present
#   rocm_version - a ROCm version when for_rocm matched, otherwise the rest
#                  of the heading line
#   body         - the following text, up to the next line starting with "## "
# A trailing " (Unreleased)" marker on the heading is accepted.
TEMPLATES['hipfort'] = (
    (
        r"## hipfort (?P<lib_version>\d+\.\d+(?:\.\d+))?"
        r"(?P<for_rocm> for ROCm )?"
        r"(?P<rocm_version>(?(for_rocm)\d+\.\d+(?:\.\d+)?|.*))?"
        r"( \(Unreleased\))?"
        r"\n"
        r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
    )
)
|
||||
|
||||
|
||||
def hipfort_processor(data: ReleaseLib, template: str, _, __) -> bool:
    """Fill in release data for hipfort from its CHANGELOG.md.

    Fetches ``CHANGELOG.md`` at ``data.commit``, searches it with
    ``template`` and stores the tag message, library version, release notes
    and per-heading changes on ``data``.

    Args:
        data: Release record. ``repo``, ``commit`` and ``full_version`` are
            read; ``message``, ``lib_version``, ``notes`` and
            ``data.changes`` are written.
        template: Regular expression providing the ``lib_version`` and
            ``body`` named groups.
        _: Unused.
        __: Unused.

    Returns:
        True when the changelog matched ``template``. False when it did not;
        in that case ``data`` is left untouched (previously this raised
        ``TypeError`` from subscripting ``None``).
    """
    changelog_file = data.repo.get_contents("CHANGELOG.md", data.commit)
    changelog = changelog_file.decoded_content.decode()

    release_match = re.compile(template).search(changelog)
    if release_match is None:
        # search() returns None when no release heading matches; report the
        # failure to the caller instead of crashing on the subscript below.
        return False

    data.message = f"hipfort for ROCm {data.full_version}"
    data.lib_version = release_match["lib_version"]
    data.notes = f"{release_match['body']}"

    # Split the notes into sections: each Markdown heading becomes a key and
    # the non-heading lines that follow it become the value.
    change_pattern = re.compile(
        r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
        re.MULTILINE,
    )
    for change_match in change_pattern.finditer(data.notes):
        data.data.changes[change_match["type"]] = change_match["change"]

    return True
|
||||
|
||||
# Register the processor under the 'hipfort' key.
PROCESSORS['hipfort'] = hipfort_processor
|
||||
24
tools/autotag/util/mivisionx.py → tools/autotag/util/custom_templates/mivisionx.py
Executable file → Normal file
24
tools/autotag/util/mivisionx.py → tools/autotag/util/custom_templates/mivisionx.py
Executable file → Normal file
@@ -13,12 +13,13 @@ TEMPLATES['MIVisionX'] = (
|
||||
)
|
||||
|
||||
|
||||
def mivisionx_processor(data: ReleaseLib, template: str, _) -> bool:
|
||||
def mivisionx_processor(data: ReleaseLib, template: str, _, __) -> bool:
|
||||
"""Processor for MIVisionX releases."""
|
||||
changelog = data.repo.get_contents("CHANGELOG.md", data.commit)
|
||||
changelog = changelog.decoded_content.decode()
|
||||
pattern = re.compile(template)
|
||||
match = pattern.search(changelog)
|
||||
lib_version = match["lib_version"]
|
||||
data.message = (
|
||||
f"MIVisionX for ROCm"
|
||||
f" {data.full_version}"
|
||||
@@ -27,19 +28,18 @@ def mivisionx_processor(data: ReleaseLib, template: str, _) -> bool:
|
||||
readme = data.repo.get_contents("README.md", data.commit)
|
||||
readme = readme.decoded_content.decode()
|
||||
dependency_map = readme[readme.find("## MIVisionX Dependency Map"):]
|
||||
data.notes = f"""
|
||||
<p align="center">
|
||||
<img width="70%"
|
||||
src="https://github.com/ROCm/MIVisionX/raw/master/docs/images/MIVisionX.png" />
|
||||
</p>
|
||||
|
||||
## Online Documentation
|
||||
[MIVisionX Documentation](https://rocm.docs.amd.com/projects/MIVisionX/en/latest/doxygen/html/index.html)
|
||||
## MIVisionX {match['lib_version']}
|
||||
{match["body"]}
|
||||
data.lib_version = lib_version
|
||||
data.notes = f"""{match["body"]}
|
||||
{dependency_map}
|
||||
"""
|
||||
|
||||
change_pattern = re.compile(
|
||||
r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
|
||||
re.RegexFlag.MULTILINE
|
||||
)
|
||||
for match in change_pattern.finditer(data.notes):
|
||||
data.data.changes[match["type"]] = match["change"]
|
||||
|
||||
return True
|
||||
|
||||
|
||||
PROCESSORS['MIVisionX'] = mivisionx_processor
|
||||
42
tools/autotag/util/custom_templates/rpp.py
Normal file
42
tools/autotag/util/custom_templates/rpp.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import re
|
||||
|
||||
from util.release_data import ReleaseLib
|
||||
from util.defaults import TEMPLATES, PROCESSORS
|
||||
|
||||
# Changelog-heading template for RPP. Named groups:
#   lib_version  - library version; as written, the optional group only
#                  matches a three-component version (e.g. 1.2.3)
#   for_rocm     - the literal " for ROCm " separator, when present
#   rocm_version - a ROCm version when for_rocm matched, otherwise the rest
#                  of the heading line
#   body         - the following text, up to the next line starting with "## "
# A trailing " (Unreleased)" marker on the heading is accepted.
TEMPLATES['rpp'] = (
    (
        r"## RPP (?P<lib_version>\d+\.\d+(?:\.\d+))?"
        r"(?P<for_rocm> for ROCm )?"
        r"(?P<rocm_version>(?(for_rocm)\d+\.\d+(?:\.\d+)?|.*))?"
        r"( \(Unreleased\))?"
        r"\n"
        r"(?P<body>(?:(?!## ).*(?:(?!\n## )\n|(?=\n## )))*)"
    )
)
|
||||
|
||||
|
||||
def rpp_processor(data: ReleaseLib, template: str, _, __) -> bool:
    """Fill in release data for rpp from its CHANGELOG.md.

    Fetches ``CHANGELOG.md`` at ``data.commit``, searches it with
    ``template`` and stores the tag message, library version, release notes
    and per-heading changes on ``data``.

    Args:
        data: Release record. ``repo``, ``commit`` and ``full_version`` are
            read; ``message``, ``lib_version``, ``notes`` and
            ``data.changes`` are written.
        template: Regular expression providing the ``lib_version`` and
            ``body`` named groups.
        _: Unused.
        __: Unused.

    Returns:
        True when the changelog matched ``template``. False when it did not;
        in that case ``data`` is left untouched (previously this raised
        ``TypeError`` from subscripting ``None``).
    """
    changelog_file = data.repo.get_contents("CHANGELOG.md", data.commit)
    changelog = changelog_file.decoded_content.decode()

    release_match = re.compile(template).search(changelog)
    if release_match is None:
        # search() returns None when no release heading matches; report the
        # failure to the caller instead of crashing on the subscript below.
        return False

    data.message = f"rpp for ROCm {data.full_version}"
    data.lib_version = release_match["lib_version"]
    data.notes = f"{release_match['body']}"

    # Split the notes into sections: each Markdown heading becomes a key and
    # the non-heading lines that follow it become the value.
    change_pattern = re.compile(
        r"^#+ +(?P<type>[^\n]+)$\n*(?P<change>(^(?!#).*\n*)*)",
        re.MULTILINE,
    )
    for change_match in change_pattern.finditer(data.notes):
        data.data.changes[change_match["type"]] = change_match["change"]

    return True
|
||||
|
||||
# Register the processor under the 'rpp' key.
PROCESSORS['rpp'] = rpp_processor
|
||||
Reference in New Issue
Block a user