Mirror of https://github.com/ROCm/ROCm.git (synced 2026-02-01 01:45:18 -05:00)

Compare commits: users/davi...hipdnn (50 commits)
| SHA1 |
|---|
| 69360270f5 |
| 2da4c460ad |
| d1165b7359 |
| f6652b4fad |
| aa864ee964 |
| 0a834eff9e |
| 380898f4a8 |
| fc6332c6b3 |
| a85735d430 |
| c6bf8d2054 |
| 76570de120 |
| fbd90eccfc |
| 0c74bc889f |
| 599328c44e |
| c311dce297 |
| 656cb08d64 |
| 8c28f9ca9f |
| b30ac2f3a2 |
| 1c19e8bd77 |
| 6261b2c421 |
| decd7e712c |
| b7dd7e24ed |
| 77cdb4eb56 |
| 8aa43d132f |
| 45bd726f55 |
| 33fbde69db |
| 7ab402a3b3 |
| 2851f89992 |
| d31ca06bea |
| cf3052ded9 |
| 7068119ae3 |
| 0bb5a15def |
| 7617a8afe1 |
| b8d7408003 |
| 24909c0400 |
| 8d076740b8 |
| 5aef686b67 |
| 5b12c9a80e |
| 61d2424ab7 |
| 2e3500a111 |
| fa4bf5e9ba |
| 2e506f1ae7 |
| 56b684fcae |
| b3e78704f5 |
| 756fad8435 |
| f84d9574a8 |
| 377d2631e3 |
| 00683dc244 |
| 535b051b8d |
| 18515bcc59 |
@@ -62,14 +62,9 @@ parameters:
- name: rocmDependencies
type: object
default:
- AMDMIGraphX
- clr
- half
- hipBLAS-common
- hipBLASLt
- llvm-project
- MIOpen
- rocBLAS
- rocDecode
- rocm-cmake
- rocminfo
@@ -82,12 +77,7 @@ parameters:
- aomp
- clr
- half
- hipBLAS-common
- hipBLASLt
- llvm-project
- MIOpen
- rocBLAS
- rocprofiler-register
- ROCR-Runtime
- roctracer
- rpp

@@ -32,7 +32,6 @@ parameters:
- name: aptPackages
type: object
default:
- cmake
- gfortran
- git
- libboost-program-options-dev
@@ -42,6 +41,7 @@ parameters:
- name: rocmDependencies
type: object
default:
- aomp
- clr
- llvm-project
- rocminfo
@@ -51,6 +51,7 @@ parameters:
- name: rocmTestDependencies
type: object
default:
- aomp
- clr
- llvm-project
- hipBLAS-common
@@ -103,6 +104,7 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -128,6 +130,7 @@ jobs:
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
-DCMAKE_Fortran_COMPILER=gfortran
-DCMAKE_BUILD_TYPE=Release
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_SAMPLES=OFF

@@ -60,6 +60,7 @@ parameters:
- rocprofiler-register
- ROCR-Runtime
- roctracer
- rocSPARSE
- name: rocmTestDependencies
type: object
default:
@@ -74,6 +75,7 @@ parameters:
- rocprofiler-register
- ROCR-Runtime
- roctracer
- rocSPARSE

- name: jobMatrix
type: object

@@ -71,6 +71,7 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
timeoutInMinutes: 120
variables:
- group: common
- template: /.azuredevops/variables-global.yml

@@ -47,8 +47,10 @@ parameters:
type: object
default:
- nanobind>=2.0.0
- numpy
- pytest
- pytest-cov
- torch
- name: rocmDependencies
type: object
default:
@@ -101,8 +103,7 @@ jobs:
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
@@ -239,7 +240,7 @@ jobs:
targetType: inline
workingDirectory: build
script: |
cmake --build . --target origami-tests origami_python -- -j$(nproc)
cmake --build . --target origami-tests _pyorigami -- -j$(nproc)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
# Run tests using CTest (discovers and runs both C++ and Python tests)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml

@@ -36,8 +36,10 @@ Andrej
Arb
Autocast
autograd
Backported
BARs
BatchNorm
BKC
BLAS
BMC
BabelStream
@@ -52,6 +54,7 @@ CDNA
CGUI
CHTML
CIFAR
CNP
CLI
CLion
CMake
@@ -95,6 +98,7 @@ Dashboarding
Dataloading
dataflows
DBRX
DCQCN
DDR
DF
DGEMM
@@ -109,8 +113,10 @@ DMA
DOMContentLoaded
DNN
DNNL
DOCA
DPM
DRI
DSCP
DW
DWORD
Dask
@@ -126,7 +132,9 @@ Deprecations
DevCap
DirectX
Disaggregated
disagg
disaggregated
disaggregation
Dockerfile
Dockerized
Doxygen
@@ -178,6 +186,8 @@ GFLOPS
GFortran
GFXIP
GGUF
GID
Gbps
Gemma
GiB
GIM
@@ -203,9 +213,11 @@ GenAI
GenZ
GitHub
Gitpod
hardcoded
HBM
HCA
HGX
HLO
HIPCC
hipDataType
HIPExtension
@@ -245,6 +257,7 @@ IOP
IOPS
IOPM
IOV
IPs
IRQ
ISA
ISV
@@ -309,6 +322,7 @@ MNIST
MPI
MPT
MSVC
MTU
mul
MVAPICH
MVFFR
@@ -331,8 +345,10 @@ MLA
MosaicML
MoEs
Mooncake
MoRI
Mpops
Multicore
multihost
Multithreaded
mx
MXFP
@@ -399,16 +415,21 @@ PEQT
PIL
PILImage
PJRT
PLDM
POR
PRNG
PRs
PSID
PTPC
PaLM
Pageable
PeerDirect
Pensando
PerfDb
Perfetto
PipelineParallel
PnP
Pollara
PowerEdge
PowerShell
Pretrained
@@ -420,6 +441,7 @@ Pytest
PyTorch
QPS
Qcycles
QoS
Qwen
RAII
RAS
@@ -453,6 +475,7 @@ RPP
RST
RW
Radeon
Redfish
RelWithDebInfo
Req
Rickle
@@ -720,6 +743,7 @@ enqueue
env
epilog
etcetera
eth
ethernet
exascale
executables
@@ -815,6 +839,7 @@ llvm
lm
localscratch
logits
loopback
lossy
macOS
matchers
@@ -840,6 +865,7 @@ nanoGPT
NCS
NOP
NVLink
netplan
num
numref
ocl
@@ -907,6 +933,7 @@ rc
rccl
rdc
rdma
reachability
reStructuredText
redirections
refactorization
@@ -976,6 +1003,7 @@ shader
sharding
sigmoid
sles
slurm
sm
smi
softmax
@@ -1027,6 +1055,7 @@ uncacheable
uncorrectable
underoptimized
unhandled
unfused
uninstallation
unmapped
unsqueeze

687 CHANGELOG.md

@@ -4,6 +4,693 @@ This page is a historical overview of changes made to ROCm components. This
consolidated changelog documents key modifications and improvements across
different versions of the ROCm software stack and its components.

## ROCm 7.2.0

See the [ROCm 7.2.0 release notes](https://rocm.docs.amd.com/en/docs-7.2.0/about/release-notes.html#rocm-7-2-0-release-notes)
for a complete overview of this release.

### **AMD SMI** (26.2.1)

#### Added

- GPU and baseboard temperature options to the `amd-smi monitor` CLI.
  - `amd-smi monitor --gpu-board-temps` for GPU board temperature sensors.
  - `amd-smi monitor --base-board-temps` for base board temperature sensors.

(amdsmi-npm-changelog)=
- New Node Power Management (NPM) APIs and CLI options for node monitoring.
  - C++ API functions:
    - `amdsmi_get_node_handle()` gets the handle for a node device.
    - `amdsmi_get_npm_info()` retrieves Node Power Management information.
  - C++ types:
    - `amdsmi_npm_status_t` indicates whether NPM is enabled or disabled.
    - `amdsmi_npm_info_t` contains the status and node-level power limit in watts.
  - Added Python API wrappers for the new node device functions.
  - Added the `amd-smi node` subcommand for NPM operations via the CLI.
  - Currently supported for `OAM_ID 0` only.

- The following C APIs are added to `amdsmi_interface.py`:
  - `amdsmi_get_cpu_handle()`
  - `amdsmi_get_esmi_err_msg()`
  - `amdsmi_get_gpu_event_notification()`
  - `amdsmi_get_processor_count_from_handles()`
  - `amdsmi_get_processor_handles_by_type()`
  - `amdsmi_gpu_validate_ras_eeprom()`
  - `amdsmi_init_gpu_event_notification()`
  - `amdsmi_set_gpu_event_notification_mask()`
  - `amdsmi_stop_gpu_event_notification()`
  - `amdsmi_get_gpu_busy_percent()`

- Additional return value to the `amdsmi_get_xgmi_plpd()` API:
  - The entry `policies` is added to the end of the dictionary to match the API definition.
  - The entry `plpds` is marked for deprecation as it has the same information as `policies`.

- PCIe levels to the `amd-smi static --bus` command.
  - The static `--bus` option has been updated to include the range of PCIe levels that you can set for a device.
  - Levels are a 2-tuple composed of the PCIe speed and bandwidth.

- `evicted_time` metric for KFD processes.
  - Time that queues are evicted on a GPU, in milliseconds.
  - Added to the CLI in `amd-smi monitor -q` and `amd-smi process`.
  - Added to the C and Python APIs: `amdsmi_get_gpu_process_list()`, `amdsmi_get_gpu_compute_process_info()`, and `amdsmi_get_gpu_compute_process_info_by_pid()`.

- New VRAM types to `amdsmi_vram_type_t`.
  - `amd-smi static --vram` and `amdsmi_get_gpu_vram_info()` now support the following types: `DDR5`, `LPDDR4`, `LPDDR5`, and `HBM3E`.

- Support for PPT1 power limit information.
  - Support has been added for querying and setting the PPT (Package Power Tracking) limits.
  - There are two PPT limits. PPT0 has a lower limit and tracks a filtered version of the input power. PPT1 has a higher limit but tracks the raw input power, to catch spikes in the raw data.
  - New API added:
    - `amdsmi_get_supported_power_cap()`: Returns the power cap types supported on the device (PPT0, PPT1), letting you know which power cap types you can get and set.
  - The original APIs remain the same but can now get and set both PPT0 and PPT1 limits (on supported hardware): `amdsmi_get_power_cap_info()` and `amdsmi_set_power_cap()` (see the Python sketch after this list).
  - See the Changed section for changes made to the `set` and `static` commands regarding support for PPT1.
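
The following is a minimal Python sketch of the power-cap workflow described above, using the `amdsmi` package. The names `amdsmi_get_power_cap_info()`, `amdsmi_set_power_cap()`, and `amdsmi_get_supported_power_cap()` come from this changelog; the exact return-value layout of `amdsmi_get_supported_power_cap()` and the units accepted by `amdsmi_set_power_cap()` are assumptions, so verify them against the AMD SMI documentation for your release.

```python
# Minimal sketch: query the supported power cap types (PPT0/PPT1) and read the
# current cap on the first GPU. The shape of the value returned by
# amdsmi_get_supported_power_cap() is an assumption.
import amdsmi

amdsmi.amdsmi_init()
try:
    gpu = amdsmi.amdsmi_get_processor_handles()[0]

    # New in 26.2.1: discover which power cap types the device supports.
    supported = amdsmi.amdsmi_get_supported_power_cap(gpu)
    print("Supported power cap types:", supported)

    # Existing API: read the current power cap information.
    cap_info = amdsmi.amdsmi_get_power_cap_info(gpu)
    print("Current power cap info:", cap_info)

    # Existing API: set a new cap (sensor index 0 assumed).
    # Units vary by release; check the values reported above first.
    # amdsmi.amdsmi_set_power_cap(gpu, 0, new_cap)
finally:
    amdsmi.amdsmi_shut_down()
```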

#### Changed

- The `amd-smi` command now shows `hsmp` rather than `amd_hsmp`.
  - The `hsmp` driver version can be shown without the `amdgpu` version using `amd-smi version -c`.

- The `amd-smi set --power-cap` command now requires specification of the power cap type.
  - The command now takes the form `amd-smi set --power-cap <power-cap-type> <new-cap>`.
  - Acceptable power cap types are "ppt0" and "ppt1".

- The `amd-smi reset --power-cap` command now attempts to reset both the `PPT0` and `PPT1` power caps to their default values. If a device only has `PPT0`, then only `PPT0` is reset.

- The `amd-smi static --limit` command has been updated to include a `PPT1` section with `PPT1` power limit information when it is available on the device.

#### Resolved Issues

- Fixed an issue where `amdsmi_get_gpu_od_volt_info()` returned a reference to a Python object. The returned dictionary now contains values in all fields.
### **Composable Kernel** (1.2.0)

#### Added

* Support for mixed precision fp8 x bf8 universal GEMM and weight preshuffle GEMM.
* Compute async pipeline in the CK Tile universal GEMM on gfx950.
* Support for B Tensor type `pk_int4_t` in the CK Tile weight preshuffle GEMM.
* New call to load different memory sizes to SGPR.
* Support for B Tensor Preshuffle in CK Tile Grouped GEMM.
* Basic copy kernel example and supporting documentation for new CK Tile developers.
* Support for `grouped_gemm` kernels to perform the `multi_d` elementwise operation.
* Support for Multiple ABD GEMM.
* Benchmarking support for tile engine GEMM Multi D.
* Block scaling support in CK Tile GEMM, allowing flexible use of quantization matrices from either A or B operands.
* Row-wise and column-wise quantization for CK Tile GEMM and grouped GEMM.
* Support for `f32` in FMHA (fwd/bwd).
* Tensor-wise quantization for CK Tile GEMM.
* Support for a batched contraction kernel.
* WMMA (gfx12) support for FMHA.
* Pooling kernel in CK Tile.
* Top-k sigmoid kernel in CK Tile.
* Blockscale 2D support for CK Tile GEMM.
* An optional template parameter, `Arch`, to `make_kernel` to support linking multiple object files that have the same kernel compiled for different architectures.

#### Changed

* Removed `BlockSize` in `make_kernel` and `CShuffleEpilogueProblem` to support Wave32 in CK Tile.
* FMHA examples and tests can be built for multiple architectures (gfx9, gfx950, gfx12) at the same time.

#### Upcoming changes

* Composable Kernel will be adopting C++20 features in an upcoming ROCm release, updating the minimum compiler requirement to C++20. Ensure that your development environment complies with this requirement to facilitate a seamless transition.
* In an upcoming major ROCm release, Composable Kernel will transition to a header-only library. Neither ckProfiler nor the static libraries will be packaged with Composable Kernel. They will also no longer be built by default. ckProfiler can be built independently from Composable Kernel as a standalone binary, and the static Composable Kernel libraries can be built from source.

### **HIP** (7.2.0)

#### Added

* New HIP APIs
  - `hipLibraryEnumerateKernels` returns kernel handles within a library.
  - `hipKernelGetLibrary` returns the library handle for a hipKernel_t handle.
  - `hipKernelGetName` returns the function name for a hipKernel_t handle.
  - `hipLibraryLoadData` creates a library object from code.
  - `hipLibraryLoadFromFile` creates a library object from a file.
  - `hipLibraryUnload` unloads a library.
  - `hipLibraryGetKernel` gets a kernel from the library.
  - `hipLibraryGetKernelCount` gets the kernel count in a library.
  - `hipStreamCopyAttributes` copies attributes from a source stream to a destination stream.
  - `hipOccupancyAvailableDynamicSMemPerBlock` returns the dynamic shared memory available per block when launching numBlocks blocks on a CU.
* New HIP flags
  - `hipMemLocationTypeHost` enables handling virtual memory management in the host memory location, in addition to device memory.
  - Support for flags in `hipGetProcAddress` enables searching for the per-thread version symbols:
    - `HIP_GET_PROC_ADDRESS_DEFAULT`
    - `HIP_GET_PROC_ADDRESS_LEGACY_STREAM`
    - `HIP_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM`

#### Optimized

* Graph node scaling:
  - The HIP runtime implements an optimized doorbell ring mechanism for certain topologies of graph execution. It enables efficient batching of graph nodes.
  - The enhancement provides better alignment with CUDA Graph optimizations.
  - HIP also adds a new performance test for HIP graphs with programmable topologies to measure graph performance across different structures.
  - The test evaluates graph instantiation time, first launch time, repeat launch times, and end-to-end execution for various graph topologies.
  - The test implements comprehensive timing measurements including CPU overhead and device execution time.
* Back memory set (memset) optimization:
  - The HIP runtime now implements a back memory set (memset) optimization to improve how memset nodes are processed during graph execution.
  - The enhancement specifically handles a varying number of Architected Queue Language (AQL) packets for a memset graph node, due to graph node set params for the AQL batch submission approach.
* Async handler performance improvement:
  - The HIP runtime has removed the lock contention in the async handler enqueue path.
  - The enhancement reduces runtime overhead and maximizes GPU throughput for asynchronous kernel execution, especially in multi-threaded applications.

#### Resolved issues

* Corrected the calculation of the maximum shared memory per multiprocessor in HIP device properties.
### **hipBLAS** (3.2.0)

#### Resolved issues

* Corrected client memory use counts for the `HIPBLAS_CLIENT_RAM_GB_LIMIT` environment variable.
* Fixed false Clang static analysis warnings.

### **hipBLASLt** (1.2.1)

#### Added

* Support for the `BF16` input data type with an `FP32` output data type for gfx90a.
* Support for hipBLASLtExt operation APIs on gfx11XX and gfx12XX.
* `HIPBLASLT_OVERRIDE_COMPUTE_TYPE_XF32` to override the compute type from `xf32` to other compute types.
* Support for the Sigmoid Activation function.

#### Resolved issues

* Fixed the `HIPBLAS_STATUS_INTERNAL_ERROR` issue that could occur with various sizes in CPX mode.

### **hipCUB** (4.2.0)

#### Added

* Experimental SPIR-V support.

#### Resolved issues

* Fixed memory leak issues with some unit tests.
### **hipFFT** (1.0.22)

#### Added

* hipFFTW execution functions, where the input and output data buffers differ from the buffers specified at plan creation:

  * fftw_execute_dft
  * fftwf_execute_dft
  * fftw_execute_dft_r2c
  * fftwf_execute_dft_r2c
  * fftw_execute_dft_c2r
  * fftwf_execute_dft_c2r

### **HIPIFY** (22.0.0)

#### Added

* Partial support for CUDA 13.0.0.
* cuDNN 9.14.0 support.
* cuTENSOR 2.3.1.0 support.
* LLVM 21.1.6 support.
* Full `hipFFTw` support.
* [#2062](https://github.com/ROCm/HIPIFY/issues/2062) Partial hipification support for a particular CUDA API.
* [#2073](https://github.com/ROCm/HIPIFY/issues/2073) Detect CUDA version before hipification.
* New options:
  * `--local-headers` to enable hipification of quoted local headers (non-recursive).
  * `--local-headers-recursive` to enable hipification of quoted local headers recursively.

#### Resolved issues

* [#2088](https://github.com/ROCm/HIPIFY/issues/2088) Missing support for the `cuda_bf16.h` import in hipification.

### **hipSOLVER** (3.2.0)

#### Added

* Ability to control rocSOLVER logging using the environment variables `ROCSOLVER_LEVELS` and `ROCSOLVER_LAYER`.
### **hipSPARSE** (4.2.0)

#### Added

* `--clients-only` option to the `install.sh` and `rmake.py` scripts for building only the clients when using a version of hipSPARSE that is already installed.

#### Optimized

* Improved the user documentation.

#### Resolved Issues

* Fixed a memory leak in the `hipsparseCreate` functions.

### **hipSPARSELt** (0.2.6)

#### Optimized

* Provided more kernels for the `FP16` and `FP8(E4M3)` data types.

### **hipTensor** (2.2.0)

#### Added

* Software-managed plan cache support.
  * `hiptensorHandleWritePlanCacheToFile` to write the plan cache of a hipTensor handle to a file.
  * `hiptensorHandleReadPlanCacheFromFile` to read a plan cache from a file into a hipTensor handle.
  * `simple_contraction_plan_cache` to demonstrate plan cache usage.
  * `plan_cache_test` to test the plan cache across various tensor ranks.
* C API headers to enable compatibility with C programs.
* A CMake function to allow projects to query architecture support.
* An option to configure the memory layout for tests and benchmarks.

#### Changed

* hipTensor has been moved into the new rocm-libraries "monorepo" repository {fab}`github` [rocm-libraries](https://github.com/ROCm/rocm-libraries). This repository consolidates a number of separate ROCm libraries and shared components.
  * The repository migration requires a few changes to the CMake configuration of hipTensor.
* Updated the C++ standard from C++17 to C++20.
* The include files `hiptensor/hiptensor.hpp` and `hiptensor/hiptensor_types.hpp` are now deprecated. Use `hiptensor/hiptensor.h` and `hiptensor/hiptensor_types.h` instead.
* Converted include guards from `#ifndef`/`#define`/`#endif` to `#pragma once`.

#### Resolved issues

* Removed large tensor sizes that caused problems in benchmarks.
### **llvm-project** (22.0.0)

#### Added

* Enabled ThinLTO for ROCm compilers using `-foffload-lto=thin`. For more information, see [ROCm compiler reference](https://rocm.docs.amd.com/projects/llvm-project/en/docs-7.2.0/reference/rocmcc.html#amd-gpu-compilation).

#### Changed

* Updated clang/llvm to AMD clang version 22.0.0 (equivalent to LLVM 22.0.0 with additional out-of-tree patches).

#### Upcoming changes

* As of ROCm 7.2.0, the [HIPCC](https://rocm.docs.amd.com/projects/HIPCC/en/latest/index.html) compiler is deprecated. HIPCC now invokes [AMD Clang](https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html). It’s recommended that you now invoke AMD Clang directly rather than using HIPCC. There isn’t any expected impact on usability, functionality, or performance when invoking AMD Clang directly. In a future ROCm release, HIPCC will become a symbolic link to AMD Clang.
### **MIGraphX** (2.15.0)

#### Added

* MXFP4 support for Quark and Brevitas quantized models.
* Dynamic shape support for the `DepthToSpace` Op.
* `bias` and `key_mask_padding` inputs for the `MultiHeadAttention` operator.
* GEMM+GEMM fusions.
* `dim_params` input parameter to the `parse_onnx` Python call (see the sketch after this list).
* `get_onnx_operators()`, an API to query the supported ONNX operators.
* Right pad masking mode for Multihead Attention.
* Support for Flash Decoding.
* Torch-MIGraphX installation instructions.
* Operator Builders with supporting documentation.
* Index range check to the Gather operator.
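
As a rough illustration of the two Python-facing additions above, the sketch below parses an ONNX model with the new `dim_params` argument and queries the supported ONNX operators. The `dim_params` format (assumed here to map dimension-parameter names to concrete values) and the module-level `get_onnx_operators()` entry point are assumptions based on the item names above; `model.onnx` is a placeholder path.

```python
# Hypothetical usage sketch for the MIGraphX 2.15.0 Python additions.
import migraphx

# Query which ONNX operators this MIGraphX build can parse (new API).
print(migraphx.get_onnx_operators())

# Parse a model whose inputs use symbolic dimensions, pinning them via dim_params
# (assumed format: {"dimension parameter name": value}).
model = migraphx.parse_onnx("model.onnx", dim_params={"batch": 1, "seq_len": 128})
model.compile(migraphx.get_target("gpu"))
print(model.get_parameter_shapes())
```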

#### Changed

* Updated the Resize operator to support linear mode for dynamic shapes.
* Switched to `--input-dim` instead of `--batch` to set any dynamic dimensions when using `migraphx-driver`.
* Different stride sizes are now supported in ONNX `if` branches.
* Changed the ONNX version to 1.18.0 to support PyTorch 2.9.1.
* Refactored `GroupQueryAttention`.
* Enabled the `PipelineRepoRef` parameter in CI.
* Hid LLVM symbols that come from ROCmlir and provided an option for stripping in release mode.
* Model compilation failures now produce an mxr file for debugging the failure.
* Bumped SQLite3 to 3.50.4.

#### Optimized

* Converted the `LRN` operator to an optimized `pooling` operator.
* Streamlined the `find_matches` function.
* Reduced the number of splits used for `split_reduce`.
* Improved layout propagation in pointwise fusion when using broadcasted inputs.

#### Resolved issues

* Quieted nrvo and noreturn warnings.
* Fixed the `pointwise: Wrong number of arguments` error when quantizing certain models to `int8`.
* Fixed a TopK exception.
* Updated the SD3 example for a change in optimum-onnx[onnxruntime].
* Fixed an issue with Torch-MIGraphX where the model compilation would fail.
* Fixed an issue where a reduction was broadcast with different dimensions than the input.
* Resolved a path name issue that prevented some files from being created on Windows for debugging.
* Fixed the "reduce_sum: axes: value out of range" error in `simplify_reshapes`.
* Updated the README `rbuild` installation instructions to use a Python venv to avoid a warning.
* Ensured directories exist when generating files for debugging.
* Resolved a compilation hang issue.
### **MIOpen** (3.5.1)

#### Added

* 3D heuristics for gfx950.
* Optional timestamps to MIOpen logging.
* Option to log when MIOpen starts and finishes tuning.
* Winograd Fury 4.6.0 for gfx12 for improved convolution performance.

#### Changed

* Ported several OCL kernels to HIP.

#### Optimized

* Improved Composable Kernel (CK) kernel selection during tuning.
* Improved user DB file locking to better handle network storage.
* Improved performance for MIOpen check numerics capabilities.

#### Resolved issues

* Addressed an issue in the stride adjustment logic for ASM (MISA) kernels when the output dimension is one.
* Fixed an issue with the CK bwd solver applicability checks when deterministic is set.
* [BatchNorm] Fixed an issue where batchnorm tuning would give incorrect results.
* Fixed an issue where generic search was not providing sufficient warm-up for some kernels.

### **MIVisionX** (3.5.0)

#### Changed

* AMD Clang++ location updated to `${ROCM_PATH}/lib/llvm/bin`.
* Required RPP version updated to RPP V2.2.1.

#### Resolved issues

* Fixed memory leaks in the OpenVX core, vx_nn, and vx_opencv.

#### Known issues

* Installation on RedHat and SLES requires the manual installation of the FFmpeg and OpenCV dev packages.

#### Upcoming changes

* VX_AMD_MEDIA - `rocDecode` and `rocJPEG` support for hardware decode.
### **RCCL** (2.27.7)

#### Changed

* RCCL error messages have been made more verbose in several cases. RCCL now prints fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE` (see the example after this list).
* Disabled `reduceCopyPacks` pipelining for `gfx950`.
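
For example, in a PyTorch job that uses RCCL through the `nccl` backend, the fatal-error output can be silenced by setting `NCCL_DEBUG=NONE` before RCCL is initialized. The sketch below uses the standard `torch.distributed` API rather than anything RCCL-specific, and assumes the usual launcher-provided rank/world-size environment variables.

```python
# Sketch: suppress RCCL fatal-error messages (printed by default in 2.27.7)
# for a PyTorch distributed job. On ROCm, the "nccl" backend is backed by RCCL.
import os

os.environ["NCCL_DEBUG"] = "NONE"  # must be set before RCCL is initialized

import torch.distributed as dist

dist.init_process_group(backend="nccl")  # rank/world size come from the launcher env
print("initialized rank", dist.get_rank(), "of", dist.get_world_size())
dist.destroy_process_group()
```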

### **rocAL** (2.5.0)

#### Added

* `EnumRegistry` to register all the enums present in rocAL.
* `Argument` class, which stores the value and type of each argument in the Node.
* `PipelineOperator` class to represent operators in the pipeline with metadata.
* Support to track operators in MasterGraph with unique naming.

#### Changed

* OpenCL backend support is deprecated.
* CXX Compiler: Use the AMDClang++ compiler core location `${ROCM_PATH}/lib/llvm/bin`.
* Refactored external enum usage in rocAL to maintain separation between external and internal enums.
* Introduced the following enums in `commons.h`: `ResizeScalingMode`, `ResizeInterpolationType`, `MelScaleFormula`, `AudioBorderType`, and `OutOfBoundsPolicy`.

#### Resolved issues

* Use HIP memory for the fused crop rocJPEG decoder.
* Fixed an issue in the numpy loader where the ROI was updated incorrectly.
* Fixed an issue in the CropResize node where the `crop_w` and `crop_h` values were not correctly updated.

#### Known issues

* Package installation on SLES requires manually installing `TurboJPEG`.
* Package installation on RedHat and SLES requires manually installing the FFmpeg dev package.
### **rocALUTION** (4.1.0)

#### Added

* `--clients-only` option to the `install.sh` and `rmake.py` scripts to allow building only the clients while using an already installed version of rocALUTION.

### **rocBLAS** (5.2.0)

#### Added

* Level 3 `syrk_ex` function for both C and FORTRAN, without API support for the ILP64 format.

#### Optimized

* Level 2 `tpmv` and `sbmv` functions.

#### Resolved issues

* Corrected client memory use counts for the `ROCBLAS_CLIENT_RAM_GB_LIMIT` environment variable.
* Fixed false Clang static analysis warnings.

### **rocDecode** (1.5.0)

#### Added

* Logging control. Message output from the core components is now controlled by the logging level threshold, which can be set by an environment variable or other methods.
* The new `rocdecode-host` package, which must be installed to use the FFmpeg decoder.

#### Changed

* Updated the `libdrm` path configuration and `libva` version requirements for the ROCm and TheRock platforms.

#### Resolved issues

* Fixed the build error with the videodecodepicfiles sample.
* Added error handling in the sample app for the command option combination of the OUT_SURFACE_MEM_NOT_MAPPED memory type and MD5 generation.

### **rocFFT** (1.0.36)

#### Optimized

* Removed a potentially unnecessary global transpose operation from MPI 3D multi-GPU pencil decompositions.
* Enabled optimization of 3D pencil decompositions for single-process multi-GPU transforms.

#### Resolved issues

* Fixed potential division by zero when constructing plans using dimensions of length 1.
* Fixed result scaling on multi-device transforms.
* Fixed callbacks on multi-device transforms.

### **rocJPEG** (1.3.0)

#### Changed

* Updated the `libdrm` path configuration and `libva` version requirements for the ROCm and TheRock platforms.
* RHEL now uses `libva-devel` instead of `libva-amdgpu`/`libva-amdgpu-devel`.
* Use ROCm clang++ from the `${ROCM_PATH}/lib/llvm/bin` location.

### **ROCm Bandwidth Test** (2.6.0)

#### Resolved issues

* The `rocm-bandwidth-test` folder is no longer present after driver uninstallation.
### **ROCm Compute Profiler** (3.4.0)

#### Added

* `--list-blocks <arch>` option to the general options. It lists the available IP blocks on the specified arch (similar to `--list-metrics`). However, it cannot be used with `--block`.

* `config_delta/gfx950_diff.yaml` to the analysis config YAMLs to track the differences between the gfx9xx GPUs and the latest supported gfx950 GPUs.

* Analysis DB features:
  * Adds support for per-kernel metrics analysis.
  * Adds support for dispatch timeline analysis.
  * Shows duration as the median in addition to the mean in the kernel view.

* AMDGPU driver info and GPU VRAM attributes in the system info section of the analysis report.

* `CU Utilization` metric to display the percentage of CUs utilized during kernel execution.

#### Changed

* `-b/--block` accepts block alias(es). See the block aliases using the command-line option `--list-blocks <arch>`.

* Analysis config YAMLs are now managed with the new config management workflow in `tools/config_management/`.

* The `amdsmi` Python API is used instead of the `amd-smi` CLI to query GPU specifications.

* Empty cells are replaced with `N/A` for unavailable metrics in analysis.

#### Removed

* Removed `database` mode (Grafana and MongoDB integration) from ROCm Compute Profiler in favor of other visualization methods, such as the upcoming Analysis DB-based Visualizer.
* Plotly-server-based standalone GUI.
* Command-line-based Textual User Interface.

#### Resolved issues

* Fixed an issue where sL1D metric values displayed as `N/A` in the memory chart diagram.

#### Upcoming changes

* The `Active CUs` metric has been deprecated in favor of `CU Utilization` and will be removed in a future release.
### **ROCm Systems Profiler** (1.3.0)

#### Added

- `ROCPROFSYS_PERFETTO_FLUSH_PERIOD_MS` configuration setting to set the flush period for Perfetto traces. The default value is 10000 ms (10 seconds).
- Fetching of the `rocpd` schema from rocprofiler-sdk-rocpd.

#### Changed

- Improved Fortran main function detection to ensure `rocprof-sys-instrument` uses the Fortran program main function instead of the C wrapper.

#### Resolved issues

- Fixed a crash when running `rocprof-sys-python` with ROCPROFSYS_USE_ROCPD enabled.
- Fixed an issue where kernel/memory-copy events could appear on the wrong Perfetto track (for example, the queue track when stream grouping was requested) because `_group_by_queue` state leaked between records.
- Fixed a soft hang in collecting available PAPI metrics on some systems with Intel CPUs.
- Fixed some duplicate HIP and HSA API events in `rocpd` output.
### **rocPRIM** (4.2.0)

#### Added

* Missing benchmarks, such that every autotuned specialization is now benchmarked.
* A new CMake option, `BENCHMARK_USE_AMDSMI`. It is set to `OFF` by default. When this option is set to `ON`, it lets benchmarks use AMD SMI to output more GPU statistics.
* The first tested example program for `device_search`.
* The `apply_config_improvements.py` file, which generates improved configs by taking the best specializations from old and new configs.
  * Run the script with `--help` for usage instructions, and see [rocPRIM Performance Tuning](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/conceptual/rocPRIM-performance-tuning.html#rocprim-performance-tuning) for more information.
* Kernel Tuner proof-of-concept.
* Enhanced SPIR-V support and performance.

#### Optimized

* Improved performance of the `device_radix_sort` onesweep variant.

#### Resolved issues

* Fixed the issue where `rocprim::device_scan_by_key` failed when performing an "in-place" inclusive scan by reusing "keys" as output, by adding a buffer to store the last keys of each block (excluding the last block). This fix only affects the specific case of reusing "keys" as output in an inclusive scan, and does not affect other cases.
* Fixed a benchmark build error on Windows.
* Fixed the offload compress build option.
* Fixed `float_bit_mask` for `rocprim::half`.
* Fixed handling of undefined behaviour when `__builtin_clz`, `__builtin_ctz`, and similar builtins are called.
* Fixed a potential build error with `rocprim::detail::histogram_impl`.

#### Known issues

* Potential hang with `rocprim::partition_threeway` with large input data sizes on later ROCm builds. A workaround is currently in place.
### **ROCprofiler-SDK** (1.1.0)

#### Added

- Counter collection support for gfx1150 and gfx1151.
- HSA Extension API v8 support.
- `hipStreamCopyAttributes` API implementation.

#### Optimized

- Improved process attachment and updated the corresponding [documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3-process-attachment.html).
- Improved the [Quick reference guide for rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/quick_guide.html).
- Updated the [installation documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/install/installation.html) with the links to the latest repository.

#### Resolved issues

- Fixed a multi-GPU dimension mismatch.
- Fixed a device lock issue for dispatch counters.
- Addressed an OpenMP Tools task scheduling null pointer exception.
- Fixed stream ID errors arising during process attachment.
- Fixed issues arising during dynamic code object loading.

### **rocPyDecode** (0.8.0)

#### Changed

* CXX Compiler location - Use the default `${ROCM_PATH}/lib/llvm/bin` for AMD Clang.

### **rocRAND** (4.2.0)

#### Added

* A new CMake option, `-DUSE_SYSTEM_LIB`, to allow tests to be built from `ROCm` libraries provided by the system.
* Experimental SPIR-V support.

#### Changed

* Changed the `launch` method in `host_system` and `device_system` so that kernels for all supported arches can be compiled with the correct configuration during the host pass. All generators are updated accordingly to support SPIR-V. To invoke SPIR-V, rocRAND should be built with `-DAMDGPU_TARGETS=amdgcnspirv`.

#### Removed

* For performance reasons, the `mrg31k3p_state`, `mrg32k3a_state`, `xorwow_state` and `philox4x32_10_state` states no longer use the `boxmuller_float_state` and `boxmuller_double_state` states, and the `boxmuller_float` and `boxmuller_double` variables are set with `NaN` as default values.
### **rocSHMEM** (3.2.0)

#### Added

* The GDA conduit for AMD Pensando IONIC.

#### Changed

* Dependency libraries are now loaded dynamically.
* The following APIs now have an implementation for the GDA conduit:
  * `rocshmem_p`
  * fetching atomics `rocshmem_<TYPE>_fetch_<op>`
  * collective APIs
* The following APIs now have an implementation for the IPC conduit:
  * `rocshmem_<TYPE>_atomic_{and,or,xor,swap}`
  * `rocshmem_<TYPE>_atomic_fetch_{and,or,xor,swap}`

#### Known issues

* Only 64-bit rocSHMEM atomic APIs are implemented for the GDA conduit.

### **rocSOLVER** (3.32.0)

#### Optimized

* Improved the performance of LARFB and downstream functions such as GEQRF and ORMTR.

### **rocSPARSE** (4.2.0)

#### Added

* Sliced ELL format support to the `rocsparse_spmv` routine.
* The `rocsparse_sptrsv` and `rocsparse_sptrsm` routines for triangular solve.
* The `--clients-only` option to the `install.sh` and `rmake.py` scripts to only build the clients for a version of rocSPARSE that is already installed.
* NNZ split algorithm `rocsparse_spmv_alg_csr_nnzsplit` to `rocsparse_spmv`. This algorithm might be superior to the existing adaptive algorithm `rocsparse_spmv_alg_csr_adaptive` when running the computation a small number of times because it avoids paying the analysis cost of the adaptive algorithm.

#### Changed

* rocBLAS is now a requirement when it is requested while building from source. Previously, rocBLAS was not used if it could not be found. To opt out of using rocBLAS when building from source, use the `--no-rocblas` option with the `install.sh` or `rmake.py` build scripts.

#### Optimized

* Significantly improved the `rocsparse_sddmm` routine when using CSR format, especially as the number of columns in the dense `A` matrix (or rows in the dense `B` matrix) increases.
* Improved the user documentation.

#### Resolved issues

* Fixed the `rmake.py` build script to properly handle `auto` and all options when selecting offload targets.
* Fixed an issue when building rocSPARSE with the install script on some operating systems.
* Fixed `std::fma` casting in host routines to properly deduce types. This could have previously caused compilation failures when building from source.
### **rocThrust** (4.2.0)

#### Added

* `thrust::unique_ptr` - a smart pointer for managing device memory with automatic cleanup.
* A new CMake option, `BUILD_OFFLOAD_COMPRESS`. When rocThrust is built with this option enabled, the `--offload-compress` switch is passed to the compiler. This causes the compiler to compress the binary that it generates. Compression can be useful when compiling for a large number of targets, because doing so often results in a large binary. Without compression, in some cases, the generated binary may become so large that symbols are placed out of range, resulting in linking errors. The new `BUILD_OFFLOAD_COMPRESS` option is set to `ON` by default.
* Experimental SPIR-V support.

### **rocWMMA** (2.2.0)

#### Added

* Sample `perf_i8gemm` to demonstrate `int8_t` as the matrix input data type.
* Support for the gfx1150 target.

#### Changed

* Removed an unnecessary `const` keyword to avoid compiler warnings.
* rocWMMA has been moved into the new rocm-libraries "monorepo" repository {fab}`github` [rocm-libraries](https://github.com/ROCm/rocm-libraries). This repository consolidates a number of separate ROCm libraries and shared components.
  * The repository migration requires a few changes to the CMake configuration of rocWMMA.
  * The repository migration required the GTest dependency to be updated to v1.16.0.

#### Resolved issues

* Skip invalid test configurations when using the 'register file' LDS mapping.
* Ensured transform functions in samples are only available on the device.
### **RPP** (2.2.0)

#### Added

* Pinned buffer API support for HOST and HIP.

#### Changed

* The AMDClang++ compiler has moved to `${ROCM_PATH}/lib/llvm/bin`.

#### Removed

* The `copy_param_float()` and `copy_param_uint()` mem copy helper functions have been removed as buffers now consistently use pinned/HIP memory.

#### Resolved issues

* Test Suite - Error Code Capture updates.

## ROCm 7.1.1

See the [ROCm 7.1.1 release notes](https://rocm.docs.amd.com/en/docs-7.1.1/about/release-notes.html#rocm-7-1-1-release-notes)

1285 RELEASE.md
File diff suppressed because it is too large

19 default.xml

@@ -1,13 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-7.1.1"
<default revision="refs/tags/rocm-7.2.0"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCm-->
<project name="ROCK-Kernel-Driver" />
<project name="amdsmi" />
<project name="rocm_bandwidth_test" />
<project name="rocm-examples" />
<!--HIP Projects-->
@@ -25,30 +24,16 @@
<project groups="mathlibs" name="MIVisionX" />
<project groups="mathlibs" name="ROCmValidationSuite" />
<project groups="mathlibs" name="composable_kernel" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipfort" />
<project groups="mathlibs" name="rccl" />
<project groups="mathlibs" name="rocAL" />
<project groups="mathlibs" name="rocALUTION" />
<project groups="mathlibs" name="rocDecode" />
<project groups="mathlibs" name="rocJPEG" />
<!-- The following components have been migrated to rocm-libraries:
hipBLAS-common hipBLAS hipBLASLt hipCUB
hipFFT hipRAND hipSPARSE hipSPARSELt
MIOpen rocBLAS rocFFT rocPRIM rocRAND
rocSPARSE rocThrust Tensile -->
<project groups="mathlibs" name="rocm-libraries" />
<!-- The following components have been migrated to rocm-systems:
aqlprofile clr hip hip-tests hipother
rdc rocm-core rocm_smi_lib rocminfo rocprofiler-compute
rocprofiler-register rocprofiler-sdk rocprofiler-systems
rocprofiler rocr-runtime roctracer -->
<project groups="mathlibs" name="rocm-systems" />
<project groups="mathlibs" name="rocPyDecode" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="rocSHMEM" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rocm-cmake" />
<project groups="mathlibs" name="rpp" />
<project groups="mathlibs" name="TransferBench" />
@@ -56,4 +41,4 @@
<project name="aomp" path="openmp-extras/aomp" />
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
<project name="flang" path="openmp-extras/flang" />
</manifest>
</manifest>

@@ -39,6 +39,7 @@ additional licenses. Please review individual repositories for more information.
| [hipBLASLt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/LICENSE.md) |
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
| [hipCUB](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub/) | [Custom](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipcub/LICENSE.txt) |
| [hipDNN](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipdnn/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipdnn/LICENSE.md) |
| [hipFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/LICENSE.md) |
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
@@ -1,136 +1,136 @@
ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility-past-60]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
,,,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
,"Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
,,,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility-past-60]_,gfx950,gfx950,gfx950,gfx950,,,,,,,,,,,,,,,,,,
,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,,,,,,,,,,,,,,,
,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,,,,,,,,,,,,,,,
,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,,,,,,,,,,,,,,,
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942, gfx942, gfx942, gfx942, gfx942, gfx942, gfx942
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
,,,,,,,,,,,,,,,,,,,,,,
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,2.51.1,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.1,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
,,,,,,,,,,,,,,,,,,,,,,
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
Thrust,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
CUB,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
,,,,,,,,,,,,,,,,,,,,,,
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
,,,,,,,,,,,,,,,,,,,,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
:doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`RPP <rpp:index>`,2.1.0,2.1.0,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
,,,,,,,,,,,,,,,,,,,,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
:doc:`rocSHMEM <rocshmem:index>`,3.1.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
,,,,,,,,,,,,,,,,,,,,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
:doc:`rocBLAS <rocblas:index>`,5.1.1,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,2.1.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
,,,,,,,,,,,,,,,,,,,,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
,,,,,,,,,,,,,,,,,,,,,,
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
,,,,,,,,,,,,,,,,,,,,,,
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`AMD SMI <amdsmi:index>`,26.2.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
,,,,,,,,,,,,,,,,,,,,,,
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.1,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.1,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
,,,,,,,,,,,,,,,,,,,,,,
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,,,
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
,,,,,,,,,,,,,,,,,,,,,,
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
`Flang <https://github.com/ROCm/flang>`_,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`llvm-project <llvm-project:index>`,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
,,,,,,,,,,,,,,,,,,,,,,
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
:doc:`HIP <hip:index>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
ROCm Version,7.2.0,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility-past-60]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
,,,,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
,"Debian 13, 12","Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
,,,,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility-past-60]_,gfx950,gfx950,gfx950,gfx950,gfx950,,,,,,,,,,,,,,,,,,
,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,,,,,,,,,,,,,,,
,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,,,,,,,,,,,,,,,
,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,,,,,,,,,,,,,,,
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942, gfx942, gfx942, gfx942, gfx942, gfx942, gfx942
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
,,,,,,,,,,,,,,,,,,,,,,,
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9.1, 2.8.0, 2.7.1","2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.8.0,0.7.1,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.2,1.23.1,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
,,,,,,,,,,,,,,,,,,,,,,,
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
Thrust,2.8.5,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
CUB,2.8.5,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
,,,,,,,,,,,,,,,,,,,,,,,
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.30.0, 30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
,,,,,,,,,,,,,,,,,,,,,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`Composable Kernel <composable_kernel:index>`,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.15.0,2.14.0,2.14.0,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.5.1,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,3.5.0,3.4.0,3.4.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
:doc:`rocAL <rocal:index>`,2.5.0,2.4.0,2.4.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`rocDecode <rocdecode:index>`,1.5.0,1.4.0,1.4.0,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
:doc:`rocJPEG <rocjpeg:index>`,1.3.0,1.2.0,1.2.0,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`rocPyDecode <rocpydecode:index>`,0.8.0,0.7.0,0.7.0,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`RPP <rpp:index>`,2.2.0,2.1.0,2.1.0,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
,,,,,,,,,,,,,,,,,,,,,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
:doc:`rocSHMEM <rocshmem:index>`,3.2.0,3.1.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
,,,,,,,,,,,,,,,,,,,,,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,3.2.0,3.1.0,3.1.0,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,1.2.1,1.1.0,1.1.0,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.22,1.0.21,1.0.21,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.7.1,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,3.2.0,3.1.0,3.1.0,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,4.2.0,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.6,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,4.1.0,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
:doc:`rocBLAS <rocblas:index>`,5.2.0,5.1.1,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.36,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
:doc:`rocRAND <rocrand:index>`,4.2.0,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.32.0,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,4.2.0,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,2.2.0,2.1.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
,,,,,,,,,,,,,,,,,,,,,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`hipCUB <hipcub:index>`,4.2.0,4.1.0,4.1.0,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,2.2.0,2.0.0,2.0.0,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,4.2.0,4.1.0,4.1.0,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,4.2.0,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
,,,,,,,,,,,,,,,,,,,,,,,
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,,
`hipother <https://github.com/ROCm/hipother>`_,7.2.26015,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.2.0,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
,,,,,,,,,,,,,,,,,,,,,,,
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`AMD SMI <amdsmi:index>`,26.2.1,26.2.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
,,,,,,,,,,,,,,,,,,,,,,,
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,,
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.4.0,3.3.1,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.3.0,1.2.1,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70200,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.1.0,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCTracer <roctracer:index>`,4.1.70200,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
,,,,,,,,,,,,,,,,,,,,,,,
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,,,,
:doc:`HIPIFY <hipify:index>`,22.0.0,20.0.0,20.0.0,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.4,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,16.3.0,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
,,,,,,,,,,,,,,,,,,,,,,,
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
`Flang <https://github.com/ROCm/flang>`_,22.0.0.26014,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`llvm-project <llvm-project:index>`,22.0.0.26014,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,22.0.0.26014,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
,,,,,,,,,,,,,,,,,,,,,,,
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
:doc:`AMD CLR <hip:understand/amd_clr>`,7.2.26015,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
:doc:`HIP <hip:index>`,7.2.26015,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
@@ -22,12 +22,12 @@ compatibility and system requirements.
.. container:: format-big-table
.. csv-table::
:header: "ROCm Version", "7.1.1", "7.1.0", "6.4.0"
:header: "ROCm Version", "7.2.0", "7.1.1", "6.4.0"
:stub-columns: 1
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
,"RHEL 10.1, 10.0, 9.7, |br| 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.5, 9.4"
,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 9.5, 9.4"
,RHEL 8.10,RHEL 8.10,RHEL 8.10
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP6
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8"
@@ -43,7 +43,7 @@ compatibility and system requirements.
,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix:,,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility]_,gfx950,gfx950,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility]_,gfx950,gfx950,
,gfx1201,gfx1201,
,gfx1200,gfx1200,
,gfx1101,gfx1101,
@@ -54,12 +54,12 @@ compatibility and system requirements.
,gfx908,gfx908,gfx908
,,,
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.6, 2.5, 2.4, 2.3"
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9.1, 2.8.0, 2.7.1","2.9, 2.8, 2.7","2.6, 2.5, 2.4, 2.3"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.4.35
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.8.0,0.7.1,0.4.35
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,b5997
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.1,1.22.0,1.20.0
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.2,1.23.1,1.20.0
,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0
@@ -70,70 +70,70 @@ compatibility and system requirements.
CUB,2.8.5,2.8.5,2.5.0
,,,
DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","30.20.0 [#mi325x_KVM]_, 30.10.2, |br| 30.10.1 [#driver_patch]_, 30.10, 6.4.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.30.0, 30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.12.0
:doc:`Composable Kernel <composable_kernel:index>`,1.2.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.15.0,2.14.0,2.12.0
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.4.0
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.2.0
:doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.2.0
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,0.10.0
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,0.8.0
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.3.1
:doc:`RPP <rpp:index>`,2.1.0,2.1.0,1.9.10
:doc:`MIVisionX <mivisionx:index>`,3.5.0,3.4.0,3.2.0
:doc:`rocAL <rocal:index>`,2.5.0,2.4.0,2.2.0
:doc:`rocDecode <rocdecode:index>`,1.5.0,1.4.0,0.10.0
:doc:`rocJPEG <rocjpeg:index>`,1.3.0,1.2.0,0.8.0
:doc:`rocPyDecode <rocpydecode:index>`,0.8.0,0.7.0,0.3.1
:doc:`RPP <rpp:index>`,2.2.0,2.1.0,1.9.10
,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.22.3
:doc:`rocSHMEM <rocshmem:index>`,3.1.0,3.0.0,2.0.0
:doc:`rocSHMEM <rocshmem:index>`,3.2.0,3.1.0,2.0.0
,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,2.4.0
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,0.12.0
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.18
:doc:`hipBLAS <hipblas:index>`,3.2.0,3.1.0,2.4.0
:doc:`hipBLASLt <hipblaslt:index>`,1.2.1,1.1.0,0.12.0
:doc:`hipFFT <hipfft:index>`,1.0.22,1.0.21,1.0.18
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.6.0
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,2.12.0
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,2.4.0
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,3.2.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.3
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,3.2.2
:doc:`rocBLAS <rocblas:index>`,5.1.1,5.1.0,4.4.0
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.32
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,3.3.0
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.28.0
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,3.4.0
:doc:`rocWMMA <rocwmma:index>`,2.1.0,2.0.0,1.7.0
:doc:`hipSOLVER <hipsolver:index>`,3.2.0,3.1.0,2.4.0
:doc:`hipSPARSE <hipsparse:index>`,4.2.0,4.1.0,3.2.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.6,0.2.5,0.2.3
:doc:`rocALUTION <rocalution:index>`,4.1.0,4.0.1,3.2.2
:doc:`rocBLAS <rocblas:index>`,5.2.0,5.1.1,4.4.0
:doc:`rocFFT <rocfft:index>`,1.0.36,1.0.35,1.0.32
:doc:`rocRAND <rocrand:index>`,4.2.0,4.1.0,3.3.0
:doc:`rocSOLVER <rocsolver:index>`,3.32.0,3.31.0,3.28.0
:doc:`rocSPARSE <rocsparse:index>`,4.2.0,4.1.0,3.4.0
:doc:`rocWMMA <rocwmma:index>`,2.2.0,2.1.0,1.7.0
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
:doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,3.4.0
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,3.4.0
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,3.3.0
:doc:`hipCUB <hipcub:index>`,4.2.0,4.1.0,3.4.0
:doc:`hipTensor <hiptensor:index>`,2.2.0,2.0.0,1.5.0
:doc:`rocPRIM <rocprim:index>`,4.2.0,4.1.0,3.4.0
:doc:`rocThrust <rocthrust:index>`,4.2.0,4.1.0,3.3.0
,,,
SUPPORT LIBS,,,
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,6.4.43482
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.1,7.1.0,6.4.0
`hipother <https://github.com/ROCm/hipother>`_,7.2.26015,7.1.52802,6.4.43482
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.2.0,7.1.1,6.4.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
,,,
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
:doc:`AMD SMI <amdsmi:index>`,26.2.0,26.1.0,25.3.0
:doc:`AMD SMI <amdsmi:index>`,26.2.1,26.2.0,25.3.0
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,0.3.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.2.0,1.1.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.3.0,1.1.0
,,,
PERFORMANCE TOOLS,,,
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.1,3.3.0,3.1.0
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.1,1.2.0,1.0.0
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.60400
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.60400
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.4.0,3.3.1,3.1.0
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.3.0,1.2.1,1.0.0
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70200,2.0.70101,2.0.60400
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.1.0,1.0.0,0.6.0
:doc:`ROCTracer <roctracer:index>`,4.1.70200,4.1.70101,4.1.60400
,,,
DEVELOPMENT TOOLS,,,
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0
:doc:`HIPIFY <hipify:index>`,22.0.0,20.0.0,19.0.0
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.2
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0
@@ -143,22 +143,23 @@ compatibility and system requirements.
COMPILERS,.. _compilers-support-compatibility-matrix:,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
`Flang <https://github.com/ROCm/flang>`_,20.0.025444,20.0.025425,19.0.0.25133
:doc:`llvm-project <llvm-project:index>`,20.0.025444,20.0.025425,19.0.0.25133
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025444,20.0.025425,19.0.0.25133
`Flang <https://github.com/ROCm/flang>`_,22.0.0.26014,20.0.025444,19.0.0.25133
:doc:`llvm-project <llvm-project:index>`,22.0.0.26014,20.0.025444,19.0.0.25133
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,22.0.0.26014,20.0.025444,19.0.0.25133
,,,
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,6.4.43482
:doc:`HIP <hip:index>`,7.1.52802,7.1.25424,6.4.43482
:doc:`AMD CLR <hip:understand/amd_clr>`,7.2.26015,7.1.52802,6.4.43482
:doc:`HIP <hip:index>`,7.2.26015,7.1.52802,6.4.43482
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
.. rubric:: Footnotes
.. [#os-compatibility] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
.. [#dgl_compat] DGL is only supported on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 7.0.0 and ROCm 6.4.x.
.. [#os-compatibility] Some operating systems are supported on specific GPUs. For detailed information about operating systems supported on ROCm 7.2.0, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information about GPUs supporting ROCm 7.2.0, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
.. [#dgl_compat] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
.. [#mi325x_KVM] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
.. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
@@ -169,7 +170,7 @@ compatibility and system requirements.
Operating systems, kernel and Glibc versions
*********************************************
For detailed information on operating system supported on ROCm 7.1.1 and associated Kernel and Glibc version, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
For detailed information on operating system supported on ROCm 7.2.0 and associated Kernel and Glibc version, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
.. note::
@@ -201,16 +202,16 @@ Expand for full historical view of:
.. rubric:: Footnotes
.. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
.. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
|
||||
.. [#os-compatibility-past-60] Some operating systems are supported on specific GPUs. For detailed information, see :ref:`supported_distributions` and select the required ROCm version for version specific support.
|
||||
.. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see :ref:`supported_GPUs` and select the required ROCm version for version specific support.
|
||||
.. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#verl_compat-past-60] verl is only supported on ROCm 7.0.0 and 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is only supported on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is only supported on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is only supported on ROCm 6.3.0.
|
||||
.. [#ray_compat-past-60] Ray is only supported on ROCm 7.0.0 and 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is only supported on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is only supported on ROCm 6.4.1.
|
||||
.. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
|
||||
.. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
|
||||
.. [#mi325x_KVM-past-60] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
|
||||
@@ -56,6 +56,9 @@ between JAX Plugin–PJRT and JAX/JAXLIB.
|
||||
* - JAX Plugin-PJRT
|
||||
- JAX/JAXLIB
|
||||
- ROCm
|
||||
* - 0.8.0
|
||||
- 0.8.0
|
||||
- 7.2.0
|
||||
* - 0.7.1
|
||||
- 0.7.1
|
||||
- 7.1.1, 7.1.0
|
||||
@@ -269,6 +272,33 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
|
||||
JAX API modules are maintained by the JAX project and are subject to change.
|
||||
Refer to the official JAX documentation for the most up-to-date information.
|
||||
|
||||
Key features and enhancements for ROCm 7.1
|
||||
===============================================================================
|
||||
|
||||
- Enabled compilation of multihost HLO runner Python bindings.
|
||||
|
||||
- Backported multihost HLO runner bindings and some related changes to
|
||||
:code:`FunctionalHloRunner`.
|
||||
|
||||
- Added :code:`requirements_lock_3_12` to enable building for Python 3.12.
|
||||
|
||||
- Removed the hardcoded NHWC convolution layout for ``fp16`` precision to address performance drops with ``fp16`` precision on gfx12xx GPUs.
|
||||
|
||||
|
||||
- ROCprofiler-SDK integration:
|
||||
|
||||
- Integrated ROCprofiler-SDK (v3) to XLA to improve profiling of GPU events,
|
||||
support both time-based and step-based profiling.
|
||||
|
||||
- Added unit tests for :code:`rocm_collector` and :code:`rocm_tracer`.
|
||||
|
||||
- Added the previously unsupported Triton conversion from ``f8E4M3FNUZ`` to ``fp16`` with
|
||||
rounding mode.
|
||||
|
||||
- Introduced :code:`CudnnFusedConvDecomposer` to revert fused convolutions
|
||||
when :code:`ConvAlgorithmPicker` fails to find a fused algorithm, and removed
|
||||
unfused fallback paths from :code:`RocmFusedConvRunner`.
|
||||
|
||||
Key features and enhancements for ROCm 7.0
|
||||
===============================================================================
|
||||
|
||||
|
||||
docs/conf.py
@@ -93,15 +93,15 @@ project = "ROCm Documentation"
|
||||
project_path = os.path.abspath(".").replace("\\", "/")
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "7.1.1"
|
||||
release = "7.1.1"
|
||||
version = "7.2.0"
|
||||
release = "7.2.0"
|
||||
setting_all_article_info = True
|
||||
all_article_info_os = ["linux", "windows"]
|
||||
all_article_info_author = ""
|
||||
|
||||
# pages with specific settings
|
||||
article_pages = [
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-11-26"},
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2026-01-21"},
|
||||
{"file": "release/changelog", "os": ["linux"],},
|
||||
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
|
||||
@@ -163,7 +163,6 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
|
||||
@@ -193,11 +192,16 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/sglang", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm-mori-distributed", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/sglang-mori-distributed", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
|
||||
@@ -19,117 +19,95 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
||||
:widths: 5 3 6 3
|
||||
|
||||
* - Framework
|
||||
- Installation
|
||||
- Installation guide
|
||||
- Installation options
|
||||
- GitHub
|
||||
|
||||
* - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
|
||||
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
|
||||
- `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
|
||||
- Docker image
|
||||
- Wheels package
|
||||
- ROCm Base Docker image
|
||||
- Upstream Docker file
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
|
||||
- Docker image
|
||||
- Wheels package
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
|
||||
- Docker image
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
|
||||
- Docker image
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
|
||||
- Docker image
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-wheels-package>`__
|
||||
|
||||
- Docker image
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/megablocks-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
|
||||
- Docker image
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
|
||||
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
|
||||
- Docker image
|
||||
- Wheels package
|
||||
- ROCm Base Docker image
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
|
||||
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#build-your-own-docker-image>`__
|
||||
- Docker image
|
||||
- ROCm Base Docker image
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `FlashInfer <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/flashinfer-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
* - :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>`
|
||||
- :doc:`link <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html#use-a-prebuilt-docker-image-with-flashinfer-pre-installed>`__
|
||||
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html#build-your-own-docker-image>`__
|
||||
- Docker image
|
||||
- ROCm Base Docker image
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/flashinfer"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
@@ -44,7 +44,7 @@ Setting up the base implementation environment
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --showproductname
|
||||
amd-smi static --board
|
||||
|
||||
#. Check that your GPUs are available to PyTorch.
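
   A minimal way to confirm this from the shell (a sketch that assumes a ROCm-enabled PyTorch build is already installed in the active environment):

   .. code-block:: shell

      # Should print "True" and the number of visible GPUs on a working ROCm + PyTorch setup
      python3 -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"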
|
||||
|
||||
@@ -65,8 +65,8 @@ Setting up the base implementation environment
|
||||
|
||||
.. tip::
|
||||
|
||||
During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
|
||||
This tool helps you see shows which GPUs are involved.
|
||||
During training and inference, you can check the memory usage by running the ``amd-smi`` command in your terminal.
|
||||
This tool helps you see which GPUs are involved.
|
||||
|
||||
|
||||
.. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
|
||||
@@ -91,10 +91,10 @@ Now, it's important to adjust how you load the model. Add the ``device_map`` par
|
||||
|
||||
...
|
||||
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
|
||||
# Load base model to GPU memory
|
||||
base_model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model_name,
|
||||
base_model_name,
|
||||
device_map = "auto",
|
||||
trust_remote_code = True)
|
||||
...
|
||||
@@ -139,7 +139,7 @@ model fine-tuning and inference with LLMs.
|
||||
|
||||
# Install torchtune with PyTorch release 2.2.2+
|
||||
pip install torchtune
|
||||
|
||||
|
||||
# To confirm that the package is installed correctly
|
||||
tune --help
|
||||
|
||||
@@ -148,12 +148,12 @@ model fine-tuning and inference with LLMs.
|
||||
.. code-block:: shell
|
||||
|
||||
usage: tune [-h] {download,ls,cp,run,validate} ...
|
||||
|
||||
|
||||
Welcome to the TorchTune CLI!
|
||||
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
|
||||
|
||||
subcommands:
|
||||
{download,ls,cp,run,validate}
|
||||
|
||||
@@ -194,11 +194,11 @@ model fine-tuning and inference with LLMs.
|
||||
apply_lora_to_output: False
|
||||
lora_rank: 8
|
||||
lora_alpha: 16
|
||||
|
||||
|
||||
tokenizer:
|
||||
_component_: torchtune.models.llama2.llama2_tokenizer
|
||||
path: /tmp/Llama-2-7b-hf/tokenizer.model
|
||||
|
||||
|
||||
# Dataset and sampler
|
||||
dataset:
|
||||
_component_: torchtune.datasets.alpaca_cleaned_dataset
|
||||
|
||||
@@ -44,20 +44,19 @@ Setting up the base implementation environment
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --showproductname
|
||||
amd-smi static --board
|
||||
|
||||
Your output should look like this:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
============================ ROCm System Management Interface ============================
|
||||
====================================== Product Info ======================================
|
||||
GPU[0] : Card Series: AMD Instinct MI300X OAM
|
||||
GPU[0] : Card model: 0x74a1
|
||||
GPU[0] : Card vendor: Advanced Micro Devices, Inc. [AMD/ATI]
|
||||
GPU[0] : Card SKU: MI3SRIOV
|
||||
==========================================================================================
|
||||
================================== End of ROCm SMI Log ===================================
|
||||
GPU: 0
|
||||
BOARD:
|
||||
MODEL_NUMBER: 102-G39203-0B
|
||||
PRODUCT_SERIAL: PCB079220-1150
|
||||
FRU_ID: 113-AMDG392030B04-100-300000097H
|
||||
PRODUCT_NAME: AMD Instinct MI325 OAM
|
||||
MANUFACTURER_NAME: AMD
|
||||
|
||||
#. Check that your GPUs are available to PyTorch.
|
||||
|
||||
@@ -94,13 +93,13 @@ Setting up the base implementation environment
|
||||
pip install -r requirements-dev.txt
|
||||
cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
|
||||
python setup.py install
|
||||
|
||||
|
||||
# To leverage the SFTTrainer in TRL for model fine-tuning.
|
||||
pip install trl
|
||||
|
||||
|
||||
# To leverage PEFT for efficiently adapting pre-trained language models.
|
||||
pip install peft
|
||||
|
||||
|
||||
# Install the other dependencies.
|
||||
pip install transformers datasets huggingface-hub scipy
|
||||
|
||||
@@ -132,7 +131,7 @@ Download the base model and fine-tuning dataset
|
||||
|
||||
.. note::
|
||||
|
||||
You can also use the `NousResearch Llama-2-7b-chat-hf <https://huggingface.co/NousResearch/Llama-2-7b-chat-hf>`_
|
||||
You can also use the `NousResearch Llama-2-7b-chat-hf <https://huggingface.co/NousResearch/Llama-2-7b-chat-hf>`_
|
||||
as a substitute. It has the same model weights as the original.
|
||||
|
||||
#. Run the following code to load the base model and tokenizer.
|
||||
@@ -141,14 +140,14 @@ Download the base model and fine-tuning dataset
|
||||
|
||||
# Base model and tokenizer names.
|
||||
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
|
||||
# Load base model to GPU memory.
|
||||
device = "cuda:0"
|
||||
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code = True).to(device)
|
||||
|
||||
|
||||
# Load tokenizer.
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
base_model_name,
|
||||
base_model_name,
|
||||
trust_remote_code = True)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
tokenizer.padding_side = "right"
|
||||
@@ -162,10 +161,10 @@ Download the base model and fine-tuning dataset
|
||||
# Dataset for fine-tuning.
|
||||
training_dataset_name = "mlabonne/guanaco-llama2-1k"
|
||||
training_dataset = load_dataset(training_dataset_name, split = "train")
|
||||
|
||||
|
||||
# Check the data.
|
||||
print(training_dataset)
|
||||
|
||||
|
||||
# Dataset 11 is a QA sample in English.
|
||||
print(training_dataset[11])
|
||||
|
||||
@@ -252,8 +251,8 @@ Compare the number of trainable parameters and training time under the two diffe
|
||||
dataset_text_field = "text",
|
||||
tokenizer = tokenizer,
|
||||
args = training_arguments
|
||||
)
|
||||
|
||||
)
|
||||
|
||||
# Run the trainer.
|
||||
sft_trainer.train()
|
||||
|
||||
@@ -286,7 +285,7 @@ Compare the number of trainable parameters and training time under the two diffe
|
||||
if param.requires_grad:
|
||||
trainable_params += param.numel()
|
||||
print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")
|
||||
|
||||
|
||||
sft_trainer.peft_config = None
|
||||
print_trainable_parameters(sft_trainer.model)
|
||||
|
||||
@@ -309,8 +308,8 @@ Compare the number of trainable parameters and training time under the two diffe
|
||||
dataset_text_field = "text",
|
||||
tokenizer = tokenizer,
|
||||
args = training_arguments
|
||||
)
|
||||
|
||||
)
|
||||
|
||||
# Training.
|
||||
trainer_full.train()
|
||||
|
||||
@@ -349,7 +348,7 @@ store, and load.
|
||||
|
||||
# PEFT adapter name.
|
||||
adapter_name = "llama-2-7b-enhanced-adapter"
|
||||
|
||||
|
||||
# Save PEFT adapter.
|
||||
sft_trainer.model.save_pretrained(adapter_name)
|
||||
|
||||
@@ -359,21 +358,21 @@ store, and load.
|
||||
|
||||
# Access adapter directory.
|
||||
cd llama-2-7b-enhanced-adapter
|
||||
|
||||
|
||||
# List all adapter files.
|
||||
README.md adapter_config.json adapter_model.safetensors
|
||||
|
||||
.. tab-item:: Saving a fully fine-tuned model
|
||||
:sync: without
|
||||
|
||||
If you're not using LoRA and PEFT so there is no PEFT LoRA configuration used for training, use the following code
|
||||
If you're not using LoRA and PEFT so there is no PEFT LoRA configuration used for training, use the following code
|
||||
to save your fine-tuned model to your system.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Fully fine-tuned model name.
|
||||
new_model_name = "llama-2-7b-enhanced"
|
||||
|
||||
|
||||
# Save the fully fine-tuned model.
|
||||
full_trainer.model.save_pretrained(new_model_name)
|
||||
|
||||
@@ -383,7 +382,7 @@ store, and load.
|
||||
|
||||
# Access new model directory.
|
||||
cd llama-2-7b-enhanced
|
||||
|
||||
|
||||
# List all model files.
|
||||
config.json model-00002-of-00006.safetensors model-00005-of-00006.safetensors
|
||||
generation_config.json model-00003-of-00006.safetensors model-00006-of-00006.safetensors
|
||||
@@ -412,26 +411,26 @@ Let's look at achieving model inference using these types of models.
|
||||
|
||||
.. tab-item:: Inference using PEFT adapters
|
||||
|
||||
To use PEFT adapters like a normal transformer model, you can run the generation by loading a base model along with PEFT
|
||||
To use PEFT adapters like a normal transformer model, you can run the generation by loading a base model along with PEFT
|
||||
adapters as follows.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from peft import PeftModel
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
|
||||
# Set the path of the model or the name on Hugging face hub
|
||||
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
|
||||
# Set the path of the adapter
|
||||
adapter_name = "llama-2-7b-enhanced-adapter"
|
||||
|
||||
# Load base model
|
||||
|
||||
# Load base model
|
||||
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
|
||||
|
||||
# Adapt the base model with the adapter
|
||||
|
||||
# Adapt the base model with the adapter
|
||||
new_model = PeftModel.from_pretrained(base_model, adapter_name)
|
||||
|
||||
|
||||
# Then, run generation as the same with a normal model outlined in 2.1
|
||||
|
||||
The PEFT library provides a ``merge_and_unload`` method, which merges the adapter layers into the base model. This is
|
||||
@@ -439,13 +438,13 @@ Let's look at achieving model inference using these types of models.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Load base model
|
||||
# Load base model
|
||||
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
|
||||
|
||||
# Adapt the base model with the adapter
|
||||
|
||||
# Adapt the base model with the adapter
|
||||
new_model = PeftModel.from_pretrained(base_model, adapter_name)
|
||||
|
||||
# Merge adapter
|
||||
|
||||
# Merge adapter
|
||||
model = new_model.merge_and_unload()
|
||||
|
||||
# Save the merged model into local
|
||||
@@ -461,25 +460,25 @@ Let's look at achieving model inference using these types of models.
|
||||
|
||||
# Import relevant class for loading model and tokenizer
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
|
||||
# Set the pre-trained model name on Hugging face hub
|
||||
model_name = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
# Set device type
|
||||
|
||||
# Set device type
|
||||
device = "cuda:0"
|
||||
|
||||
# Load model and tokenizer
|
||||
|
||||
# Load model and tokenizer
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
# Input prompt encoding
|
||||
|
||||
# Input prompt encoding
|
||||
query = "What is a large language model?"
|
||||
inputs = tokenizer.encode(query, return_tensors="pt").to(device)
|
||||
|
||||
# Token generation
|
||||
outputs = model.generate(inputs)
|
||||
|
||||
# Outputs decoding
|
||||
|
||||
# Token generation
|
||||
outputs = model.generate(inputs)
|
||||
|
||||
# Outputs decoding
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
|
||||
In addition, pipelines from Transformers offer simple APIs to use pre-trained models for different tasks, including
|
||||
@@ -490,14 +489,14 @@ Let's look at achieving model inference using these types of models.
|
||||
|
||||
# Import relevant class for loading model and tokenizer
|
||||
from transformers import pipeline
|
||||
|
||||
|
||||
# Set the path of your model or the name on Hugging face hub
|
||||
model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
|
||||
|
||||
# Set pipeline
|
||||
|
||||
# Set pipeline
|
||||
# A positive device value will run the model on associated CUDA device id
|
||||
pipe = pipeline("text-generation", model=model_name_or_path, device=0)
|
||||
|
||||
|
||||
# Token generation
|
||||
print(pipe("What is a large language model?")[0]["generated_text"])
|
||||
|
||||
|
||||
@@ -25,6 +25,5 @@ In this guide, you'll learn how to use ROCm for AI:
|
||||
|
||||
- :doc:`Inference optimization <inference-optimization/index>`
|
||||
|
||||
|
||||
To learn about ROCm for HPC applications and scientific computing, see
|
||||
:doc:`../rocm-for-hpc/index`.
|
||||
|
||||
@@ -0,0 +1,904 @@
|
||||
# SGLang distributed inference with MoRI
|
||||
|
||||
This document provides a comprehensive guide for deploying a high-performance
|
||||
SGLang distributed inference serving environment on an AMD Instinct MI355X GPU
|
||||
cluster, utilizing the [MoRI (Modular RDMA
|
||||
Interface)](https://github.com/rocm/mori) communication backend for optimized
|
||||
inter-node collective operations. It also includes systematic instructions for
|
||||
benchmarking 1P2D (1 prefill 2 decode, 3 nodes) configurations using automated
|
||||
scripts.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
The following configuration is required to implement this setup:
|
||||
|
||||
* **Nodes:** A minimum of three GPU nodes (Virtual machines or Physical
|
||||
machines) for wide expert parallelism (EP) evaluation.
|
||||
* **GPUs:** 8x AMD Instinct MI355X GPU cards per node.
|
||||
* **Networking:** 8x AMD Pensando™ Pollara 400 AI NICs per node, providing
|
||||
a dedicated 1:1 mapping between GPUs and network interfaces for optimal
|
||||
inter-node communication.
|
||||
* **Orchestration:** A Slurm cluster with at least three nodes -- one for
|
||||
  prefill service and two for decode services (EP16).
|
||||
|
||||
## System configuration
|
||||
|
||||
This section outlines the infrastructure setup required to support your AMD
|
||||
Instinct MI355X cluster. It covers essential procedures for verifying software
|
||||
baselines and firmware versions, configuring the AMD Pensando Pollara 400 AI
|
||||
NICs for high-bandwidth networking, and applying thermal and Quality of Service
|
||||
(QoS) tunings to ensure a stable, lossless RDMA fabric.
|
||||
|
||||
(sglang-mori-verify-baseline)=
|
||||
|
||||
### Verify baseline software
|
||||
|
||||
The following table outlines the validated software stack. Use the provided
|
||||
shell commands to verify the environment on each node before proceeding.
|
||||
|
||||
| Component | Version | Verification command |
|
||||
| :--- | :--- | :--- |
|
||||
| **OS** | Ubuntu 22.04.5 LTS | `cat /etc/os-release` |
|
||||
| **Kernel** | 5.15.0-163-generic | `uname -r` |
|
||||
| **ROCm** | 7.1.1 | `amd-smi version` |
|
||||
| **PLDM bundle (firmware)** | 01.25.16.03 | [Verify BKC](#verify-best-known-configuration-bkc) |
|
||||
| **AI NIC Firmware** | 1.117.5.a.45 | `sudo nicctl show version firmware` |
|
||||
| **AI NIC Driver** | 25.11.1.001 | `dkms status` |
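
As a convenience, the same checks can be run in one pass on each node (an illustrative sketch using the commands from the table):

```bash
grep PRETTY_NAME /etc/os-release   # OS
uname -r                           # kernel
amd-smi version                    # ROCm
dkms status                        # AI NIC driver DKMS module status
```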
|
||||
|
||||
### Verify best known configuration (BKC)
|
||||
|
||||
The BKC defines a validated configuration of GPU firmware, baseboard firmware,
|
||||
ROCm user space components, the AMD GPU Driver, and virtualization tooling.
|
||||
These components are tested together to attain best performance and compatibility.
|
||||
|
||||
While AMD publishes the AMD GPU driver and ROCm user space components, your
|
||||
server OEM or infrastructure provider distributes the firmware packages. AMD
|
||||
supplies those firmware images (PLDM bundles), which the OEM integrates and
|
||||
distributes.
|
||||
|
||||
To verify the active BKC and IFWI (Integrated Firmware Image) versions via the
|
||||
Redfish API:
|
||||
|
||||
1. Prepare credentials: Identify your BMC IP, username, and password.
|
||||
2. Run Redfish queries: Use the following commands to check the active
|
||||
firmware inventory.
|
||||
|
||||
``` bash
|
||||
# Define BMC connection variables
|
||||
BMC_IP="<BMC_IP>"
|
||||
AUTH="<username>:<password>"
|
||||
|
||||
# Query active BKC bundle version
|
||||
curl -X GET "https://${BMC_IP}/redfish/v1/UpdateService/FirmwareInventory/bundle_active" \
|
||||
-u "${AUTH}" -k | json_pp
|
||||
|
||||
# Query active IFWI (Integrated Firmware Image)
|
||||
curl -X GET "https://${BMC_IP}/redfish/v1/UpdateService/FirmwareInventory/firmware_active" \
|
||||
-u "${AUTH}" -k | json_pp
|
||||
```
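
If `jq` is installed, you can pull out just the version strings; `Version` is a standard property of the Redfish `SoftwareInventory` schema, but the exact payload can vary by BMC vendor:

```bash
# Print only the reported bundle and IFWI versions (assumes jq is available)
curl -sk -u "${AUTH}" "https://${BMC_IP}/redfish/v1/UpdateService/FirmwareInventory/bundle_active" | jq -r '.Version'
curl -sk -u "${AUTH}" "https://${BMC_IP}/redfish/v1/UpdateService/FirmwareInventory/firmware_active" | jq -r '.Version'
```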
|
||||
|
||||
### Run basic system health checks
|
||||
|
||||
Before proceeding with software deployment, verify that all cluster nodes
|
||||
comply with the [MI355X Basic Health
|
||||
Checks](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/gpus/mi355x.html#basic-health-checks).
|
||||
Key requirements include specific kernel boot arguments, minimum system memory
|
||||
thresholds, PCIe Gen5 link stability, and so on.
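
The commands below are a minimal sketch of such spot checks; the authoritative pass/fail criteria are in the linked guide.

```bash
cat /proc/cmdline                             # confirm the required kernel boot arguments are present
free -h | grep -i mem                         # confirm total system memory meets the minimum threshold
sudo lspci -vvv -d 1002: | grep -i "LnkSta:"  # confirm GPU PCIe links train at Gen5 (32 GT/s)
```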
|
||||
|
||||
### Install AMD Pensando Pollara 400 AI NIC drivers
|
||||
|
||||
For detailed instructions on upgrading the firmware and installing drivers for
|
||||
the AMD Pensando Pollara 400 AI NIC, refer to the [AMD Instinct System
|
||||
Acceptance
|
||||
Guide](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/nic-installation.html#amd-pensando-pollara-400-ai-nic).
|
||||
After installation, verify the active firmware version on all NICs to ensure it
|
||||
matches the software baseline. See [Verify baseline software](#verify-baseline-software).
|
||||
|
||||
To display the current firmware version for all AI NICs, use the following command.
|
||||
|
||||
```bash
|
||||
sudo nicctl show version firmware
|
||||
```
|
||||
|
||||
### Configure thermal management (fan speed)
|
||||
|
||||
For systems equipped with 400G optics, standard fan profiles are often
|
||||
insufficient for maintaining stable operating temperatures. To prevent thermal
|
||||
throttling or optics failure, the system fans must be set to `FullSpeed`.
|
||||
|
||||
* Requirement: A fan speed of approximately 25,000 RPM is required to maintain
|
||||
the AI NIC modules at an optimal operating temperature (~50°C).
|
||||
|
||||
* Constraint: Default profiles (typically around 4,000 RPM) and "Performance IO"
|
||||
settings (around 9,000 RPM) do not provide adequate airflow for 400G optical
|
||||
transceivers.
|
||||
|
||||
#### Configure fan speed via Redfish (Supermicro)
|
||||
|
||||
Run the following command to set the fan mode to `FullSpeed` through the BMC:
|
||||
|
||||
``` bash
|
||||
# Define BMC connection variables
|
||||
BMC_IP="<BMC_IP>"
|
||||
AUTH="<username>:<password>"
|
||||
|
||||
# Set Fan Mode to FullSpeed
|
||||
curl -X PATCH "https://${BMC_IP}/redfish/v1/Managers/1/Oem/Supermicro/FanMode" \
|
||||
-k -u "${AUTH}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"Mode": "FullSpeed"}'
|
||||
```
|
||||
|
||||
### Configure your backend network (netplan)
|
||||
|
||||
Configure the backend NICs for high-bandwidth inter-node communication. Suppose
|
||||
the node's eight backend network interface controllers (NICs) are `benic1p1` to
|
||||
`benic8p1`. Each NIC must have its own subnet that is disjoint from the others.
|
||||
Each node needs a unique IP address on each subnet. You should use the same
|
||||
final octet in each subnet for a given node. For example, one node would have
|
||||
the addresses `192.168.1.36`, `192.168.2.36`, and so on. Another node would
|
||||
have `192.168.1.37`, `192.168.2.37`, and so on. Ensure MTU is set to `9000`.
|
||||
|
||||
```{note}
|
||||
Ensure you identify the correct interface names for your system using ip link
|
||||
before applying this configuration.
|
||||
```
|
||||
|
||||
For example, your `/etc/netplan/70-backend.yaml` should look like the
|
||||
following:
|
||||
|
||||
```yaml
|
||||
network:
|
||||
ethernets:
|
||||
benic8p1:
|
||||
addresses:
|
||||
- 192.168.8.38/31
|
||||
match:
|
||||
macaddress: 04:90:81:2a:34:08
|
||||
mtu: 9000
|
||||
routes:
|
||||
- table: 108
|
||||
to: 0.0.0.0/0
|
||||
via: 192.168.8.39
|
||||
routing-policy:
|
||||
- from: 192.168.8.38
|
||||
table: 108
|
||||
set-name: benic8p1
|
||||
benic7p1:
|
||||
addresses:
|
||||
- 192.168.7.38/31
|
||||
match:
|
||||
macaddress: 04:90:81:2b:82:40
|
||||
mtu: 9000
|
||||
routes:
|
||||
- table: 107
|
||||
to: 0.0.0.0/0
|
||||
via: 192.168.7.39
|
||||
routing-policy:
|
||||
- from: 192.168.7.38
|
||||
table: 107
|
||||
set-name: benic7p1
|
||||
benic6p1:
|
||||
addresses:
|
||||
- 192.168.6.38/31
|
||||
match:
|
||||
macaddress: 04:90:81:30:c9:30
|
||||
mtu: 9000
|
||||
routes:
|
||||
- table: 106
|
||||
to: 0.0.0.0/0
|
||||
via: 192.168.6.39
|
||||
routing-policy:
|
||||
- from: 192.168.6.38
|
||||
table: 106
|
||||
set-name: benic6p1
|
||||
benic5p1:
|
||||
addresses:
|
||||
- 192.168.5.38/31
|
||||
match:
|
||||
macaddress: 04:90:81:2a:23:40
|
||||
mtu: 9000
|
||||
routes:
|
||||
- table: 105
|
||||
to: 0.0.0.0/0
|
||||
via: 192.168.5.39
|
||||
routing-policy:
|
||||
- from: 192.168.5.38
|
||||
table: 105
|
||||
set-name: benic5p1
|
||||
benic4p1:
|
||||
addresses:
|
||||
- 192.168.4.38/31
|
||||
match:
|
||||
macaddress: 04:90:81:2d:69:60
|
||||
mtu: 9000
|
||||
routes:
|
||||
- table: 104
|
||||
to: 0.0.0.0/0
|
||||
via: 192.168.4.39
|
||||
routing-policy:
|
||||
- from: 192.168.4.38
|
||||
table: 104
|
||||
set-name: benic4p1
|
||||
benic3p1:
|
||||
addresses:
|
||||
- 192.168.3.38/31
|
||||
match:
|
||||
macaddress: 04:90:81:2a:2c:40
|
||||
mtu: 9000
|
||||
routes:
|
||||
- table: 103
|
||||
to: 0.0.0.0/0
|
||||
via: 192.168.3.39
|
||||
routing-policy:
|
||||
- from: 192.168.3.38
|
||||
table: 103
|
||||
set-name: benic3p1
|
||||
benic2p1:
|
||||
addresses:
|
||||
- 192.168.2.38/31
|
||||
match:
|
||||
macaddress: 04:90:81:30:d5:30
|
||||
mtu: 9000
|
||||
routes:
|
||||
- table: 102
|
||||
to: 0.0.0.0/0
|
||||
via: 192.168.2.39
|
||||
routing-policy:
|
||||
- from: 192.168.2.38
|
||||
table: 102
|
||||
set-name: benic2p1
|
||||
benic1p1:
|
||||
addresses:
|
||||
- 192.168.1.38/31
|
||||
match:
|
||||
macaddress: 04:90:81:30:e4:00
|
||||
mtu: 9000
|
||||
routes:
|
||||
- table: 101
|
||||
to: 0.0.0.0/0
|
||||
via: 192.168.1.39
|
||||
routing-policy:
|
||||
- from: 192.168.1.38
|
||||
table: 101
|
||||
set-name: benic1p1
|
||||
```
|
||||
|
||||
To apply the configuration, use the following command.
|
||||
|
||||
```bash
|
||||
sudo netplan apply
|
||||
```
|
||||
|
||||
To verify your configuration, use the following command.
|
||||
|
||||
```bash
|
||||
sudo apt install -y net-tools && ip -br a
|
||||
```
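
To additionally confirm that every backend NIC came up with the expected MTU (interface names assumed from the example above):

```bash
# Each line should report "mtu 9000" for the backend NICs
for nic in benic{1..8}p1; do ip -o link show "$nic" | awk '{print $2, $4, $5}'; done
```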
|
||||
|
||||
### Configure Quality of Service (QoS) and Congestion Control (DCQCN)
|
||||
|
||||
To ensure lossless communication and optimal performance for RDMA traffic, the
|
||||
network must be configured with specific QoS and Data Center Quantized
|
||||
Congestion Notification (DCQCN) settings.
|
||||
|
||||
The configuration achieves the following:

* Enables RX and TX pause frames on the ports.
* Maps DSCP 24 (data) to Q3 and DSCP 46 (CNP) to Q6, and all other DSCP values to Q0.
* Enables PFC for Q3.
* Schedules 99% of bandwidth to Q3 and 1% to Q0, with strict priority for Q6.
|
||||
|
||||
#### Configure DCQCN
|
||||
|
||||
Create and run a `/nfsdata/enable_dcqcn.sh` script to initialize congestion
|
||||
control parameters.
|
||||
|
||||
``` bash
|
||||
#!/bin/bash
|
||||
|
||||
TOKEN_BUCKET_SIZE=800000
|
||||
AI_RATE=160
|
||||
ALPHA_UPDATE_INTERVAL=1
|
||||
ALPHA_UPDATE_G=512
|
||||
INITIAL_ALPHA_VALUE=64
|
||||
RATE_INCREASE_BYTE_COUNT=431068
|
||||
HAI_RATE=300
|
||||
RATE_REDUCE_MONITOR_PERIOD=1
|
||||
RATE_INCREASE_THRESHOLD=1
|
||||
RATE_INCREASE_INTERVAL=1
|
||||
CNP_DSCP=46
|
||||
|
||||
ROCE_DEVICES=$(ibv_devices | grep ionic_ | awk '{print $1}' | paste -sd " ")
|
||||
for roce_dev in $ROCE_DEVICES
|
||||
do
|
||||
sudo nicctl update dcqcn -r $roce_dev -i 1 \
|
||||
--token-bucket-size $TOKEN_BUCKET_SIZE \
|
||||
--ai-rate $AI_RATE \
|
||||
--alpha-update-interval $ALPHA_UPDATE_INTERVAL \
|
||||
--alpha-update-g $ALPHA_UPDATE_G \
|
||||
--initial-alpha-value $INITIAL_ALPHA_VALUE \
|
||||
--rate-increase-byte-count $RATE_INCREASE_BYTE_COUNT \
|
||||
--hai-rate $HAI_RATE \
|
||||
--rate-reduce-monitor-period $RATE_REDUCE_MONITOR_PERIOD \
|
||||
--rate-increase-threshold $RATE_INCREASE_THRESHOLD \
|
||||
--rate-increase-interval $RATE_INCREASE_INTERVAL \
|
||||
--cnp-dscp $CNP_DSCP
|
||||
done
|
||||
```
|
||||
|
||||
#### Configure QoS and PFC
|
||||
|
||||
Create and run `/nfsdata/qos.sh` to set up traffic classes and scheduling.
|
||||
|
||||
``` bash
|
||||
#!/bin/bash
|
||||
# qos.sh
|
||||
|
||||
# Enable PFC and Auto-negotiation on all ports
|
||||
for i in $(sudo nicctl show port | grep Port | awk {'print $3'}); do sudo nicctl update port -p $i --pause-type pfc --rx-pause enable --tx-pause enable; done
|
||||
for i in $(sudo nicctl show port | grep Port | awk '{print $3}'); do sudo nicctl update port --port $i --auto-neg enable; done
|
||||
|
||||
# Define Priorities
|
||||
cts_dscp=46
|
||||
cts_prio=6
|
||||
data_dscp=24
|
||||
data_prio=3
|
||||
default_prio=0
|
||||
cnp_dscp=46
|
||||
cnp_prio=6
|
||||
|
||||
sudo nicctl update qos pfc --priority 0 --no-drop disable
|
||||
sudo nicctl update qos dscp-to-purpose --dscp 48 --purpose none
|
||||
sudo nicctl update qos dscp-to-purpose --dscp 46 --purpose none
|
||||
sudo nicctl update qos --classification-type pcp
|
||||
sudo nicctl update qos --classification-type dscp
|
||||
sudo nicctl update qos dscp-to-priority --dscp 0-63 --priority 0
|
||||
sudo nicctl update qos dscp-to-priority --dscp 0-23,25-45,47-63 --priority $default_prio
|
||||
sudo nicctl update qos dscp-to-priority --dscp $cts_dscp --priority $cts_prio
|
||||
sudo nicctl update qos dscp-to-priority --dscp $data_dscp --priority $data_prio
|
||||
sudo nicctl update qos dscp-to-priority --dscp $cnp_dscp --priority $cnp_prio
|
||||
sudo nicctl update qos pfc --priority $data_prio --no-drop enable
|
||||
sudo nicctl update qos scheduling --priority $data_prio,$default_prio,$cts_prio --dwrr 99,1,0 --rate-limit 0,0,10
|
||||
```
|
||||
|
||||
#### Verify your configuration
|
||||
|
||||
Verify the configuration using `nicctl`.
|
||||
|
||||
* Verify QoS classification:
|
||||
|
||||
``` bash
|
||||
sudo nicctl show qos
|
||||
```
|
||||
|
||||
Expected QoS output:
|
||||
|
||||
``` bash
|
||||
NIC : 42424650-4c32-3531-3230-303443000000 (0000:f6:00.0)
|
||||
|
||||
Port : 04908130-a7a0-4242-4242-000011010000
|
||||
|
||||
Classification type : DSCP
|
||||
|
||||
DSCP-to-priority :
|
||||
DSCP bitmap : 0xffffbffffeffffff ==> priority : 0
|
||||
DSCP bitmap : 0x0000000001000000 ==> priority : 3
|
||||
DSCP bitmap : 0x0000400000000000 ==> priority : 6
|
||||
DSCP : 0-23, 25-45, 47-63 ==> priority : 0
|
||||
DSCP : 24 ==> priority : 3
|
||||
DSCP : 46 ==> priority : 6
|
||||
```
|
||||
|
||||
* Verify DCQCN and scheduling:
|
||||
|
||||
``` bash
|
||||
sudo nicctl show dcqcn
|
||||
```
|
||||
|
||||
Expected DCQCN and scheduling output:
|
||||
|
||||
``` bash
|
||||
NIC : 42424650-4c32-3531-3230-303443000000 (0000:f6:00.0)
|
||||
------------------------------------------------------------------------------------------
|
||||
|
||||
Lif id : 43000070-0100-0000-4242-04908130a7a0
|
||||
ROCE device : ionic_7
|
||||
DCQCN profile id : 1
|
||||
Status : Enabled
|
||||
Rate increase in AI phase : 160
|
||||
Rate increase byte count : 431068
|
||||
Alpha update G value : 512
|
||||
Alpha update interval : 1
|
||||
Rate increase in HAI phase : 300
|
||||
Initial alpha value : 64
|
||||
Rate reduce monitor period : 1
|
||||
Rate increase threshold : 1
|
||||
Rate increase interval : 1
|
||||
Token bucket size : 800000
|
||||
DSCP value used for CNP : 46
|
||||
|
||||
|
||||
PFC :
|
||||
PFC priority bitmap : 0x8
|
||||
PFC no-drop priorities : 3
|
||||
|
||||
Scheduling :
|
||||
--------------------------------------------
|
||||
Priority Scheduling Bandwidth Rate-limit
|
||||
Type (in %age) (in Gbps)
|
||||
--------------------------------------------
|
||||
0 DWRR 1 N/A
|
||||
3 DWRR 99 N/A
|
||||
6 strict N/A 10
|
||||
```
|
||||
|
||||
### Configure your network file system (NFS)
|
||||
|
||||
Setting up a shared NFS volume facilitates centralized storage for models,
|
||||
recipes, and logs across the cluster. Use the following commands to install the
|
||||
necessary client tools and mount the remote directory.
|
||||
|
||||
```{important}
|
||||
Replace `nfs_server_ip:/shared/folder` and `/mount/point` with your specific
|
||||
server details and desired local mount path.
|
||||
```
|
||||
|
||||
``` bash
|
||||
sudo apt update && sudo apt install -y nfs-common
|
||||
sudo mkdir -p /mount/point
|
||||
sudo mount -t nfs nfs_server_ip:/shared/folder /mount/point
|
||||
echo "nfs_server_ip:/shared/folder /mount/point nfs _netdev,nofail,x-systemd.automount,x-systemd.idle-timeout=600,vers=4.2 0 0" | sudo tee -a /etc/fstab
|
||||
```
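
Optionally, confirm the share is mounted and writable from each node (paths taken from the example above):

```bash
df -h /mount/point                                              # the NFS export should appear here
touch /mount/point/.write_test && rm /mount/point/.write_test && echo "NFS mount is writable"
```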
|
||||
|
||||
## Software installation
|
||||
|
||||
Next, install the core compute stack required to operate the AMD Instinct GPUs.
|
||||
The following steps guide you through deploying the ROCm software stack and the
|
||||
necessary kernel-mode drivers to enable hardware acceleration and optimize the
|
||||
environment for distributed inference workloads.
|
||||
|
||||
### Install ROCm
|
||||
|
||||
Use the following commands to quickly install ROCm 7.1.1 on Ubuntu 22.04:
|
||||
|
||||
``` bash
|
||||
wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/jammy/amdgpu-install_7.1.1.70101-1_all.deb
|
||||
sudo apt install ./amdgpu-install_7.1.1.70101-1_all.deb
|
||||
sudo apt update
|
||||
sudo apt install python3-setuptools python3-wheel
|
||||
sudo usermod -a -G render,video $LOGNAME # Add the current user to the render and video groups
|
||||
sudo apt install rocm
|
||||
```
|
||||
|
||||
For detailed installation instructions, refer to the [ROCm 7.1.1
|
||||
documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/install/quick-start.html#rocm-installation).
|
||||
|
||||
### Install AMD GPU Driver (amdgpu)
|
||||
|
||||
Use the following commands to quickly install the AMD GPU Driver (ROCm 7.1.1)
|
||||
on Ubuntu 22.04:
|
||||
|
||||
``` bash
|
||||
wget https://repo.radeon.com/amdgpu-install/7.1.1/ubuntu/jammy/amdgpu-install_7.1.1.70101-1_all.deb
|
||||
sudo apt install ./amdgpu-install_7.1.1.70101-1_all.deb
|
||||
sudo apt update
|
||||
sudo apt install "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)"
|
||||
sudo apt install amdgpu-dkms
|
||||
```
|
||||
|
||||
For detailed installation instructions, refer to the [ROCm 7.1.1
|
||||
documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/install/quick-start.html#amdgpu-driver-installation).
|
||||
|
||||
## Network verification and testing
|
||||
|
||||
Before deploying the inference engine, validate the health and performance of
|
||||
the cluster interconnects.
|
||||
|
||||
### Verify network connectivity
|
||||
|
||||
Verify that all network interfaces are reachable across the cluster nodes.
|
||||
Assuming `eth0` is the management interface, and `benic1p1` through `benic8p1` are the
|
||||
dedicated RoCE backend interfaces, use the following loop to test reachability
|
||||
to a remote node (for instance, a target node with host IP suffix `.38`).
|
||||
|
||||
```bash
|
||||
# From node A (host suffix .37), test reachability of node B (host suffix .38) across all eight RoCE subnets
|
||||
for i in {1..8}; do ping -c 1 192.168.${i}.38; done
|
||||
```
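
Because each subnet relies on policy routing, it can also help to pin the source address so each probe exercises the matching backend NIC (addresses follow the `.37` to `.38` example above):

```bash
for i in {1..8}; do ping -I 192.168.${i}.37 -c 1 -W 2 192.168.${i}.38; done
```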
|
||||
|
||||
### Validate your RDMA setup
|
||||
|
||||
Confirm that all eight RDMA network interfaces are in the `UP` state and
|
||||
correctly configured with the required MTU and GID settings.
|
||||
|
||||
#### Verify link status MTU, NIC temperature, and NIC speed
|
||||
|
||||
```bash
|
||||
sudo nicctl show port
|
||||
```
|
||||
|
||||
The output should look something like this:
|
||||
|
||||
```bash
|
||||
-------------------------------------------------------------------------------------
|
||||
|
||||
NIC : 42424650-4c32-3531-3530-314343000000 (0000:f6:00.0)
|
||||
|
||||
Port : 04908132-5d88-4242-4242-000011010000 (eth1/1)
|
||||
Spec:
|
||||
Ifindex : 0x11010000
|
||||
Type : ETH
|
||||
speed : 400G
|
||||
Admin state : UP
|
||||
FEC type : RS
|
||||
Pause type : PFC
|
||||
Number of lanes : 4
|
||||
MTU : 9216
|
||||
TX pause : enabled
|
||||
RX pause : enabled
|
||||
Auto negotiation : enabled
|
||||
Status:
|
||||
Physical port : 1
|
||||
Operational status : UP
|
||||
Link FSM state : UP
|
||||
FEC type : RS
|
||||
Cable type : Fiber
|
||||
Number of lanes : 4
|
||||
speed : 400G
|
||||
Auto negotiation : disabled
|
||||
MAC ID : 0
|
||||
MAC channel : 0
|
||||
MAC address : 04:90:81:32:5d:88
|
||||
Transceiver type : QSFP_CMIS
|
||||
Transceiver state : SPROM-READ
|
||||
Transceiver PID : QSFP-400G-DR4
|
||||
Transceiver temperature (in C) : 45
|
||||
Transceiver warning temperature (in C) : 75
|
||||
Transceiver alarm temperature (in C) : 80
|
||||
-------------------------------------------------------------------------------------
|
||||
```
|
||||
|
||||
#### Verify GID
|
||||
|
||||
Ensure each device has a valid GID mapped to its assigned IP address.
|
||||
|
||||
```bash
|
||||
ibv_devinfo -v | grep GID
|
||||
```
|
||||
|
||||
The output should look something like this:
|
||||
|
||||
```bash
|
||||
GID[ 0]: fe80::690:81ff:fe30:a7a0, RoCE v2
|
||||
GID[ 1]: ::ffff:192.168.7.36, RoCE v2
|
||||
```
|
||||
|
||||
### Run RDMA bandwidth benchmarks
|
||||
|
||||
Verify the inter-node RDMA performance to ensure the network fabric can
|
||||
saturate the link bandwidth.
|
||||
|
||||
#### Install RDMA performance tools
|
||||
|
||||
To get started, build the ROCm-optimized `rdma-perftest` test suite from
|
||||
source:
|
||||
|
||||
```bash
|
||||
sudo apt install -y libibumad-dev libpci-dev libibverbs-dev librdmacm-dev ibverbs-utils libtool
|
||||
git clone https://github.com/ROCm/rdma-perftest
|
||||
cd rdma-perftest/
|
||||
./autogen.sh
|
||||
./configure --enable-rocm --with-rocm=/opt/rocm
|
||||
make -j$(nproc)
|
||||
sudo make install
|
||||
```
|
||||
|
||||
#### Run a bandwidth test (GPU memory)
|
||||
|
||||
Perform a bandwidth test using ROCm GPU memory between two nodes. One acts as
|
||||
a server and the other acts as a client. Replace `<SERVER_IP>` with the
|
||||
backend IP address of the server node.
|
||||
|
||||
```bash
|
||||
# On Server Node
|
||||
./ib_write_bw --use_rocm=0 -d ionic_0 --report_gbits -a
|
||||
|
||||
# On Client Node
|
||||
./ib_write_bw --use_rocm=0 -d ionic_0 --report_gbits -a <SERVER_IP>
|
||||
```
|
||||
|
||||
## SGLang serving and MoRI unit tests
|
||||
|
||||
### Install Docker Engine
|
||||
|
||||
Install the Docker engine to manage the containerized SGLang and MoRI serving
|
||||
environments.
|
||||
|
||||
```bash
|
||||
sudo apt update && sudo apt install -y docker.io
|
||||
sudo usermod -aG docker "$USER"
|
||||
```
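
The group change takes effect in a new login session. A quick way to pick it up and confirm the daemon is reachable:

```bash
newgrp docker                              # or log out and back in
docker info --format '{{.ServerVersion}}'  # should print the Docker Engine version
```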
|
||||
|
||||
### Launch the serving container
|
||||
|
||||
Deploy the SGLang MoRI serving container on each node.
|
||||
|
||||
```bash
|
||||
CONTAINER_NAME=sglang_mori
|
||||
IMAGE_NAME=rocm/sgl-dev:sglang-0.5.6.post1-rocm700-mi35x-mori-0113
|
||||
|
||||
docker run -it \
|
||||
--rm \
|
||||
  --device /dev/dri --device /dev/kfd --device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name ${CONTAINER_NAME} \
|
||||
${IMAGE_NAME} /bin/bash
|
||||
```
|
||||
|
||||
### Run MoRI inter-node unit tests
|
||||
|
||||
Before starting the SGLang service, run the MoRI unit test to verify that the
|
||||
inter-node communication backend is correctly configured.
|
||||
|
||||
The MoRI unit test uses two nodes as a minimal validation before running the full
|
||||
1P2D (3 nodes) benchmark.
|
||||
|
||||
The key configuration variables are:
|
||||
|
||||
* `GLOO_SOCKET_IFNAME`: The network interface used for backend initialization, for example, `eth2`.
|
||||
* `<MASTER_IP>`: The IP address of the primary node's backend interface.
|
||||
|
||||
```{note}
|
||||
You can find reference performance data in the [ROCm/MoRI
|
||||
repository](https://github.com/ROCm/mori?tab=readme-ov-file#mori-ep).
|
||||
```
|
||||
|
||||
```bash
|
||||
# Set up environment inside the container
|
||||
export PYTHONPATH=/app/mori:$PYTHONPATH
|
||||
export GLOO_SOCKET_IFNAME=<BACKEND_INTERFACE>
|
||||
|
||||
# Node 0 (Primary)
|
||||
torchrun --nnodes=2 --node_rank=0 --nproc_per_node=1 \
|
||||
--master_addr="<MASTER_IP>" --master_port=1234 \
|
||||
examples/ops/dispatch_combine/test_dispatch_combine_internode.py \
|
||||
--cmd bench --kernel-type v1
|
||||
|
||||
# Node 1 (Secondary)
|
||||
torchrun --nnodes=2 --node_rank=1 --nproc_per_node=1 \
|
||||
--master_addr="<MASTER_IP>" --master_port=1234 \
|
||||
examples/ops/dispatch_combine/test_dispatch_combine_internode.py \
|
||||
--cmd bench --kernel-type v1
|
||||
```
|
||||
|
||||
## End-to-end 1P2D performance testing
|
||||
|
||||
This section guides you through running distributed inference benchmarks using
|
||||
the SGLang disaggregation recipe. For implementation details, refer to the
|
||||
[SGLang Disaggregation
|
||||
Recipe](https://github.com/billishyahao/sglang_disagg/blob/9n_cluster/README.md).
|
||||
|
||||
### Download the model and setup your run environment
|
||||
|
||||
This performance test supports the following models:
|
||||
|
||||
* [DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)
|
||||
* [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)
|
||||
* [DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528)
|
||||
|
||||
To set up your environment and download the models using the Hugging Face CLI,
|
||||
use the following commands. Modify the `huggingface-cli download` command
|
||||
to download the desired model.
|
||||
|
||||
```bash
|
||||
# Set up a virtual environment and install the Hugging Face CLI
|
||||
sudo apt update && sudo apt install -y python3-venv
|
||||
python3 -m venv ~/venvs/hf
|
||||
source ~/venvs/hf/bin/activate
|
||||
pip install huggingface_hub
|
||||
|
||||
# Download the model to the shared NFS mount point
|
||||
# Replace 'deepseek-ai/DeepSeek-R1-0528' with your desired model
|
||||
huggingface-cli download --token <your_hf_token> \
|
||||
deepseek-ai/DeepSeek-R1-0528 \
|
||||
--local-dir /mount/point/models/DeepSeek-R1
|
||||
```
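
A quick sanity check that the download completed (path taken from the example above):

```bash
du -sh /mount/point/models/DeepSeek-R1                    # total size on the shared mount
ls /mount/point/models/DeepSeek-R1 | grep -c safetensors  # number of weight shard files present
```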
|
||||
|
||||
### Clone the SGLang disaggregation recipe
|
||||
|
||||
Clone the SGLang disaggregation repository to the shared file system and switch
|
||||
to the appropriate branch:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/billishyahao/sglang_disagg.git
cd sglang_disagg
git checkout 9n_cluster
|
||||
```
|
||||
|
||||
```{note}
|
||||
In the 1P2D configuration, the prefill service and benchmark process run on the
|
||||
same node, while remaining nodes handle decode services.
|
||||
```
|
||||
|
||||
### Configure InfiniBand devices
|
||||
|
||||
Identify and configure the available InfiniBand devices.
|
||||
|
||||
1. List available devices using the following command.
|
||||
|
||||
```bash
|
||||
ibv_devinfo -l
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```bash
|
||||
8 HCAs found:
|
||||
ionic_0
|
||||
ionic_1
|
||||
ionic_2
|
||||
ionic_3
|
||||
ionic_4
|
||||
ionic_5
|
||||
ionic_6
|
||||
ionic_7
|
||||
```
|
||||
|
||||
2. Update environment variables. Edit `set_env_vars.sh` and add the
|
||||
comma-separated list of your system's IB devices. For example:
|
||||
|
||||
```bash
|
||||
export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
|
||||
```
|
||||
|
||||
### Configure the script and submit the job
|
||||
|
||||
1. To set the required configuration parameters, update the following
|
||||
environment variables in `run_submit_disagg.sh` to match your cluster setup:
|
||||
|
||||
```bash
|
||||
# SLURM Job Configuration
|
||||
export SLURM_ACCOUNT="amd" # The account name for SLURM job accounting and resource allocation
|
||||
export SLURM_PARTITION="compute" # The specific cluster partition (queue) to submit the job to
|
||||
export TIME_LIMIT="24:00:00" # Maximum wall time for the job (Hours:Minutes:Seconds)
|
||||
|
||||
# Model Configuration
|
||||
export MODEL_PATH="/nfsdata" # Base directory where the model weights are stored
|
||||
export MODEL_NAME="DeepSeek-R1" # Specific model directory name (joined with MODEL_PATH)
|
||||
export CONTAINER_IMAGE="rocm/sgl-dev:sglang-0.5.6.post1-rocm700-mi35x-mori-1224" # Docker image to use for the environment
|
||||
|
||||
# Cluster Topology (Disaggregation Setup)
|
||||
export PREFILL_NODES=1 # Number of prefill nodes
|
||||
export PREFILL_WORKERS=1 # Number of prefill workers
|
||||
export DECODE_NODES=2 # Number of decode nodes
|
||||
export DECODE_WORKERS=2 # Number of decode workers
|
||||
|
||||
# Benchmark/Workload Parameters
|
||||
export ISL=1024 # Input Sequence Length (number of tokens in the prompt)
|
||||
export OSL=1024 # Output Sequence Length (number of tokens to generate)
|
||||
export CONCURRENCIES="2048" # Total number of concurrent requests to simulate in the benchmark. The value can be "32,64,128"
|
||||
export REQUEST_RATE="inf" # Request per second rate. "inf" means send all requests immediately
|
||||
|
||||
# Parallelism Strategies
|
||||
export PREFILL_ENABLE_EP=true # Enable Expert Parallelism (EP) for the prefill phase
|
||||
export PREFILL_ENABLE_DP=true # Enable Data Parallelism (DP) for the prefill phase
|
||||
export DECODE_ENABLE_EP=true # Enable Expert Parallelism (EP) for the decode phase
|
||||
export DECODE_ENABLE_DP=true # Enable Data Parallelism (DP) for the decode phase
|
||||
```
|
||||
|
||||
2. Submit the batch job to the Slurm cluster:
|
||||
|
||||
```bash
|
||||
bash ./run_submit_disagg.sh
|
||||
```
|
||||
|
||||
### Log file analysis
|
||||
|
||||
1. After submission, retrieve the SLURM job ID:
|
||||
|
||||
```bash
|
||||
squeue
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```bash
|
||||
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
|
||||
123 compute 1p2d alice R 00:10:32 4 node[01-04]
|
||||
```
|
||||
|
||||
2. A directory named `slurm_job-$SLURM_JOB_ID` is created in `/tmp` on each
|
||||
participating node. The directory contains:
|
||||
|
||||
| Log File | Description |
|
||||
| :--------| :-----------|
|
||||
| `pd_sglang_bench_serving.sh_NODE${NODE_RANK}.log` | Main service log per node |
|
||||
| `decode_NODE${NODE_RANK}.log` | SGLang decode service details |
|
||||
| `prefill_NODE${NODE_RANK}.log` | SGLang prefill service details |
|
||||
|
||||
3. The benchmark results will be displayed in
|
||||
`pd_sglang_bench_serving.sh_NODE${NODE_RANK}.log`. A quick way to pull this summary out of the log is shown after the sample output. Key metrics include:
|
||||
|
||||
```{note}
|
||||
The following benchmark utility output is provided for reference only and
|
||||
should not be used to compare performance. See the
|
||||
[InferenceMAX](https://inferencemax.semianalysis.com/) website for validated
|
||||
performance results.
|
||||
```
|
||||
|
||||
``` bash
|
||||
============ Serving Benchmark Result ============
|
||||
Successful requests: 20480
|
||||
Benchmark duration (s): 1194.25
|
||||
Total input tokens: 20971520
|
||||
Total generated tokens: 20971520
|
||||
Request throughput (req/s): 17.15
|
||||
Output token throughput (tok/s): 17560.38
|
||||
Total Token throughput (tok/s): 35120.76
|
||||
---------------Time to First Token----------------
|
||||
Mean TTFT (ms): 21601.77
|
||||
Median TTFT (ms): 24525.21
|
||||
P99 TTFT (ms): 85417.53
|
||||
-----Time per Output Token (excl. 1st token)------
|
||||
Mean TPOT (ms): 92.41
|
||||
Median TPOT (ms): 85.46
|
||||
P99 TPOT (ms): 138.67
|
||||
---------------Inter-token Latency----------------
|
||||
Mean ITL (ms): 92.41
|
||||
Median ITL (ms): 74.76
|
||||
P99 ITL (ms): 263.07
|
||||
----------------End-to-end Latency----------------
|
||||
Mean E2EL (ms): 116133.48
|
||||
Median E2EL (ms): 110349.39
|
||||
P99 E2EL (ms): 227243.97
|
||||
==================================================
|
||||
```
|
||||
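
As a convenience, the summary block can be extracted from the main service log directly on a node. This is a minimal sketch: it assumes the job ID reported by `squeue`, node rank `0`, and the default log location described above; adjust the values for your run.

```bash
# Print the benchmark summary from the main service log
SLURM_JOB_ID=123   # replace with the job ID reported by squeue
NODE_RANK=0        # rank of the node running the benchmark process
LOG=/tmp/slurm_job-${SLURM_JOB_ID}/pd_sglang_bench_serving.sh_NODE${NODE_RANK}.log

# Print everything from the summary header to the closing separator line
sed -n '/Serving Benchmark Result/,/^=\+$/p' "$LOG"
```
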
|
||||
## Troubleshooting
|
||||
|
||||
This section outlines common issues and their solutions.
|
||||
|
||||
### Bandwidth test fails with error
|
||||
|
||||
1. Confirm that the ROCm-optimized `rdma-perftest` build is being used, not the generic `perftest` package.
|
||||
|
||||
```bash
|
||||
which ib_write_bw
|
||||
```
|
||||
|
||||
2. Confirm the `SERVER_IP` is accessible
|
||||
|
||||
```bash
|
||||
ping <SERVER_IP>
|
||||
```
|
||||
|
||||
3. Check the system logs; use `dmesg` for kernel-level errors.
|
||||
|
||||
``` bash
|
||||
sudo dmesg -T | grep -iE 'error|warn|fail|exception'
|
||||
```
|
||||
|
||||
### Slurm job fails
|
||||
|
||||
Common causes and solutions for Slurm job submission failures include:
|
||||
|
||||
1. Shared storage access:
|
||||
* Verify that both `sglang_disagg` and model directories are located in a shared NFS mount accessible to all compute nodes.
|
||||
* Ensure proper permissions: `chmod -R 755 /shared/path/sglang_disagg /shared/path/models`
|
||||
|
||||
2. Log analysis:
|
||||
* Examine `pd_sglang_bench_serving.sh_NODE${NODE_RANK}.log` on each participating node for detailed error messages.
|
||||
* Check for common issues like missing dependencies, GPU allocation failures, or network connectivity problems.
|
||||
|
||||
3. Configuration validation:
|
||||
* Verify SLURM parameters in `run_submit_disagg.sh`:
|
||||
* `SLURM_ACCOUNT`: Ensure your account has access to the cluster
|
||||
* `SLURM_PARTITION`: Confirm the partition exists and is accessible
|
||||
* `MODEL_PATH`: Check that the path is correct and accessible from compute nodes
|
||||
* `MODEL_NAME`: Verify the model subdirectory exists within `MODEL_PATH`
|
||||
* Use `sinfo` to check partition and node availability.
|
||||
@@ -0,0 +1,627 @@
|
||||
# vLLM distributed inference with MoRI
|
||||
|
||||
This document provides a comprehensive guide for setting up a high-performance
|
||||
vLLM serving environment on an AMD Instinct MI300X or MI325X GPU cluster using
|
||||
the [MoRI (Modular RDMA Interface)](https://github.com/rocm/mori) communication
|
||||
backend. It also includes detailed instructions on how to reproduce the
|
||||
benchmark results published in the AMD ROCm blog [Practical, Fault-Robust
|
||||
Distributed Inference for DeepSeek on AMD
|
||||
MI300X](https://rocm.blogs.amd.com/software-tools-optimization/wide-ep-deepseek/README.html).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
The following hardware configuration is required to implement this setup:
|
||||
|
||||
* **Nodes**: A minimum of two GPU nodes (virtual machines or physical machines)
|
||||
for wide expert parallelism (EP) evaluation.
|
||||
* **GPUs**: 8x AMD Instinct MI300X/MI325X GPU cards per node.
|
||||
* **Networking**: 8x NVIDIA Mellanox ConnectX-7 (CX7) NICs per node, providing
|
||||
a dedicated 1:1 mapping between GPUs and network interfaces for optimal
|
||||
inter-node communication.
|
||||
|
||||
## System configuration
|
||||
|
||||
This section outlines infrastructure steps required to prepare your cluster for
|
||||
high-performance AI workloads. It covers validating your system's software
|
||||
baselines and firmware versions, configuring high-bandwidth backend networking
|
||||
for inter-node communication, and establishing shared storage to ensure
|
||||
a synchronized distributed computing environment.
|
||||
|
||||
### Verify baseline software
|
||||
|
||||
This setup has been validated using the **AI/ML Ready Image (ROCm 7-based)** on
|
||||
Digital Ocean AMD GPU Droplets. The following table outlines the software
|
||||
stack versions and appropriate shell commands for verification:
|
||||
|
||||
| Component | Version | Verification command |
|
||||
| :--- | :--- | :--- |
|
||||
| **OS** | Ubuntu 24.04.3 LTS | `cat /etc/os-release` |
|
||||
| **Kernel** | 6.8.0-87-generic | `uname -r` |
|
||||
| **ROCm** | 7.0.2 | `amd-smi version` |
|
||||
| **PLDM bundle (firmware) for MI300X** | 01.25.03.12 | [Verify BKC](#verify-best-known-configuration-bkc) |
|
||||
| **PLDM bundle (firmware) for MI325X** | 01.25.03.03 | [Verify BKC](#verify-best-known-configuration-bkc) |
|
||||
| **CX7 Firmware** | 28.46.3048 | `ethtool -i <interface>` |
|
||||
| **CX7 Driver** | 24.10-3.2.5 | `dkms status` |
|
||||
| **DOCA** | 2.9.3 | `dpkg -l \| grep doca` |
|
||||
|
||||
### Verify best known configuration (BKC)
|
||||
|
||||
The BKC defines a validated configuration of GPU firmware, baseboard firmware,
|
||||
ROCm user space components, the AMD GPU Driver, and virtualization tooling.
|
||||
These components are tested together to attain the best performance and compatibility.
|
||||
|
||||
While AMD publishes the AMD GPU driver and ROCm user space components, your
|
||||
server OEM or infrastructure provider distributes the firmware packages. AMD
|
||||
supplies those firmware images (PLDM bundles), which the OEM integrates and
|
||||
distributes.
|
||||
|
||||
To verify the active BKC and IFWI (Integrated Firmware Image) versions via the
|
||||
Redfish API:
|
||||
|
||||
1. Prepare credentials: Identify your BMC IP, username, and password.
|
||||
2. Run Redfish queries: Use the following commands to check the active
|
||||
firmware inventory.
|
||||
|
||||
``` bash
|
||||
# Define BMC connection variables
|
||||
BMC_IP="<BMC_IP>"
|
||||
AUTH="<username>:<password>"
|
||||
|
||||
# Query active BKC bundle version
|
||||
curl -X GET "https://${BMC_IP}/redfish/v1/UpdateService/FirmwareInventory/bundle_active" \
|
||||
-u "${AUTH}" -k | json_pp
|
||||
|
||||
# Query active IFWI (Integrated Firmware Image)
|
||||
curl -X GET "https://${BMC_IP}/redfish/v1/UpdateService/FirmwareInventory/firmware_active" \
|
||||
-u "${AUTH}" -k | json_pp
|
||||
```
|
||||
|
||||
### Run basic system health checks
|
||||
|
||||
Before proceeding with software deployment, verify that all cluster nodes
|
||||
comply with the [MI300X Basic Health
|
||||
Checks](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/gpus/mi300x.html#basic-health-checks)
|
||||
or [MI325X Basic Health
|
||||
Checks](https://instinct.docs.amd.com/projects/system-acceptance/en/latest/gpus/mi325x.html#basic-health-checks).
|
||||
Key requirements include specific kernel boot arguments, minimum system memory
|
||||
thresholds, PCIe Gen5 link stability, and so on.
|
||||
|
||||
### Configure your backend network (netplan)
|
||||
|
||||
Configure the backend NICs for high-bandwidth inter-node communication. Suppose
|
||||
the node's eight GPU-attached network interface controllers (NICs) are `eth2` through `eth9`. Each NIC
|
||||
must have its own subnet that is disjoint from the others. For example, `eth2`
|
||||
could use `192.168.50.0/24`, `eth3` could use `192.168.51.0/24`, and so on.
|
||||
Each node needs a unique IP address on each subnet. You should use the same
|
||||
final octet in each subnet for a given node. For example, one node would have
|
||||
the addresses `192.168.50.2`, `192.168.51.2`, and so on. Another node might
|
||||
have `192.168.50.3`, `192.168.51.3`, and so on. Ensure MTU is set to `4200`.
|
||||
|
||||
```{note}
|
||||
Ensure you identify the correct interface names for your system using `ip link`
before applying this configuration.
|
||||
```
|
||||
|
||||
For example, your `/etc/netplan/50-backend.yaml` might include something like
|
||||
the following:
|
||||
|
||||
```yaml
|
||||
eth2:
|
||||
dhcp4: false
|
||||
dhcp6: false
|
||||
link-local: []
|
||||
addresses:
|
||||
- 192.168.50.2/24
|
||||
mtu: 4200
|
||||
eth3:
|
||||
dhcp4: false
|
||||
dhcp6: false
|
||||
link-local: []
|
||||
addresses:
|
||||
- 192.168.51.2/24
|
||||
mtu: 4200
|
||||
eth4:
|
||||
dhcp4: false
|
||||
dhcp6: false
|
||||
link-local: []
|
||||
addresses:
|
||||
- 192.168.52.2/24
|
||||
mtu: 4200
|
||||
eth5:
|
||||
dhcp4: false
|
||||
dhcp6: false
|
||||
link-local: []
|
||||
addresses:
|
||||
- 192.168.53.2/24
|
||||
mtu: 4200
|
||||
eth6:
|
||||
dhcp4: false
|
||||
dhcp6: false
|
||||
link-local: []
|
||||
addresses:
|
||||
- 192.168.54.2/24
|
||||
mtu: 4200
|
||||
eth7:
|
||||
dhcp4: false
|
||||
dhcp6: false
|
||||
link-local: []
|
||||
addresses:
|
||||
- 192.168.55.2/24
|
||||
mtu: 4200
|
||||
eth8:
|
||||
dhcp4: false
|
||||
dhcp6: false
|
||||
link-local: []
|
||||
addresses:
|
||||
- 192.168.56.2/24
|
||||
mtu: 4200
|
||||
eth9:
|
||||
dhcp4: false
|
||||
dhcp6: false
|
||||
link-local: []
|
||||
addresses:
|
||||
- 192.168.57.2/24
|
||||
mtu: 4200
|
||||
```
|
||||
|
||||
To apply the configuration, use the following command.
|
||||
|
||||
```bash
|
||||
sudo netplan apply
|
||||
```
|
||||
|
||||
To verify your configuration, use the following command.
|
||||
|
||||
```bash
|
||||
sudo apt install -y net-tools && ip -br a
|
||||
```
|
||||
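
To quickly confirm that every backend interface came up with the intended MTU, you can loop over them. This sketch assumes the interfaces are named `eth2` through `eth9`, as in the example configuration above.

```bash
# Each backend NIC should report mtu 4200
for i in {2..9}; do
    ip -o link show "eth${i}" | awk '{print $2, $4, $5}'
done
```
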
|
||||
### Configure your network file system (NFS)
|
||||
|
||||
Setting up a shared NFS volume facilitates centralized storage for models,
|
||||
recipes, and logs across the cluster. Use the following commands to install the
|
||||
necessary client tools and mount the remote directory.
|
||||
|
||||
```{important}
|
||||
Replace `nfs_server_ip:/shared/folder` and `/mount/point` with your specific
|
||||
server details and desired local mount path.
|
||||
```
|
||||
|
||||
``` bash
|
||||
sudo apt update && sudo apt install -y nfs-common
|
||||
sudo mkdir -p /mount/point
|
||||
sudo mount -t nfs nfs_server_ip:/shared/folder /mount/point
|
||||
echo "nfs_server_ip:/shared/folder /mount/point nfs _netdev,nofail,x-systemd.automount,x-systemd.idle-timeout=600,vers=4.2 0 0" | sudo tee -a /etc/fstab
|
||||
```
|
||||
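
To confirm that the share is mounted and will persist across reboots, run a quick check against the example mount point used above.

```bash
# The mount should appear as type nfs4 with the options added to /etc/fstab
findmnt /mount/point
df -h /mount/point
```
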
|
||||
### Configure static hostname resolution for backend initialization (optional)
|
||||
|
||||
If the high-speed RDMA/IB interfaces are used for the initial distributed
|
||||
coordination (such as `MASTER_ADDR`), you must configure static hostname
|
||||
resolution. This ensures that cluster host names resolve to the backend network
|
||||
IPs rather than the management or local loopback addresses.
|
||||
|
||||
Follow these steps to configure static hostname resolution:
|
||||
|
||||
1. Edit `/etc/hosts` on all nodes, for example, using `sudo vim /etc/hosts`.
|
||||
2. Add the backend IP and hostname mappings.
|
||||
3. Comment out any default local mappings (such as `127.0.1.1`) for the current
|
||||
hostname to avoid resolution conflicts.
|
||||
|
||||
For example, your `/etc/hosts` entries might look like:
|
||||
|
||||
```text
|
||||
# Map host names to backend network IPs
|
||||
192.168.50.2 mori_test_01
|
||||
192.168.50.3 mori_test_02
|
||||
|
||||
# Comment out the default entry to ensure resolution via the backend IP
|
||||
# 127.0.1.1 mori_test_01 mori_test_01
|
||||
```
|
||||
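
To confirm that the names now resolve to the backend IPs on every node, you can use `getent`, which queries the same resolution path that applications use. The host names below are just the examples from the snippet above.

```bash
# Should print the backend IPs (192.168.50.x), not 127.0.1.1
getent hosts mori_test_01 mori_test_02

# Optional reachability check from mori_test_01
ping -c 1 mori_test_02
```
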
|
||||
## Software installation
|
||||
|
||||
Next, install the essential software stack required to operate the AMD Instinct
|
||||
GPUs and high-speed networking components. Follow these steps to deploy the
|
||||
NVIDIA DOCA drivers for Mellanox ConnectX-7 NICs, the ROCm software stack, and
|
||||
the necessary kernel modules to enable hardware acceleration.
|
||||
|
||||
### Install CX7 driver and firmware
|
||||
|
||||
1. Download and install the `DOCA 2.9.3` driver following the instructions in
|
||||
[NVIDIA DOCA 2.9.3
|
||||
Downloads](https://developer.nvidia.com/doca-2-9-3-download-archive?deployment_platform=Host-Server&deployment_package=DOCA-Host&target_os=Linux&Architecture=x86_64&Profile=doca-all&Distribution=Ubuntu&version=24.04&installer_type=deb_local).
|
||||
|
||||
2. Download the appropriate firmware for your hardware PSID from the [NVIDIA
|
||||
official website](https://network.nvidia.com/support/firmware/connectx7/)
|
||||
and flash the device.
|
||||
|
||||
3. To verify driver and firmware versions, use the following command. Replace
|
||||
`IB Device` with your specific backend interface.
|
||||
|
||||
```bash
|
||||
ethtool -i <IB Device>
|
||||
```
|
||||
|
||||
### Install ROCm
|
||||
|
||||
Use the following commands to quickly install ROCm 7.0.2 on Ubuntu 24.04:
|
||||
|
||||
``` bash
|
||||
wget https://repo.radeon.com/amdgpu-install/7.0.2/ubuntu/noble/amdgpu-install_7.0.2.70002-1_all.deb
|
||||
sudo apt install ./amdgpu-install_7.0.2.70002-1_all.deb
|
||||
sudo apt update
|
||||
sudo apt install python3-setuptools python3-wheel
|
||||
sudo usermod -a -G render,video $LOGNAME # Add the current user to the render and video groups
|
||||
sudo apt install rocm
|
||||
```
|
||||
|
||||
For detailed installation instructions, refer to the [ROCm 7.0.2
|
||||
documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.0.2/install/quick-start.html#rocm-installation).
|
||||
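
After installation (and a reboot, if the kernel driver was updated), a quick sanity check confirms that ROCm sees all eight GPUs. The exact `gfx` target reported depends on your accelerator model.

```bash
amd-smi version                  # should report the ROCm 7.0.2 stack
amd-smi list                     # should enumerate all eight GPUs
rocminfo | grep -i 'name:.*gfx'  # reports the gfx target of each GPU agent
```
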
|
||||
### Install AMD GPU Driver (amdgpu)
|
||||
|
||||
Use the following commands to quickly install the AMD GPU Driver (ROCm 7.0.2) on Ubuntu 24.04:
|
||||
|
||||
``` bash
|
||||
wget https://repo.radeon.com/amdgpu-install/7.0.2/ubuntu/noble/amdgpu-install_7.0.2.70002-1_all.deb
|
||||
sudo apt install ./amdgpu-install_7.0.2.70002-1_all.deb
|
||||
sudo apt update
|
||||
sudo apt install "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)"
|
||||
sudo apt install amdgpu-dkms
|
||||
```
|
||||
|
||||
For detailed installation instructions, refer to the [ROCm 7.0.2
|
||||
documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.0.2/install/quick-start.html#amdgpu-driver-installation).
|
||||
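
Once the DKMS module has built, verify that it registered and is loaded. `dkms status` is the same command referenced in the baseline software table above.

```bash
dkms status | grep amdgpu   # the amdgpu module should show as installed
lsmod | grep -w amdgpu      # the module should be loaded after a reboot
```
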
|
||||
## Network verification and testing
|
||||
|
||||
Before deploying the inference engine, validate the health and performance of
|
||||
the cluster interconnects.
|
||||
|
||||
### Verify network connectivity
|
||||
|
||||
Verify that all network interfaces are reachable across the cluster nodes.
|
||||
Assuming `eth0` is the management interface, `eth1` is for the VPC, and `eth2`
|
||||
through `eth9` are the dedicated RoCE backend interfaces, use the following
|
||||
loop to test reachability to a remote node (for instance, a target node with
|
||||
host IP suffix `.3`).
|
||||
|
||||
```bash
|
||||
# Test connectivity for RoCE subnets 192.168.50.x through 192.168.57.x
|
||||
for i in {0..7}; do ping -c 1 192.168.5${i}.3; done
|
||||
```
|
||||
|
||||
### Validate your RDMA setup
|
||||
|
||||
Confirm that all eight RDMA network interfaces are in `UP` state. Verify the MTU
|
||||
setting of `4096` and ensure each device has a valid GID mapped to its assigned
|
||||
IP address.
|
||||
|
||||
``` bash
|
||||
ibv_devinfo -v
|
||||
```
|
||||
|
||||
The output should look something like this:
|
||||
|
||||
``` bash
|
||||
hca_id: mlx5_0
|
||||
transport: InfiniBand (0)
|
||||
fw_ver: 28.46.3048
|
||||
...
|
||||
board_id: MT_0000000838
|
||||
phys_port_cnt: 1
|
||||
port: 1
|
||||
state: PORT_ACTIVE (4)
|
||||
max_mtu: 4096 (5)
|
||||
active_mtu: 4096 (5)
|
||||
sm_lid: 0
|
||||
port_lid: 0
|
||||
port_lmc: 0x00
|
||||
link_layer: Ethernet
|
||||
...
|
||||
GID[ 0]: fe80:0000:0000:0000:d894:24ff:fe4a:96e2, RoCE v1
|
||||
GID[ 1]: fe80::d894:24ff:fe4a:96e2, RoCE v2
|
||||
GID[ 2]: 0000:0000:0000:0000:0000:ffff:c0a8:3903, RoCE v1
|
||||
GID[ 3]: ::ffff:192.168.57.3, RoCE v2
|
||||
```
|
||||
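
With eight HCAs, it is easier to scan the port state, MTU, and firmware of all devices at once rather than reading the full verbose output. A minimal filter:

```bash
# Every device should show fw_ver 28.46.3048, state PORT_ACTIVE, and active_mtu 4096
ibv_devinfo -v | grep -E 'hca_id|fw_ver|state:|active_mtu'
```
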
|
||||
### Run RDMA bandwidth benchmarks
|
||||
|
||||
Verify the inter-node RDMA performance to ensure the network fabric can
|
||||
saturate the link bandwidth.
|
||||
|
||||
#### Install RDMA performance tools
|
||||
|
||||
To get started, build the ROCm-optimized `rdma-perftest` test suite from
|
||||
source:
|
||||
|
||||
```bash
|
||||
sudo apt install -y libibumad-dev libpci-dev libibverbs-dev librdmacm-dev ibverbs-utils libtool
|
||||
git clone https://github.com/ROCm/rdma-perftest
|
||||
cd rdma-perftest/
|
||||
./autogen.sh
|
||||
./configure --enable-rocm --with-rocm=/opt/rocm
|
||||
make -j$(nproc)
|
||||
sudo make install
|
||||
```
|
||||
|
||||
#### Run a bandwidth test (GPU memory)
|
||||
|
||||
Perform a bandwidth test using ROCm GPU memory between two nodes. One acts
|
||||
as a server and the other acts as a client. For 400G interfaces, the expected
|
||||
peak throughput is approximately 390 Gbps. Replace `<SERVER_IP>` with the
|
||||
appropriate IP.
|
||||
|
||||
```bash
|
||||
# On Server Node
|
||||
./ib_write_bw --use_rocm=0 -d mlx5_0 --report_gbits -a
|
||||
|
||||
# On Client Node
|
||||
./ib_write_bw --use_rocm=0 -d mlx5_0 --report_gbits -a <SERVER_IP>
|
||||
```
|
||||
|
||||
## vLLM serving and MoRI unit tests
|
||||
|
||||
### Install Docker Engine
|
||||
|
||||
Install the Docker engine to manage the containerized vLLM and MoRI serving
|
||||
environments.
|
||||
|
||||
```bash
|
||||
sudo apt update && sudo apt install -y docker.io
|
||||
```
|
||||
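
Optionally, add your user to the `docker` group so the commands that follow do not require `sudo`. Log out and back in (or start a new shell) for the group change to take effect.

```bash
sudo usermod -aG docker $USER
```
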
|
||||
### Download the DeepSeek PTPC model
|
||||
|
||||
This guide uses the
|
||||
[DeepSeek-R1-FP8-Dynamic](https://huggingface.co/EmbeddedLLM/deepseek-r1-FP8-Dynamic)
|
||||
model optimized for PTPC. Use the following commands to install the Hugging
|
||||
Face CLI and download the model to your shared NFS directory:
|
||||
|
||||
```bash
|
||||
# Set up a virtual environment and install the Hugging Face CLI
|
||||
sudo apt update && sudo apt install -y python3-venv
|
||||
python3 -m venv ~/venvs/hf
|
||||
source ~/venvs/hf/bin/activate
|
||||
pip install huggingface_hub
|
||||
|
||||
# Download the model to the shared NFS mount point
|
||||
huggingface-cli download --token <your_hf_token> \
|
||||
EmbeddedLLM/deepseek-r1-FP8-Dynamic \
|
||||
--local-dir /mount/point/models/EmbeddedLLM/deepseek-r1-FP8-Dynamic
|
||||
```
|
||||
|
||||
### Launch the serving container
|
||||
|
||||
Deploy the vLLM MoRI serving Docker container on each node.
|
||||
|
||||
```bash
|
||||
CONTAINER_NAME=vllm_mori
|
||||
IMAGE_NAME=aigmkt/vllm:mori_rocm6.4.1_20251105
|
||||
|
||||
docker run -it \
|
||||
--rm \
|
||||
--device /dev/dri --device /dev/kfd --device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v /mount/point/models:/models \
|
||||
--shm-size 128G \
|
||||
--name ${CONTAINER_NAME} \
|
||||
${IMAGE_NAME} /bin/bash
|
||||
```
|
||||
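
Before running any tests, confirm from inside the container that the GPUs and RDMA devices were passed through correctly. This assumes the image ships the standard ROCm and rdma-core command-line tools.

```bash
# Run inside the container started above
rocm-smi        # should list all eight GPUs
ibv_devices     # should list all eight RDMA devices
```
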
|
||||
### Run MoRI inter-node unit tests
|
||||
|
||||
Before starting the vLLM service, run the MoRI unit test to verify that the
|
||||
inter-node communication backend is correctly configured.
|
||||
|
||||
The key configuration variables are:
|
||||
|
||||
* `GLOO_SOCKET_IFNAME`: The network interface used for backend initialization, such as `eth2`.
|
||||
* `<MASTER_IP>`: The IP address of the primary node's backend interface.
|
||||
|
||||
```{note}
|
||||
You can find reference performance data in the [ROCm/MoRI
|
||||
repository](https://github.com/ROCm/mori?tab=readme-ov-file#mori-ep).
|
||||
```
|
||||
|
||||
```bash
|
||||
# Set up environment inside the container
|
||||
cd /app/mori
|
||||
export PYTHONPATH=/app/mori:$PYTHONPATH
|
||||
export GLOO_SOCKET_IFNAME=<BACKEND_INTERFACE>
|
||||
|
||||
# Node 0 (Primary)
|
||||
torchrun --nnodes=2 --node_rank=0 --nproc_per_node=1 \
|
||||
--master_addr="<MASTER_IP>" --master_port=1234 \
|
||||
examples/ops/dispatch_combine/test_dispatch_combine_internode.py \
|
||||
--cmd bench --kernel-type v1
|
||||
|
||||
# Node 1 (Secondary)
|
||||
torchrun --nnodes=2 --node_rank=1 --nproc_per_node=1 \
|
||||
--master_addr="<MASTER_IP>" --master_port=1234 \
|
||||
examples/ops/dispatch_combine/test_dispatch_combine_internode.py \
|
||||
--cmd bench --kernel-type v1
|
||||
```
|
||||
|
||||
### Deploy and serve the model
|
||||
|
||||
To deploy DeepSeek-R1 (PTPC) with Expert Parallelism 16 (EP16) across two
|
||||
nodes, use the following serving scripts.
|
||||
|
||||
#### Create serving scripts
|
||||
|
||||
Create the following scripts inside the container on each node.
|
||||
|
||||
* Node 0 (master node): `ep16_node0.sh`
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
|
||||
# Add VLLM_ENFORCE_EPLB=1 to enforce EP balance
|
||||
export VLLM_ROCM_USE_AITER=1
|
||||
export VLLM_ROCM_USE_AITER_MOE=1
|
||||
export VLLM_LOGGING_LEVEL=INFO
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_ROCM_USE_AITER_MLA=1
|
||||
export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0
|
||||
export VLLM_ALL2ALL_BACKEND=mori
|
||||
|
||||
vllm serve /models/EmbeddedLLM/deepseek-r1-FP8-Dynamic/ \
|
||||
-dp 16 \
|
||||
--enable-expert-parallel \
|
||||
--data-parallel-size-local 8 \
|
||||
--data-parallel-address ${IP} \
|
||||
--data-parallel-rpc-port 1212 \
|
||||
--served-model-name deepseek \
|
||||
--port 8777 \
|
||||
--block-size 1 \
|
||||
--distributed-executor-backend mp \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--max-model-len 8192 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--max-num-seqs 4096 \
|
||||
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "custom_ops": ["+quant_fp8"]}' \
|
||||
--cuda-graph-sizes 1 2 4 8 16 32 64 128 256 \
|
||||
--kv-cache-dtype fp8 \
|
||||
--no-enable-prefix-caching \
|
||||
--trust-remote-code 2>&1 | tee serving_node0_ep16.log
|
||||
```
|
||||
|
||||
* Node 1: `ep16_node1.sh`
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
|
||||
# Add VLLM_ENFORCE_EPLB=1 to enforce EP balance
|
||||
export VLLM_ROCM_USE_AITER=1
|
||||
export VLLM_ROCM_USE_AITER_MOE=1
|
||||
export VLLM_LOGGING_LEVEL=INFO
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_ROCM_USE_AITER_MLA=1
|
||||
export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0
|
||||
export VLLM_ALL2ALL_BACKEND=mori
|
||||
|
||||
vllm serve /models/EmbeddedLLM/deepseek-r1-FP8-Dynamic/ \
|
||||
-dp 16 \
|
||||
--enable-expert-parallel \
|
||||
--headless \
|
||||
--data-parallel-size-local 8 \
|
||||
--data-parallel-start-rank 8 \
|
||||
--data-parallel-address ${IP} \
|
||||
--data-parallel-rpc-port 1212 \
|
||||
--served-model-name deepseek \
|
||||
--port 8777 \
|
||||
--block-size 1 \
|
||||
--distributed-executor-backend mp \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--max-model-len 8192 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--max-num-seqs 4096 \
|
||||
--compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "custom_ops": ["+quant_fp8"]}' \
|
||||
--cuda-graph-sizes 1 2 4 8 16 32 64 128 256 \
|
||||
--kv-cache-dtype fp8 \
|
||||
--no-enable-prefix-caching \
|
||||
--trust-remote-code 2>&1 | tee serving_node1_ep16.log
|
||||
```
|
||||
|
||||
#### Run the serving scripts
|
||||
|
||||
Run the scripts on each node to launch the distributed serving instance.
|
||||
Replace `<MASTER_IP>` with the backend network IP of Node 0.
|
||||
|
||||
```bash
|
||||
# On Node 0 (Primary)
|
||||
export NCCL_SOCKET_IFNAME=<BACKEND_INTERFACE>
|
||||
export GLOO_SOCKET_IFNAME=<BACKEND_INTERFACE>
|
||||
IP=<MASTER_IP> bash ep16_node0.sh
|
||||
|
||||
# On Node 1 (Secondary)
|
||||
export NCCL_SOCKET_IFNAME=<BACKEND_INTERFACE>
|
||||
export GLOO_SOCKET_IFNAME=<BACKEND_INTERFACE>
|
||||
IP=<MASTER_IP> bash ep16_node1.sh
|
||||
```
|
||||
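
Once both nodes report that the engine is ready, you can smoke-test the API server on Node 0. The request below assumes the served model name `deepseek` and port `8777` configured in the scripts above.

```bash
# Query the OpenAI-compatible endpoint exposed by vLLM on Node 0
curl -s http://localhost:8777/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "deepseek",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 32
      }'
```
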
|
||||
## Reproducing performance
|
||||
|
||||
This section details how to reproduce the performance metrics published in the
|
||||
AMD ROCm Blog: [Practical, Fault-Robust Distributed Inference for DeepSeek on
|
||||
AMD
|
||||
MI300X](https://rocm.blogs.amd.com/software-tools-optimization/wide-ep-deepseek/README.html).
|
||||
|
||||
### Configuration for EP16 (16 GPUs)
|
||||
|
||||
To achieve the reported throughput, expert parallelism 16 (EP16) is used across
|
||||
the decode nodes.
|
||||
|
||||
#### Benchmark target
|
||||
|
||||
* Decode throughput: ~12.4k output tokens/s per node.
|
||||
|
||||
### Performance reproduction commands
|
||||
|
||||
Use the following configurations to reproduce published performance metrics.
|
||||
|
||||
#### Decode benchmark
|
||||
|
||||
To reproduce the 12.4k output tokens/s, use the following configuration:
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
|
||||
MAX_CONCURRENCY=${1:-3072}
|
||||
TIMES=2
|
||||
NUM_PROMPTS=$((MAX_CONCURRENCY*TIMES))
|
||||
vllm bench serve \
|
||||
--max-concurrency $MAX_CONCURRENCY \
|
||||
--num-prompts $NUM_PROMPTS \
|
||||
--model /models/EmbeddedLLM/deepseek-r1-FP8-Dynamic/ \
|
||||
--served-model-name deepseek \
|
||||
--port 8777 \
|
||||
--ignore-eos \
|
||||
--trust-remote-code \
|
||||
--dataset-name random \
|
||||
--seed 2025 \
|
||||
--random-input-len 2048 \
|
||||
--random-output-len 1024 2>&1 | tee bench_decode_${MAX_CONCURRENCY}_isl_2k_osl_1k.log
|
||||
```
|
||||
|
||||
To calculate the per-node throughput for comparison with the blog data, take
|
||||
the reported **Peak output token throughput (tok/s)** from the benchmark
|
||||
results and divide it by the total number of nodes in the cluster.
|
||||
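
For example, for the two-node EP16 setup described above, the per-node figure can be computed directly from the benchmark log. The log name and node count below are assumptions based on the decode benchmark script; adjust them, and the label pattern if your vLLM version reports the metric differently.

```bash
NODES=2
LOG=bench_decode_3072_isl_2k_osl_1k.log

# Divide the peak output token throughput by the number of nodes
awk -v n="$NODES" '/Peak output token throughput/ \
    {printf "Per-node output throughput: %.2f tok/s\n", $NF / n}' "$LOG"
```
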
|
||||
## Troubleshooting
|
||||
|
||||
This section outlines common issues and their solutions.
|
||||
|
||||
### Bandwidth test fails with error
|
||||
|
||||
1. Confirm that the ROCm-optimized `rdma-perftest` build is being used, not the generic `perftest` package.
|
||||
|
||||
``` bash
|
||||
which ib_write_bw
|
||||
```
|
||||
|
||||
2. Confirm the `SERVER_IP` is accessible.
|
||||
|
||||
``` bash
|
||||
ping <SERVER_IP>
|
||||
```
|
||||
|
||||
3. Check the system logs; use `dmesg` for kernel-level errors.
|
||||
|
||||
``` bash
|
||||
sudo dmesg -T | grep -iE 'error|warn|fail|exception'
|
||||
```
|
||||
|
||||
### vLLM EP 16 with MoRI backend fails to launch
|
||||
|
||||
1. Error: `Waiting for init message from front-end.` Check connectivity to the master node `IP`. Disable the firewall and SELinux, or allow traffic on port `1212`.
|
||||
|
||||
2. Verify hostname resolution. Ensure the node hostnames are correctly mapped to the backend IPs in `/etc/hosts`.
|
||||
|
||||
3. Confirm that the environment variable `GLOO_SOCKET_IFNAME` is set before running the vLLM serving script.
|
||||
@@ -26,6 +26,12 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
|
||||
|
||||
- :doc:`SGLang inference performance testing <benchmark-docker/sglang>`
|
||||
|
||||
- :doc:`vLLM distributed inference with MoRI <benchmark-docker/vllm-mori-distributed>`
|
||||
|
||||
- :doc:`SGLang distributed inference with MoRI <benchmark-docker/sglang-mori-distributed>`
|
||||
|
||||
- :doc:`SGLang distributed inference with Mooncake <benchmark-docker/sglang-distributed>`
|
||||
|
||||
- :doc:`xDiT diffusion inference <xdit-diffusion-inference>`
|
||||
|
||||
- :doc:`Deploying your model <deploy-your-model>`
|
||||
|
||||
@@ -31,16 +31,16 @@ in the Instinct documentation for more information.
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
|
||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
||||
Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
|
||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
||||
You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
|
||||
|
||||
Run the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
amd-smi set --perf-determinism 1900
|
||||
|
||||
See `Hardware verification for ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_
|
||||
in the Instinct documentation for more information.
|
||||
|
||||
@@ -108,16 +108,16 @@ for more information.
|
||||
Hardware verification with ROCm
|
||||
-------------------------------
|
||||
|
||||
Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
|
||||
Use the command ``amd-smi set --perf-determinism 1900`` to set the max clock speed up to 1900 MHz
|
||||
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
|
||||
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
|
||||
You can restore this setting to its default value with the ``rocm-smi -r`` command.
|
||||
You can restore this setting to its default value with the ``amd-smi reset --clocks`` command.
|
||||
|
||||
Run the command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocm-smi --setperfdeterminism 1900
|
||||
amd-smi set --perf-determinism 1900
|
||||
|
||||
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.
|
||||
|
||||
@@ -248,7 +248,7 @@ Download the Docker image and required packages
|
||||
Checking out this specific commit is recommended for a stable and reproducible environment.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
|
||||
git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
|
||||
|
||||
Prepare training datasets
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
(artificial-intelligence-apis)=
|
||||
|
||||
* {doc}`Composable Kernel <composable_kernel:index>`
|
||||
* {doc}`hipDNN <hipdnn:index>`
|
||||
* {doc}`MIGraphX <amdmigraphx:index>`
|
||||
* {doc}`MIOpen <miopen:index>`
|
||||
* {doc}`MIVisionX <mivisionx:index>`
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
GPU hardware specifications
|
||||
===========================================
|
||||
|
||||
The following tables provide an overview of the hardware specifications for AMD Instinct™ GPUs, and AMD Radeon™ PRO and Radeon™ GPUs.
|
||||
The following tables provide an overview of the hardware specifications for AMD Instinct™ GPUs, AMD Radeon™ PRO and Radeon™ GPUs, and AMD Ryzen™ APUs.
|
||||
|
||||
For more information about ROCm hardware compatibility, see the ROCm `Compatibility matrix <https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html>`_.
|
||||
|
||||
@@ -18,7 +18,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
:name: instinct-arch-spec-table
|
||||
|
||||
*
|
||||
- Model
|
||||
- Name
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- VRAM (GiB)
|
||||
@@ -297,7 +297,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
:name: radeon-pro-arch-spec-table
|
||||
|
||||
*
|
||||
- Model
|
||||
- Name
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
|
||||
@@ -333,6 +333,24 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 12
|
||||
- 0
|
||||
*
|
||||
- Radeon AI PRO R9600D
|
||||
- RDNA4
|
||||
- gfx1201
|
||||
- 32
|
||||
- 48
|
||||
- 32 or 64
|
||||
- 128
|
||||
- 48
|
||||
- 8
|
||||
- N/A
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 768
|
||||
- 32
|
||||
- 12
|
||||
- 0
|
||||
*
|
||||
- Radeon PRO V710
|
||||
- RDNA3
|
||||
@@ -539,7 +557,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
:name: radeon-arch-spec-table
|
||||
|
||||
*
|
||||
- Model
|
||||
- Name
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- VRAM (GiB)
|
||||
@@ -610,6 +628,24 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 12
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 9060 XT LP
|
||||
- RDNA4
|
||||
- gfx1200
|
||||
- 16
|
||||
- 32
|
||||
- 32 or 64
|
||||
- 128
|
||||
- 32
|
||||
- 4
|
||||
- N/A
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 768
|
||||
- 32
|
||||
- 12
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 9060 XT
|
||||
- RDNA4
|
||||
@@ -718,6 +754,24 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 32
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7700
|
||||
- RDNA3
|
||||
- gfx1101
|
||||
- 16
|
||||
- 40
|
||||
- 32 or 64
|
||||
- 128
|
||||
- 64
|
||||
- 4
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 768
|
||||
- 32
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- Radeon RX 7700 XT
|
||||
- RDNA3
|
||||
@@ -953,6 +1007,127 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
||||
- 9
|
||||
- 0
|
||||
|
||||
.. tab-item:: AMD Ryzen APUs
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:name: ryzen-arch-spec-table
|
||||
|
||||
*
|
||||
- Name
|
||||
- Graphics model
|
||||
- Architecture
|
||||
- LLVM target name
|
||||
- VRAM (GiB)
|
||||
- Compute Units
|
||||
- Wavefront Size
|
||||
- LDS (KiB)
|
||||
- Infinity Cache (MiB)
|
||||
- L2 Cache (MiB)
|
||||
- Graphics L1 Cache (KiB)
|
||||
- L0 Vector Cache (KiB)
|
||||
- L0 Scalar Cache (KiB)
|
||||
- L0 Instruction Cache (KiB)
|
||||
- VGPR File (KiB)
|
||||
- SGPR File (KiB)
|
||||
- GFXIP Major version
|
||||
- GFXIP Minor version
|
||||
*
|
||||
- AMD Ryzen 7 7840U
|
||||
- Radeon 780M
|
||||
- RDNA3
|
||||
- gfx1103
|
||||
- Dynamic + carveout
|
||||
- 12
|
||||
- 32 or 64
|
||||
- 128
|
||||
- N/A
|
||||
- 2
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 512
|
||||
- 32
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- AMD Ryzen 9 270
|
||||
- Radeon 780M
|
||||
- RDNA3
|
||||
- gfx1103
|
||||
- Dynamic + carveout
|
||||
- 12
|
||||
- 32 or 64
|
||||
- 128
|
||||
- N/A
|
||||
- 2
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 512
|
||||
- 32
|
||||
- 11
|
||||
- 0
|
||||
*
|
||||
- AMD Ryzen AI 9 HX 375
|
||||
- Radeon 890M
|
||||
- RDNA3.5
|
||||
- gfx1150
|
||||
- Dynamic + carveout
|
||||
- 16
|
||||
- 32 or 64
|
||||
- 128
|
||||
- N/A
|
||||
- 2
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 512
|
||||
- 32
|
||||
- 11
|
||||
- 5
|
||||
*
|
||||
- AMD Ryzen AI Max+ PRO 395
|
||||
- Radeon 8060S
|
||||
- RDNA3.5
|
||||
- gfx1151
|
||||
- Dynamic + carveout
|
||||
- 40
|
||||
- 32 or 64
|
||||
- 128
|
||||
- 32
|
||||
- 2
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 768
|
||||
- 32
|
||||
- 11
|
||||
- 5
|
||||
*
|
||||
- AMD Ryzen AI 7 350
|
||||
- Radeon 860M
|
||||
- RDNA3.5
|
||||
- gfx1152
|
||||
- Dynamic + carveout
|
||||
- 8
|
||||
- 32 or 64
|
||||
- 128
|
||||
- N/A
|
||||
- 1
|
||||
- 256
|
||||
- 32
|
||||
- 16
|
||||
- 32
|
||||
- 512
|
||||
- 32
|
||||
- 11
|
||||
- 5
|
||||
|
||||
Glossary
|
||||
========
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
| Version | Release date |
|
||||
| ------- | ------------ |
|
||||
| [7.2.0](https://rocm.docs.amd.com/en/docs-7.2.0/) | January 21, 2026 |
|
||||
| [7.1.1](https://rocm.docs.amd.com/en/docs-7.1.1/) | November 26, 2025 |
|
||||
| [7.1.0](https://rocm.docs.amd.com/en/docs-7.1.0/) | October 30, 2025 |
|
||||
| [7.0.2](https://rocm.docs.amd.com/en/docs-7.0.2/) | October 10, 2025 |
|
||||
|
||||
@@ -25,7 +25,7 @@ subtrees:
|
||||
title: HIP SDK on Windows
|
||||
- url: https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/index.html
|
||||
title: ROCm on Radeon and Ryzen
|
||||
- file: how-to/deep-learning-rocm.md
|
||||
- file: how-to/deep-learning-rocm
|
||||
title: Deep learning frameworks
|
||||
subtrees:
|
||||
- entries:
|
||||
@@ -119,6 +119,10 @@ subtrees:
|
||||
title: PyTorch inference performance testing
|
||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
|
||||
title: SGLang inference performance testing
|
||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/vllm-mori-distributed.md
|
||||
title: vLLM distributed inference with MoRI
|
||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-mori-distributed.md
|
||||
title: SGLang distributed inference with MoRI
|
||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
|
||||
title: SGLang distributed inference with Mooncake
|
||||
- file: how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
rocm-docs-core==1.31.2
|
||||
rocm-docs-core==1.31.3
|
||||
sphinx-reredirects
|
||||
sphinx-sitemap
|
||||
sphinxcontrib.datatemplates==0.11.0
|
||||
|
||||
@@ -188,7 +188,7 @@ requests==2.32.5
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.31.2
|
||||
rocm-docs-core==1.31.3
|
||||
# via -r requirements.in
|
||||
rpds-py==0.30.0
|
||||
# via
|
||||
@@ -214,6 +214,7 @@ sphinx==8.1.3
|
||||
# sphinx-design
|
||||
# sphinx-external-toc
|
||||
# sphinx-last-updated-by-git
|
||||
# sphinx-multitoc-numbering
|
||||
# sphinx-notfound-page
|
||||
# sphinx-reredirects
|
||||
# sphinxcontrib-datatemplates
|
||||
@@ -224,10 +225,12 @@ sphinx-copybutton==0.5.2
|
||||
# via rocm-docs-core
|
||||
sphinx-design==0.6.1
|
||||
# via rocm-docs-core
|
||||
sphinx-external-toc==1.0.1
|
||||
sphinx-external-toc==1.1.0
|
||||
# via rocm-docs-core
|
||||
sphinx-last-updated-by-git==0.3.8
|
||||
# via sphinx-sitemap
|
||||
sphinx-multitoc-numbering==0.1.3
|
||||
# via sphinx-external-toc
|
||||
sphinx-notfound-page==1.1.0
|
||||
# via rocm-docs-core
|
||||
sphinx-reredirects==0.1.6
|
||||
|
||||
@@ -36,6 +36,7 @@ Machine Learning & Computer Vision
|
||||
:header: "Component", "Description"
|
||||
|
||||
":doc:`Composable Kernel <composable_kernel:index>`", "Provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures"
|
||||
":doc:`hipDNN <hipdnn:index>`", "A graph-based deep learning library that enables multi-operation fusion for improved performance on AMD GPUs. "
|
||||
":doc:`MIGraphX <amdmigraphx:index>`", "Graph inference engine that accelerates machine learning model inference"
|
||||
":doc:`MIOpen <miopen:index>`", "An open source deep-learning library"
|
||||
":doc:`MIVisionX <mivisionx:index>`", "Set of comprehensive computer vision and machine learning libraries, utilities, and applications"
|
||||
|
||||
44
tools/rocm-build/rocm-7.2.0.xml
Normal file
@@ -0,0 +1,44 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<manifest>
|
||||
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
|
||||
<default revision="refs/tags/rocm-7.2.0"
|
||||
remote="rocm-org"
|
||||
sync-c="true"
|
||||
sync-j="4" />
|
||||
<!--list of projects for ROCm-->
|
||||
<project name="ROCK-Kernel-Driver" />
|
||||
<project name="rocm_bandwidth_test" />
|
||||
<project name="rocm-examples" />
|
||||
<!--HIP Projects-->
|
||||
<project name="HIPIFY" />
|
||||
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
|
||||
<project name="half" />
|
||||
<project name="llvm-project" />
|
||||
<project name="spirv-llvm-translator" />
|
||||
<!-- gdb projects -->
|
||||
<project name="ROCdbgapi" />
|
||||
<project name="ROCgdb" />
|
||||
<project name="rocr_debug_agent" />
|
||||
<!-- ROCm Libraries -->
|
||||
<project groups="mathlibs" name="AMDMIGraphX" />
|
||||
<project groups="mathlibs" name="MIVisionX" />
|
||||
<project groups="mathlibs" name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="composable_kernel" />
|
||||
<project groups="mathlibs" name="hipfort" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
<project groups="mathlibs" name="rocAL" />
|
||||
<project groups="mathlibs" name="rocALUTION" />
|
||||
<project groups="mathlibs" name="rocDecode" />
|
||||
<project groups="mathlibs" name="rocJPEG" />
|
||||
<project groups="mathlibs" name="rocm-libraries" />
|
||||
<project groups="mathlibs" name="rocm-systems" />
|
||||
<project groups="mathlibs" name="rocPyDecode" />
|
||||
<project groups="mathlibs" name="rocSHMEM" />
|
||||
<project groups="mathlibs" name="rocm-cmake" />
|
||||
<project groups="mathlibs" name="rpp" />
|
||||
<project groups="mathlibs" name="TransferBench" />
|
||||
<!-- Projects for OpenMP-Extras -->
|
||||
<project name="aomp" path="openmp-extras/aomp" />
|
||||
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
|
||||
<project name="flang" path="openmp-extras/flang" />
|
||||
</manifest>
|
||||