Compare commits
9 Commits
amd/hsivas
...
roc-6.0.x
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b597ecf20b | ||
|
|
f47a9d26c7 | ||
|
|
5ca1150c3d | ||
|
|
35231ec9b9 | ||
|
|
444896fb65 | ||
|
|
995135acf7 | ||
|
|
acf04f2969 | ||
|
|
893ee4a56b | ||
|
|
252dae76a7 |
11
.github/CODEOWNERS
vendored
@@ -1,8 +1,5 @@
|
||||
* @amd-aakash @jlgreathouse @samjwu @yhuiYH @ROCm/rocm-documentation
|
||||
* @saadrahim @Rmalavally @amd-aakash @zhang2amd @jlgreathouse @samjwu @MathiasMagnus @LisaDelaney
|
||||
# Documentation files
|
||||
docs/ @amd-aakash @jlgreathouse @samjwu @yhuiYH @ROCm/rocm-documentation
|
||||
*.md @amd-aakash @jlgreathouse @samjwu @yhuiYH @ROCm/rocm-documentation
|
||||
*.rst @amd-aakash @jlgreathouse @samjwu @yhuiYH @ROCm/rocm-documentation
|
||||
# External CI
|
||||
/.azuredevops/ @ROCm/external-ci
|
||||
tools/rocm-build/ @ROCm/rocm-devops
|
||||
docs/* @ROCm/rocm-documentation
|
||||
*.md @ROCm/rocm-documentation
|
||||
*.rst @ROCm/rocm-documentation
|
||||
|
||||
4
.github/workflows/issue_retrieval.yml
vendored
@@ -2,7 +2,7 @@ name: Issue retrieval
|
||||
|
||||
on:
|
||||
issues:
|
||||
types: [opened, edited]
|
||||
types: [opened]
|
||||
|
||||
jobs:
|
||||
auto-retrieve:
|
||||
@@ -15,7 +15,7 @@ jobs:
|
||||
app_id: ${{ secrets.ACTION_APP_ID }}
|
||||
private_key: ${{ secrets.ACTION_PEM }}
|
||||
- name: 'Retrieve Issue'
|
||||
uses: harkgill-amd/rocm_issue_management@main
|
||||
uses: abhimeda/rocm_issue_management@main
|
||||
with:
|
||||
authentication-token: ${{ steps.generate_token.outputs.token }}
|
||||
github-organization: 'ROCm'
|
||||
|
||||
4
.github/workflows/linting.yml
vendored
@@ -2,13 +2,13 @@ name: Linting
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
branches:
|
||||
- develop
|
||||
- main
|
||||
- 'docs/*'
|
||||
- 'roc**'
|
||||
pull_request:
|
||||
branches:
|
||||
branches:
|
||||
- develop
|
||||
- main
|
||||
- 'docs/*'
|
||||
|
||||
6
.gitignore
vendored
@@ -1,21 +1,19 @@
|
||||
.venv
|
||||
.vscode
|
||||
build
|
||||
__pycache__
|
||||
|
||||
# documentation artifacts
|
||||
_build/
|
||||
_images/
|
||||
__pycache__/
|
||||
_static/
|
||||
_templates/
|
||||
_toc.yml
|
||||
docBin/
|
||||
_doxygen/
|
||||
_readthedocs/
|
||||
__pycache__/
|
||||
|
||||
# avoid duplicating contributing.md due to conf.py
|
||||
docs/CHANGELOG.md
|
||||
docs/contribute/index.md
|
||||
docs/about/release-notes.md
|
||||
docs/release/changelog.md
|
||||
docs/about/CHANGELOG.md
|
||||
@@ -12,5 +12,6 @@ config:
|
||||
MD041: false
|
||||
MD051: false
|
||||
ignores:
|
||||
- "{,docs/}{RELEASE,release,CHANGELOG,changelog}.md"
|
||||
- CHANGELOG.md
|
||||
- "{,docs/}{RELEASE,release}.md"
|
||||
- tools/autotag/templates/**/*.md
|
||||
|
||||
@@ -6,17 +6,13 @@ version: 2
|
||||
sphinx:
|
||||
configuration: docs/conf.py
|
||||
|
||||
formats: [htmlzip]
|
||||
formats: [htmlzip, pdf]
|
||||
|
||||
python:
|
||||
install:
|
||||
- requirements: docs/sphinx/requirements.txt
|
||||
|
||||
build:
|
||||
os: ubuntu-22.04
|
||||
os: ubuntu-20.04
|
||||
tools:
|
||||
python: "3.10"
|
||||
apt_packages:
|
||||
- "doxygen"
|
||||
- "gfortran" # For pre-processing fortran sources
|
||||
- "graphviz" # For dot graphs in doxygen
|
||||
python: "3.8"
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
matrix:
|
||||
- name: Markdown
|
||||
sources:
|
||||
- ['tools/autotag/templates/**/*.md', '!tools/autotag/templates/**/5*.md', '!tools/autotag/templates/**/6.0*.md', '!tools/autotag/templates/**/6.1*.md']
|
||||
- name: reST
|
||||
sources:
|
||||
- []
|
||||
- name: Cpp
|
||||
sources:
|
||||
- []
|
||||
432
.wordlist.txt
15001
CHANGELOG.md
@@ -66,10 +66,11 @@ project-specific steps. Refer to each repository's PR process for any additional
|
||||
during our release cycle, as coordinated by the maintainer
|
||||
* We'll inform you once your change is committed
|
||||
|
||||
> [!IMPORTANT]
|
||||
> By creating a PR, you agree to allow your contribution to be licensed under the
|
||||
> terms of the LICENSE.txt file in the corresponding repository. Different repositories may use different
|
||||
> licenses.
|
||||
:::{important}
|
||||
By creating a PR, you agree to allow your contribution to be licensed under the
|
||||
terms of the LICENSE.txt file in the corresponding repository. Different repositories may use different
|
||||
licenses.
|
||||
:::
|
||||
|
||||
You can look up each license on the [ROCm licensing](https://rocm.docs.amd.com/en/latest/about/license.html) page.
|
||||
|
||||
|
||||
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023 - 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
32
README.md
@@ -19,15 +19,6 @@ ROCm supports programming models, such as OpenMP and OpenCL, and includes all ne
|
||||
source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
|
||||
(ML) frameworks, such as PyTorch and TensorFlow.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> A new open source build platform for ROCm is under development at
|
||||
> https://github.com/ROCm/TheRock, featuring a unified CMake build with bundled
|
||||
> dependencies, Windows support, and more.
|
||||
|
||||
## Getting and Building ROCm from Source
|
||||
|
||||
Please use [TheRock](https://github.com/ROCm/TheRock) build system to build ROCm from source.
|
||||
|
||||
## ROCm documentation
|
||||
|
||||
This repository contains the [manifest file](https://gerrit.googlesource.com/git-repo/+/HEAD/docs/manifest-format.md)
|
||||
@@ -41,9 +32,28 @@ Source code for our documentation is located in the `/docs` folder of most ROCm
|
||||
|
||||
The ROCm documentation homepage is [rocm.docs.amd.com](https://rocm.docs.amd.com).
|
||||
|
||||
For information on how to contribute to the ROCm documentation, see [Contributing to the ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
|
||||
### Building our documentation
|
||||
|
||||
For a quick-start build, use the following code. For more options and detail, refer to
|
||||
[Building documentation](./docs/contribute/building.md).
|
||||
|
||||
```bash
|
||||
cd docs
|
||||
|
||||
pip3 install -r sphinx/requirements.txt
|
||||
|
||||
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
|
||||
```
|
||||
|
||||
Alternatively, CMake build is supported.
|
||||
|
||||
```bash
|
||||
cmake -B build
|
||||
|
||||
cmake --build build --target=doc
|
||||
```
|
||||
|
||||
## Older ROCm releases
|
||||
|
||||
For release information for older ROCm releases, refer to the
|
||||
[ROCm release history](https://rocm.docs.amd.com/en/latest/release/versions.html).
|
||||
[CHANGELOG](./CHANGELOG.md).
|
||||
|
||||
1535
RELEASE.md
72
default.xml
@@ -1,44 +1,72 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<manifest>
|
||||
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
|
||||
<default revision="refs/tags/rocm-7.2.0"
|
||||
<remote name="KhronosGroup" fetch="https://github.com/KhronosGroup/" />
|
||||
<default revision="refs/tags/rocm-6.0.2"
|
||||
remote="rocm-org"
|
||||
sync-c="true"
|
||||
sync-j="4" />
|
||||
<!--list of projects for ROCm-->
|
||||
<project name="ROCK-Kernel-Driver" />
|
||||
<project name="ROCT-Thunk-Interface" />
|
||||
<project name="ROCR-Runtime" />
|
||||
<project name="amdsmi" />
|
||||
<project name="rocm_smi_lib" />
|
||||
<project name="rocm-core" />
|
||||
<project name="rocm-cmake" />
|
||||
<project name="rocminfo" />
|
||||
<project name="rocm_bandwidth_test" />
|
||||
<project name="rocm-examples" />
|
||||
<project name="rocprofiler" />
|
||||
<project name="roctracer" />
|
||||
<project path="ROCm-OpenCL-Runtime/api/opencl/khronos/icd" name="OpenCL-ICD-Loader" remote="KhronosGroup" revision="6c03f8b58fafd9dd693eaac826749a5cfad515f8" />
|
||||
<project name="clang-ocl" />
|
||||
<project name="rdc" />
|
||||
<!--HIP Projects-->
|
||||
<project name="HIP" />
|
||||
<project name="HIP-Examples" />
|
||||
<project name="clr" />
|
||||
<project name="hipother" />
|
||||
<project name="HIPIFY" />
|
||||
<project name="HIPCC" />
|
||||
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
|
||||
<project name="half" />
|
||||
<project name="llvm-project" />
|
||||
<project name="spirv-llvm-translator" />
|
||||
<project name="ROCm-Device-Libs" />
|
||||
<project name="ROCm-CompilerSupport" />
|
||||
<project name="half" revision="37742ce15b76b44e4b271c1e66d13d2fa7bd003e" />
|
||||
<!-- gdb projects -->
|
||||
<project name="ROCdbgapi" />
|
||||
<project name="ROCgdb" />
|
||||
<project name="ROCdbgapi" />
|
||||
<project name="rocr_debug_agent" />
|
||||
<!-- ROCm Libraries -->
|
||||
<project groups="mathlibs" name="AMDMIGraphX" />
|
||||
<project groups="mathlibs" name="MIVisionX" />
|
||||
<project groups="mathlibs" name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="composable_kernel" />
|
||||
<project groups="mathlibs" name="hipfort" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
<project groups="mathlibs" name="rocAL" />
|
||||
<project groups="mathlibs" name="rocBLAS" />
|
||||
<project groups="mathlibs" name="Tensile" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipBLAS" />
|
||||
<project groups="mathlibs" name="hipBLASLt" />
|
||||
<project groups="mathlibs" name="rocFFT" />
|
||||
<project groups="mathlibs" name="hipFFT" />
|
||||
<project groups="mathlibs" name="rocRAND" />
|
||||
<project groups="mathlibs" name="hipRAND" />
|
||||
<project groups="mathlibs" name="rocSPARSE" />
|
||||
<project groups="mathlibs" name="hipSPARSELt" />
|
||||
<project groups="mathlibs" name="rocSOLVER" />
|
||||
<project groups="mathlibs" name="hipSOLVER" />
|
||||
<project groups="mathlibs" name="hipSPARSE" />
|
||||
<project groups="mathlibs" name="rocALUTION" />
|
||||
<project groups="mathlibs" name="rocDecode" />
|
||||
<project groups="mathlibs" name="rocJPEG" />
|
||||
<project groups="mathlibs" name="rocm-libraries" />
|
||||
<project groups="mathlibs" name="rocm-systems" />
|
||||
<project groups="mathlibs" name="rocPyDecode" />
|
||||
<project groups="mathlibs" name="rocSHMEM" />
|
||||
<project groups="mathlibs" name="rocm-cmake" />
|
||||
<project groups="mathlibs" name="rpp" />
|
||||
<project groups="mathlibs" name="TransferBench" />
|
||||
<project groups="mathlibs" name="rocThrust" />
|
||||
<project groups="mathlibs" name="hipCUB" />
|
||||
<project groups="mathlibs" name="rocPRIM" />
|
||||
<project groups="mathlibs" name="rocWMMA" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
<project name="MIOpen" />
|
||||
<project name="composable_kernel" />
|
||||
<project name="MIVisionX" />
|
||||
<project name="rpp" />
|
||||
<project name="hipfort" />
|
||||
<project name="AMDMIGraphX" />
|
||||
<project name="ROCmValidationSuite" />
|
||||
<!-- Projects for OpenMP-Extras -->
|
||||
<project name="aomp" path="openmp-extras/aomp" />
|
||||
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
|
||||
<project name="flang" path="openmp-extras/flang" />
|
||||
</manifest>
|
||||
</manifest>
|
||||
|
||||
564
docs/about/compatibility/data-type-support.rst
Normal file
@@ -0,0 +1,564 @@
|
||||
.. meta::
|
||||
:description: Supported data types in ROCm
|
||||
:keywords: int8, float8, float8 (E4M3), float8 (E5M2), bfloat8, float16, half, bfloat16, tensorfloat32, float, float32, float64, double, AMD, ROCm, AMDGPU
|
||||
|
||||
.. _rocm-supported-data-types:
|
||||
|
||||
*************************************************************
|
||||
ROCm data type specifications
|
||||
*************************************************************
|
||||
|
||||
Integral types
|
||||
==========================================
|
||||
|
||||
The signed and unsigned integral types that are supported by ROCm™ are listed in the following table,
|
||||
together with their corresponding HIP type and a short description.
|
||||
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 15,35,50
|
||||
|
||||
*
|
||||
- Type name
|
||||
- HIP type
|
||||
- Description
|
||||
*
|
||||
- int8
|
||||
- ``int8_t``, ``uint8_t``
|
||||
- A signed or unsigned 8-bit integer
|
||||
*
|
||||
- int16
|
||||
- ``int16_t``, ``uint16_t``
|
||||
- A signed or unsigned 16-bit integer
|
||||
*
|
||||
- int32
|
||||
- ``int32_t``, ``uint32_t``
|
||||
- A signed or unsigned 32-bit integer
|
||||
*
|
||||
- int64
|
||||
- ``int64_t``, ``uint64_t``
|
||||
- A signed or unsigned 64-bit integer
|
||||
|
||||
Floating-point types
|
||||
==========================================
|
||||
|
||||
The floating-point types that are supported by ROCm are listed in the following table, together with
|
||||
their corresponding HIP type and a short description.
|
||||
|
||||
.. image:: ../../data/about/compatibility/floating-point-data-types.png
|
||||
:alt: Supported floating-point types
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 15,15,70
|
||||
|
||||
*
|
||||
- Type name
|
||||
- HIP type
|
||||
- Description
|
||||
*
|
||||
- float8 (E4M3)
|
||||
- ``-``
|
||||
- An 8-bit floating-point number that mostly follows IEEE-754 conventions and **S1E4M3** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_ , with expanded range and with no infinity or signed zero. NaN is represented as negative zero.
|
||||
*
|
||||
- float8 (E5M2)
|
||||
- ``-``
|
||||
- An 8-bit floating-point number mostly following IEEE-754 conventions and **S1E5M2** bit layout, as described in `8-bit Numerical Formats for Deep Neural Networks <https://arxiv.org/abs/2206.02915>`_ , with expanded range and with no infinity or signed zero. NaN is represented as negative zero.
|
||||
*
|
||||
- float16
|
||||
- ``half``
|
||||
- A 16-bit floating-point number that conforms to the IEEE 754-2008 half-precision storage format.
|
||||
*
|
||||
- bfloat16
|
||||
- ``bfloat16``
|
||||
- A shortened 16-bit version of the IEEE 754 single-precision storage format.
|
||||
*
|
||||
- tensorfloat32
|
||||
- ``-``
|
||||
- A floating-point number that occupies 32 bits or less of storage, providing improved range compared to half (16-bit) format, at (potentially) greater throughput than single-precision (32-bit) formats.
|
||||
*
|
||||
- float32
|
||||
- ``float``
|
||||
- A 32-bit floating-point number that conforms to the IEEE 754 single-precision storage format.
|
||||
*
|
||||
- float64
|
||||
- ``double``
|
||||
- A 64-bit floating-point number that conforms to the IEEE 754 double-precision storage format.
|
||||
|
||||
.. note::
|
||||
|
||||
* The float8 and tensorfloat32 types are internal types used in calculations in Matrix Cores and can be stored in any type of the same size.
|
||||
* The encodings for FP8 (E5M2) and FP8 (E4M3) that are natively supported by MI300 differ from the FP8 (E5M2) and FP8 (E4M3) encodings used in H100 (`FP8 Formats for Deep Learning <https://arxiv.org/abs/2209.05433>`_).
|
||||
* In some AMD documents and articles, float8 (E5M2) is referred to as bfloat8.
|
||||
|
||||
ROCm support icons
|
||||
==========================================
|
||||
|
||||
In the following sections, we use icons to represent the level of support. These icons, described in the
|
||||
following table, are also used on the library data type support pages.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Icon
|
||||
- Definition
|
||||
*
|
||||
- ❌
|
||||
- Not supported
|
||||
|
||||
*
|
||||
- ⚠️
|
||||
- Partial support
|
||||
|
||||
*
|
||||
- ✅
|
||||
- Full support
|
||||
|
||||
.. note::
|
||||
|
||||
* Full support means that the type is supported natively or with hardware emulation.
|
||||
* Native support means that the operations for that type are implemented in hardware. Types that are not natively supported are emulated with the available hardware. The performance of non-natively supported types can differ from the full instruction throughput rate. For example, 16-bit integer operations can be performed on the 32-bit integer ALUs at full rate; however, 64-bit integer operations might need several instructions on the 32-bit integer ALUs.
|
||||
* Any type can be emulated by software, but this page does not cover such cases.
|
||||
|
||||
Hardware type support
|
||||
==========================================
|
||||
|
||||
AMD GPU hardware support for data types is listed in the following tables.
|
||||
|
||||
Compute units support
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
The following table lists data type support for compute units.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Integral types
|
||||
:sync: integral-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Type name
|
||||
- int8
|
||||
- int16
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- MI100
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
*
|
||||
- MI200 series
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
*
|
||||
- MI300 series
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
|
||||
.. tab-item:: Floating-point types
|
||||
:sync: floating-point-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Type name
|
||||
- float8 (E4M3)
|
||||
- float8 (E5M2)
|
||||
- float16
|
||||
- bfloat16
|
||||
- tensorfloat32
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- MI100
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
*
|
||||
- MI200 series
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
*
|
||||
- MI300 series
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
|
||||
Matrix core support
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
The following table lists data type support for AMD GPU matrix cores.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Integral types
|
||||
:sync: integral-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Type name
|
||||
- int8
|
||||
- int16
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- MI100
|
||||
- ✅
|
||||
- ❌
|
||||
- ❌
|
||||
- ❌
|
||||
*
|
||||
- MI200 series
|
||||
- ✅
|
||||
- ❌
|
||||
- ❌
|
||||
- ❌
|
||||
*
|
||||
- MI300 series
|
||||
- ✅
|
||||
- ❌
|
||||
- ❌
|
||||
- ❌
|
||||
|
||||
.. tab-item:: Floating-point types
|
||||
:sync: floating-point-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Type name
|
||||
- float8 (E4M3)
|
||||
- float8 (E5M2)
|
||||
- float16
|
||||
- bfloat16
|
||||
- tensorfloat32
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- MI100
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
- ❌
|
||||
- ✅
|
||||
- ❌
|
||||
*
|
||||
- MI200 series
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
*
|
||||
- MI300 series
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
|
||||
Atomic operations support
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
The following table lists data type support for atomic operations.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Integral types
|
||||
:sync: integral-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Type name
|
||||
- int8
|
||||
- int16
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- MI100
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ❌
|
||||
*
|
||||
- MI200 series
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
*
|
||||
- MI300 series
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
|
||||
.. tab-item:: Floating-point types
|
||||
:sync: floating-point-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Type name
|
||||
- float8 (E4M3)
|
||||
- float8 (E5M2)
|
||||
- float16
|
||||
- bfloat16
|
||||
- tensorfloat32
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- MI100
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ❌
|
||||
*
|
||||
- MI200 series
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
*
|
||||
- MI300 series
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ✅
|
||||
|
||||
.. note::
|
||||
|
||||
For cases that are not natively supported, you can emulate atomic operations using software.
|
||||
Software-emulated atomic operations have high negative performance impact when they frequently
|
||||
access the same memory address.
|
||||
|
||||
Data Type support in ROCm Libraries
|
||||
==========================================
|
||||
|
||||
ROCm library support for int8, float8 (E4M3), float8 (E5M2), int16, float16, bfloat16, int32,
|
||||
tensorfloat32, float32, int64, and float64 is listed in the following tables.
|
||||
|
||||
Libraries input/output type support
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
The following tables list ROCm library support for specific input and output data types. For a detailed
|
||||
description, refer to the corresponding library data type support page.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Integral types
|
||||
:sync: integral-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Library input/output data type name
|
||||
- int8
|
||||
- int16
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
|
||||
- ✅/✅
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
*
|
||||
- rocRAND (:doc:`details<rocrand:data-type-support>`)
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- hipRAND (:doc:`details<hiprand:data-type-support>`)
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- rocPRIM (:doc:`details<rocprim:data-type-support>`)
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- hipCUB (:doc:`details<hipcub:data-type-support>`)
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- rocThrust (:doc:`details<rocthrust:data-type-support>`)
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
|
||||
.. tab-item:: Floating-point types
|
||||
:sync: floating-point-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Library input/output data type name
|
||||
- float8 (E4M3)
|
||||
- float8 (E5M2)
|
||||
- float16
|
||||
- bfloat16
|
||||
- tensorfloat32
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
*
|
||||
- rocRAND (:doc:`details<rocrand:data-type-support>`)
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- hipRAND (:doc:`details<hiprand:data-type-support>`)
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
- -/❌
|
||||
- -/❌
|
||||
- -/✅
|
||||
- -/✅
|
||||
*
|
||||
- rocPRIM (:doc:`details<rocprim:data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- hipCUB (:doc:`details<hipcub:data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
*
|
||||
- rocThrust (:doc:`details<rocthrust:data-type-support>`)
|
||||
- ❌/❌
|
||||
- ❌/❌
|
||||
- ⚠️/⚠️
|
||||
- ⚠️/⚠️
|
||||
- ❌/❌
|
||||
- ✅/✅
|
||||
- ✅/✅
|
||||
|
||||
|
||||
Libraries internal calculations type support
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
The following tables list ROCm library support for specific internal data types. For a detailed
|
||||
description, refer to the corresponding library data type support page.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Integral types
|
||||
:sync: integral-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Library internal data type name
|
||||
- int8
|
||||
- int16
|
||||
- int32
|
||||
- int64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ❌
|
||||
|
||||
|
||||
.. tab-item:: Floating-point types
|
||||
:sync: floating-point-type
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
*
|
||||
- Library internal data type name
|
||||
- float8 (E4M3)
|
||||
- float8 (E5M2)
|
||||
- float16
|
||||
- bfloat16
|
||||
- tensorfloat32
|
||||
- float32
|
||||
- float64
|
||||
*
|
||||
- hipSPARSELt (:doc:`details<hipsparselt:reference/data-type-support>`)
|
||||
- ❌
|
||||
- ❌
|
||||
- ❌
|
||||
- ❌
|
||||
- ❌
|
||||
- ✅
|
||||
- ❌
|
||||
483
docs/about/compatibility/openmp.md
Normal file
@@ -0,0 +1,483 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="OpenMP support in ROCm">
|
||||
<meta name="keywords" content="OpenMP, LLVM, OpenMP toolchain">
|
||||
</head>
|
||||
|
||||
# OpenMP support in ROCm
|
||||
|
||||
## Introduction
|
||||
|
||||
The ROCm™ installation includes an LLVM-based implementation that fully supports
|
||||
the OpenMP 4.5 standard and a subset of OpenMP 5.0, 5.1, and 5.2 standards.
|
||||
Fortran, C/C++ compilers, and corresponding runtime libraries are included.
|
||||
Along with host APIs, the OpenMP compilers support offloading code and data onto
|
||||
GPU devices. This document briefly describes the installation location of the
|
||||
OpenMP toolchain, example usage of device offloading, and usage of `rocprof`
|
||||
with OpenMP applications. The GPUs supported are the same as those supported by
|
||||
this ROCm release. See the list of supported GPUs for {doc}`Linux<rocm-install-on-linux:reference/system-requirements>` and
|
||||
{doc}`Windows<rocm-install-on-windows:reference/system-requirements>`.
|
||||
|
||||
The ROCm OpenMP compiler is implemented using LLVM compiler technology.
|
||||
The following image illustrates the internal steps taken to translate a user’s application into an executable that can offload computation to the AMDGPU. The compilation is a two-pass process. Pass 1 compiles the application to generate the CPU code and Pass 2 links the CPU code to the AMDGPU device code.
|
||||
|
||||

|
||||
|
||||
### Installation
|
||||
|
||||
The OpenMP toolchain is automatically installed as part of the standard ROCm
|
||||
installation and is available under `/opt/rocm-{version}/llvm`. The
|
||||
sub-directories are:
|
||||
|
||||
* bin: Compilers (`flang` and `clang`) and other binaries.
|
||||
* examples: The usage section below shows how to compile and run these programs.
|
||||
* include: Header files.
|
||||
* lib: Libraries including those required for target offload.
|
||||
* lib-debug: Debug versions of the above libraries.
|
||||
|
||||
## OpenMP: usage
|
||||
|
||||
The example programs can be compiled and run by pointing the environment
|
||||
variable `ROCM_PATH` to the ROCm install directory.
|
||||
|
||||
**Example:**
|
||||
|
||||
```bash
|
||||
export ROCM_PATH=/opt/rocm-{version}
|
||||
cd $ROCM_PATH/share/openmp-extras/examples/openmp/veccopy
|
||||
sudo make run
|
||||
```
|
||||
|
||||
:::{note}
|
||||
`sudo` is required since we are building inside the `/opt` directory.
|
||||
Alternatively, copy the files to your home directory first.
|
||||
:::
|
||||
|
||||
The above invocation of Make compiles and runs the program. Note the options
|
||||
that are required for target offload from an OpenMP program:
|
||||
|
||||
```bash
|
||||
-fopenmp --offload-arch=<gpu-arch>
|
||||
```
|
||||
|
||||
:::{note}
|
||||
The compiler also accepts the alternative offloading notation:
|
||||
|
||||
```bash
|
||||
-fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=<gpu-arch>
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
Obtain the value of `gpu-arch` by running the following command:
|
||||
|
||||
```bash
|
||||
% /opt/rocm-{version}/bin/rocminfo | grep gfx
|
||||
```
|
||||
|
||||
[//]: # (dated link below, needs updating)
|
||||
|
||||
See the complete list of compiler command-line references
|
||||
[here](https://github.com/ROCm/llvm-project/blob/amd-stg-open/clang/docs/CommandGuide/clang.rst).
|
||||
|
||||
### Using `rocprof` with OpenMP
|
||||
|
||||
The following steps describe a typical workflow for using `rocprof` with OpenMP
|
||||
code compiled with AOMP:
|
||||
|
||||
1. Run `rocprof` with the program command line:
|
||||
|
||||
```bash
|
||||
% rocprof <application> <args>
|
||||
```
|
||||
|
||||
This produces a `results.csv` file in the user’s current directory that
|
||||
shows basic stats such as kernel names, grid size, number of registers used,
|
||||
etc. The user can choose to specify the preferred output file name using the
|
||||
o option.
|
||||
|
||||
2. Add options for a detailed result:
|
||||
|
||||
```bash
|
||||
--stats: % rocprof --stats <application> <args>
|
||||
```
|
||||
|
||||
The stats option produces timestamps for the kernels. Look into the output
|
||||
CSV file for the field, `DurationNs`, which is useful in getting an
|
||||
understanding of the critical kernels in the code.
|
||||
|
||||
Apart from `--stats`, the option `--timestamp` on produces a timestamp for
|
||||
the kernels.
|
||||
|
||||
3. After learning about the required kernels, the user can take a detailed look
|
||||
at each one of them. `rocprof` has support for hardware counters: a set of
|
||||
basic and a set of derived ones. See the complete list of counters using
|
||||
options --list-basic and --list-derived. `rocprof` accepts either a text or
|
||||
an XML file as an input.
|
||||
|
||||
For more details on `rocprof`, refer to the {doc}`ROCProfilerV1 User Manual <rocprofiler:rocprofv1>`.
|
||||
|
||||
### Using tracing options
|
||||
|
||||
**Prerequisite:** When using the `--sys-trace` option, compile the OpenMP
|
||||
program with:
|
||||
|
||||
```bash
|
||||
-Wl,-rpath,/opt/rocm-{version}/lib -lamdhip64
|
||||
```
|
||||
|
||||
The following tracing options are widely used to generate useful information:
|
||||
|
||||
* **`--hsa-trace`**: This option is used to get a JSON output file with the HSA
|
||||
API execution traces and a flat profile in a CSV file.
|
||||
|
||||
* **`--sys-trace`**: This allows programmers to trace both HIP and HSA calls.
|
||||
Since this option results in loading ``libamdhip64.so``, follow the
|
||||
prerequisite as mentioned above.
|
||||
|
||||
A CSV and a JSON file are produced by the above trace options. The CSV file
|
||||
presents the data in a tabular format, and the JSON file can be visualized using
|
||||
Google Chrome at chrome://tracing/ or [Perfetto](https://perfetto.dev/).
|
||||
Navigate to Chrome or Perfetto and load the JSON file to see the timeline of the
|
||||
HSA calls.
|
||||
|
||||
For more details on tracing, refer to the {doc}`ROCProfilerV1 User Manual <rocprofiler:rocprofv1>`.
|
||||
|
||||
### Environment variables
|
||||
|
||||
:::{table}
|
||||
:widths: auto
|
||||
| Environment Variable | Purpose |
|
||||
| --------------------------- | ---------------------------- |
|
||||
| `OMP_NUM_TEAMS` | To set the number of teams for kernel launch, which is otherwise chosen by the implementation by default. You can set this number (subject to implementation limits) for performance tuning. |
|
||||
| `LIBOMPTARGET_KERNEL_TRACE` | To print useful statistics for device operations. Setting it to 1 and running the program emits the name of every kernel launched, the number of teams and threads used, and the corresponding register usage. Setting it to 2 additionally emits timing information for kernel launches and data transfer operations between the host and the device. |
|
||||
| `LIBOMPTARGET_INFO` | To print informational messages from the device runtime as the program executes. Setting it to a value of 1 or higher, prints fine-grain information and setting it to -1 prints complete information. |
|
||||
| `LIBOMPTARGET_DEBUG` | To get detailed debugging information about data transfer operations and kernel launch when using a debug version of the device library. Set this environment variable to 1 to get the detailed information from the library. |
|
||||
| `GPU_MAX_HW_QUEUES` | To set the number of HSA queues in the OpenMP runtime. The HSA queues are created on demand up to the maximum value as supplied here. The queue creation starts with a single initialized queue to avoid unnecessary allocation of resources. The provided value is capped if it exceeds the recommended, device-specific value. |
|
||||
| `LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES` | To set the threshold size up to which data transfers are initiated asynchronously. The default threshold size is 1*1024*1024 bytes (1MB). |
|
||||
| `OMPX_FORCE_SYNC_REGIONS` | To force the runtime to execute all operations synchronously, i.e., wait for an operation to complete immediately. This affects data transfers and kernel execution. While it is mainly designed for debugging, it may have a minor positive effect on performance in certain situations. |
|
||||
:::
|
||||
|
||||
## OpenMP: features
|
||||
|
||||
The OpenMP programming model is greatly enhanced with the following new features
|
||||
implemented in the past releases.
|
||||
|
||||
(openmp_usm)=
|
||||
|
||||
### Asynchronous behavior in OpenMP target regions
|
||||
|
||||
* Controlling Asynchronous Behavior
|
||||
|
||||
The OpenMP offloading runtime executes in an asynchronous fashion by default, allowing multiple data transfers to start concurrently. However, if the data to be transferred becomes larger than the default threshold of 1MB, the runtime falls back to a synchronous data transfer. The buffers that have been locked already are always executed asynchronously.
|
||||
You can overrule this default behavior by setting `LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES` and `OMPX_FORCE_SYNC_REGIONS`. See the [Environment Variables](#environment-variables) table for details.
|
||||
|
||||
* Multithreaded Offloading on the Same Device
|
||||
|
||||
The `libomptarget` plugin for GPU offloading allows creation of separate configurable HSA queues per chiplet, which enables two or more threads to concurrently offload to the same device.
|
||||
|
||||
* Parallel Memory Copy Invocations
|
||||
|
||||
Implicit asynchronous execution of single target region enables parallel memory copy invocations.
|
||||
|
||||
### Unified shared memory
|
||||
|
||||
Unified Shared Memory (USM) provides a pointer-based approach to memory
|
||||
management. To implement USM, fulfill the following system requirements along
|
||||
with Xnack capability.
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
* Linux Kernel versions above 5.14
|
||||
* Latest KFD driver packaged in ROCm stack
|
||||
* Xnack, as USM support can only be tested with applications compiled with Xnack
|
||||
capability
|
||||
|
||||
#### Xnack capability
|
||||
|
||||
When enabled, Xnack capability allows GPU threads to access CPU (system) memory,
|
||||
allocated with OS-allocators, such as `malloc`, `new`, and `mmap`. Xnack must be
|
||||
enabled both at compile- and run-time. To enable Xnack support at compile-time,
|
||||
use:
|
||||
|
||||
```bash
|
||||
--offload-arch=gfx908:xnack+
|
||||
```
|
||||
|
||||
Or use another functionally equivalent option Xnack-any:
|
||||
|
||||
```bash
|
||||
--offload-arch=gfx908
|
||||
```
|
||||
|
||||
To enable Xnack functionality at runtime on a per-application basis,
|
||||
use environment variable:
|
||||
|
||||
```bash
|
||||
HSA_XNACK=1
|
||||
```
|
||||
|
||||
When Xnack support is not needed:
|
||||
|
||||
* Build the applications to maximize resource utilization using:
|
||||
|
||||
```bash
|
||||
--offload-arch=gfx908:xnack-
|
||||
```
|
||||
|
||||
* At runtime, set the `HSA_XNACK` environment variable to 0.
|
||||
|
||||
#### Unified shared memory pragma
|
||||
|
||||
This OpenMP pragma is available on MI200 through `xnack+` support.
|
||||
|
||||
```bash
|
||||
omp requires unified_shared_memory
|
||||
```
|
||||
|
||||
As stated in the OpenMP specifications, this pragma makes the map clause on
|
||||
target constructs optional. By default, on MI200, all memory allocated on the
|
||||
host is fine grain. Using the map clause on a target clause is allowed, which
|
||||
transforms the access semantics of the associated memory to coarse grain.
|
||||
|
||||
```bash
|
||||
A simple program demonstrating the use of this feature is:
|
||||
$ cat parallel_for.cpp
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define N 64
|
||||
#pragma omp requires unified_shared_memory
|
||||
int main() {
|
||||
int n = N;
|
||||
int *a = new int[n];
|
||||
int *b = new int[n];
|
||||
|
||||
for(int i = 0; i < n; i++)
|
||||
b[i] = i;
|
||||
|
||||
#pragma omp target parallel for map(to:b[:n])
|
||||
for(int i = 0; i < n; i++)
|
||||
a[i] = b[i];
|
||||
|
||||
for(int i = 0; i < n; i++)
|
||||
if(a[i] != i)
|
||||
printf("error at %d: expected %d, got %d\n", i, i+1, a[i]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
$ clang++ -O2 -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:xnack+ parallel_for.cpp
|
||||
$ HSA_XNACK=1 ./a.out
|
||||
```
|
||||
|
||||
In the above code example, pointer “a” is not mapped in the target region, while
|
||||
pointer “b” is. Both are valid pointers on the GPU device and passed by-value to
|
||||
the kernel implementing the target region. This means the pointer values on the
|
||||
host and the device are the same.
|
||||
|
||||
The difference between the memory pages pointed to by these two variables is
|
||||
that the pages pointed by “a” are in fine-grain memory, while the pages pointed
|
||||
to by “b” are in coarse-grain memory during and after the execution of the
|
||||
target region. This is accomplished in the OpenMP runtime library with calls to
|
||||
the ROCr runtime to set the pages pointed by “b” as coarse grain.
|
||||
|
||||
### OMPT target support
|
||||
|
||||
The OpenMP runtime in ROCm implements a subset of the OMPT device APIs, as
|
||||
described in the OpenMP specification document. These APIs allow first-party
|
||||
tools to examine the profile and kernel traces that execute on a device. A tool
|
||||
can register callbacks for data transfer and kernel dispatch entry points or use
|
||||
APIs to start and stop tracing for device-related activities such as data
|
||||
transfer and kernel dispatch timings and associated metadata. If device tracing
|
||||
is enabled, trace records for device activities are collected during program
|
||||
execution and returned to the tool using the APIs described in the
|
||||
specification.
|
||||
|
||||
The following example demonstrates how a tool uses the supported OMPT target
|
||||
APIs. The `README` in `/opt/rocm/llvm/examples/tools/ompt` outlines the steps to
|
||||
be followed, and the provided example can be run as shown below:
|
||||
|
||||
```bash
|
||||
cd $ROCM_PATH/share/openmp-extras/examples/tools/ompt/veccopy-ompt-target-tracing
|
||||
sudo make run
|
||||
```
|
||||
|
||||
The file `veccopy-ompt-target-tracing.c` simulates how a tool initiates device
|
||||
activity tracing. The file `callbacks.h` shows the callbacks registered and
|
||||
implemented by the tool.
|
||||
|
||||
### Floating point atomic operations
|
||||
|
||||
The MI200-series GPUs support the generation of hardware floating-point atomics
|
||||
using the OpenMP atomic pragma. The support includes single- and
|
||||
double-precision floating-point atomic operations. The programmer must ensure
|
||||
that the memory subjected to the atomic operation is in coarse-grain memory by
|
||||
mapping it explicitly with the help of map clauses when not implicitly mapped by
|
||||
the compiler as per the [OpenMP
|
||||
specifications](https://www.openmp.org/specifications/). This makes these
|
||||
hardware floating-point atomic instructions “fast,” as they are faster than
|
||||
using a default compare-and-swap loop scheme, but at the same time “unsafe,” as
|
||||
they are not supported on fine-grain memory. The operation in
|
||||
`unified_shared_memory` mode also requires programmers to map the memory
|
||||
explicitly when not implicitly mapped by the compiler.
|
||||
|
||||
To request fast floating-point atomic instructions at the file level, use
|
||||
compiler flag `-munsafe-fp-atomics` or a hint clause on a specific pragma:
|
||||
|
||||
```bash
|
||||
double a = 0.0;
|
||||
#pragma omp atomic hint(AMD_fast_fp_atomics)
|
||||
a = a + 1.0;
|
||||
```
|
||||
|
||||
:::{note}
|
||||
`AMD_unsafe_fp_atomics` is an alias for `AMD_fast_fp_atomics`, and
|
||||
`AMD_safe_fp_atomics` is implemented with a compare-and-swap loop.
|
||||
:::
|
||||
|
||||
To disable the generation of fast floating-point atomic instructions at the file
|
||||
level, build using the option `-msafe-fp-atomics` or use a hint clause on a
|
||||
specific pragma:
|
||||
|
||||
```bash
|
||||
double a = 0.0;
|
||||
#pragma omp atomic hint(AMD_safe_fp_atomics)
|
||||
a = a + 1.0;
|
||||
```
|
||||
|
||||
The hint clause value always has a precedence over the compiler flag, which
|
||||
allows programmers to create atomic constructs with a different behavior than
|
||||
the rest of the file.
|
||||
|
||||
See the example below, where the user builds the program using
|
||||
`-msafe-fp-atomics` to select a file-wide “safe atomic” compilation. However,
|
||||
the fast atomics hint clause over variable “a” takes precedence and operates on
|
||||
“a” using a fast/unsafe floating-point atomic, while the variable “b” in the
|
||||
absence of a hint clause is operated upon using safe floating-point atomics as
|
||||
per the compiler flag.
|
||||
|
||||
```bash
|
||||
double a = 0.0;.
|
||||
#pragma omp atomic hint(AMD_fast_fp_atomics)
|
||||
a = a + 1.0;
|
||||
|
||||
double b = 0.0;
|
||||
#pragma omp atomic
|
||||
b = b + 1.0;
|
||||
```
|
||||
|
||||
### AddressSanitizer tool
|
||||
|
||||
AddressSanitizer (ASan) is a memory error detector tool utilized by applications to
|
||||
detect various errors ranging from spatial issues such as out-of-bound access to
|
||||
temporal issues such as use-after-free. The AOMP compiler supports ASan for AMD
|
||||
GPUs with applications written in both HIP and OpenMP.
|
||||
|
||||
**Features supported on host platform (Target x86_64):**
|
||||
|
||||
* Use-after-free
|
||||
* Buffer overflows
|
||||
* Heap buffer overflow
|
||||
* Stack buffer overflow
|
||||
* Global buffer overflow
|
||||
* Use-after-return
|
||||
* Use-after-scope
|
||||
* Initialization order bugs
|
||||
|
||||
**Features supported on AMDGPU platform (`amdgcn-amd-amdhsa`):**
|
||||
|
||||
* Heap buffer overflow
|
||||
* Global buffer overflow
|
||||
|
||||
**Software (kernel/OS) requirements:** Unified Shared Memory support with Xnack
|
||||
capability. See the section on [Unified Shared Memory](#unified-shared-memory)
|
||||
for prerequisites and details on Xnack.
|
||||
|
||||
**Example:**
|
||||
|
||||
* Heap buffer overflow
|
||||
|
||||
```bash
|
||||
void main() {
|
||||
....... // Some program statements
|
||||
....... // Some program statements
|
||||
#pragma omp target map(to : A[0:N], B[0:N]) map(from: C[0:N])
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for(int i =0 ; i < N; i++){
|
||||
C[i+10] = A[i] + B[i];
|
||||
} // end of for loop
|
||||
}
|
||||
....... // Some program statements
|
||||
}// end of main
|
||||
```
|
||||
|
||||
See the complete sample code for heap buffer overflow
|
||||
[here](https://github.com/ROCm/aomp/blob/aomp-dev/examples/tools/asan/heap_buffer_overflow/openmp/vecadd-HBO.cpp).
|
||||
|
||||
* Global buffer overflow
|
||||
|
||||
```bash
|
||||
#pragma omp declare target
|
||||
int A[N],B[N],C[N];
|
||||
#pragma omp end declare target
|
||||
void main(){
|
||||
...... // some program statements
|
||||
...... // some program statements
|
||||
#pragma omp target data map(to:A[0:N],B[0:N]) map(from: C[0:N])
|
||||
{
|
||||
#pragma omp target update to(A,B)
|
||||
#pragma omp target parallel for
|
||||
for(int i=0; i<N; i++){
|
||||
C[i]=A[i*100]+B[i+22];
|
||||
} // end of for loop
|
||||
#pragma omp target update from(C)
|
||||
}
|
||||
........ // some program statements
|
||||
} // end of main
|
||||
```
|
||||
|
||||
See the complete sample code for global buffer overflow
|
||||
[here](https://github.com/ROCm/aomp/blob/aomp-dev/examples/tools/asan/global_buffer_overflow/openmp/vecadd-GBO.cpp).
|
||||
|
||||
### Clang compiler option for kernel optimization
|
||||
|
||||
You can use the clang compiler option `-fopenmp-target-fast` for kernel optimization if certain constraints implied by its component options are satisfied. `-fopenmp-target-fast` enables the following options:
|
||||
|
||||
* `-fopenmp-target-ignore-env-vars`: It enables code generation of specialized kernels including no-loop and Cross-team reductions.
|
||||
|
||||
* `-fopenmp-assume-no-thread-state`: It enables the compiler to assume that no thread in a parallel region modifies an Internal Control Variable (`ICV`), thus potentially reducing the device runtime code execution.
|
||||
|
||||
* `-fopenmp-assume-no-nested-parallelism`: It enables the compiler to assume that no thread in a parallel region encounters a parallel region, thus potentially reducing the device runtime code execution.
|
||||
|
||||
* `-O3` if no `-O*` is specified by the user.
|
||||
|
||||
### Specialized kernels
|
||||
|
||||
Clang will attempt to generate specialized kernels based on compiler options and OpenMP constructs. The following specialized kernels are supported:
|
||||
|
||||
* No-loop
|
||||
* Big-jump-loop
|
||||
* Cross-team reductions
|
||||
|
||||
To enable the generation of specialized kernels, follow these guidelines:
|
||||
|
||||
* Do not specify teams, threads, and schedule-related environment variables. The `num_teams` clause in an OpenMP target construct acts as an override and prevents the generation of the no-loop kernel. If the specification of `num_teams` clause is a user requirement then clang tries to generate the big-jump-loop kernel instead of the no-loop kernel.
|
||||
|
||||
* Assert the absence of the teams, threads, and schedule-related environment variables by adding the command-line option `-fopenmp-target-ignore-env-vars`.
|
||||
|
||||
* To automatically enable the specialized kernel generation, use `-Ofast` or `-fopenmp-target-fast` for compilation.
|
||||
|
||||
* To disable specialized kernel generation, use `-fno-openmp-target-ignore-env-vars`.
|
||||
|
||||
#### No-loop kernel generation
|
||||
|
||||
The no-loop kernel generation feature optimizes the compiler performance by generating a specialized kernel for certain OpenMP target constructs such as target teams distribute parallel for. The specialized kernel generation feature assumes every thread executes a single iteration of the user loop, which leads the runtime to launch a total number of GPU threads equal to or greater than the iteration space size of the target region loop. This allows the compiler to generate code for the loop body without an enclosing loop, resulting in reduced control-flow complexity and potentially better performance.
|
||||
|
||||
#### Big-jump-loop kernel generation
|
||||
|
||||
A no-loop kernel is not generated if the OpenMP teams construct uses a `num_teams` clause. Instead, the compiler attempts to generate a different specialized kernel called the big-jump-loop kernel. The compiler launches the kernel with a grid size determined by the number of teams specified by the OpenMP `num_teams` clause and the `blocksize` chosen either by the compiler or specified by the corresponding OpenMP clause.
|
||||
|
||||
#### Cross-team optimized reduction kernel generation
|
||||
|
||||
If the OpenMP construct has a reduction clause, the compiler attempts to generate optimized code by utilizing efficient cross-team communication. New APIs for cross-team reduction are implemented in the device runtime and are automatically generated by clang.
|
||||
@@ -17,7 +17,7 @@ following section.
|
||||
|
||||
## ROCm component licenses
|
||||
|
||||
ROCm is released by Advanced Micro Devices, Inc. (AMD) and is licensed per component separately.
|
||||
ROCm is released by Advanced Micro Devices, Inc. and is licensed per component separately.
|
||||
The following table is a list of ROCm components with links to their respective license
|
||||
terms. These components may include third party components subject to
|
||||
additional licenses. Please review individual repositories for more information.
|
||||
@@ -25,74 +25,66 @@ additional licenses. Please review individual repositories for more information.
|
||||
<!-- spellcheck-disable -->
|
||||
| Component | License |
|
||||
|:---------------------|:-------------------------|
|
||||
| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/LICENSE.md) |
|
||||
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
|
||||
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
|
||||
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
|
||||
| [AQLprofile](https://github.com/ROCm/rocm-systems/tree/develop/projects/aqlprofile/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/aqlprofile/LICENSE.md) |
|
||||
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
|
||||
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
|
||||
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
|
||||
| [HIP](https://github.com/ROCm/rocm-systems/tree/develop/projects/hip/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/hip/LICENSE.md) |
|
||||
| [hipamd](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/hipamd/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/hipamd/LICENSE.md) |
|
||||
| [hipBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblas/LICENSE.md) |
|
||||
| [hipBLASLt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/LICENSE.md) |
|
||||
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
|
||||
| [hipCUB](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub/) | [Custom](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipcub/LICENSE.txt) |
|
||||
| [hipFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/LICENSE.md) |
|
||||
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
|
||||
| [AMDMIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
|
||||
| [HIPCC](https://github.com/ROCm/HIPCC/blob/develop/LICENSE.txt) | [MIT](https://github.com/ROCm/HIPCC/blob/develop/LICENSE.txt) |
|
||||
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
|
||||
| [hipRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiprand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiprand/LICENSE.md) |
|
||||
| [hipSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsolver/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsolver/LICENSE.md) |
|
||||
| [hipSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparse/LICENSE.md) |
|
||||
| [hipSPARSELt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparselt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparselt/LICENSE.md) |
|
||||
| [hipTensor](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiptensor/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiptensor/LICENSE) |
|
||||
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
|
||||
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
|
||||
| [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
|
||||
| [MIOpen](https://github.com/ROCm/rocm-libraries/tree/develop/projects/miopen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
|
||||
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
|
||||
| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
|
||||
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
|
||||
| [rocBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocblas/LICENSE.md) |
|
||||
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
|
||||
| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
|
||||
| [rocFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/LICENSE.md) |
|
||||
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v3.0](https://github.com/ROCm/ROCgdb/blob/amd-staging/COPYING3) |
|
||||
| [rocJPEG](https://github.com/ROCm/rocJPEG/) | [MIT](https://github.com/ROCm/rocJPEG/blob/develop/LICENSE) |
|
||||
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/develop/LICENSE.txt) |
|
||||
| [MIOpenGEMM](https://github.com/ROCm/MIOpenGEMM/) | [MIT](https://github.com/ROCm/MIOpenGEMM/blob/master/LICENSE.txt) |
|
||||
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/master/LICENSE.txt) |
|
||||
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/master/LICENSE.txt) |
|
||||
| [RCP](https://github.com/GPUOpen-Tools/radeon_compute_profiler/) | [MIT](https://github.com/GPUOpen-Tools/radeon_compute_profiler/blob/master/LICENSE) |
|
||||
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
|
||||
| [rocminfo](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocminfo/License.txt) |
|
||||
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [MIT](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
|
||||
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
|
||||
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
|
||||
| [ROCm-Core](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-core/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-core/LICENSE.md) |
|
||||
| [ROCm Compute Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-compute/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute/LICENSE.md) |
|
||||
| [ROCm Data Center (RDC)](https://github.com/ROCm/rocm-systems/tree/develop/projects/rdc/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rdc/LICENSE.md) |
|
||||
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
|
||||
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/opencl/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/opencl/LICENSE.md) |
|
||||
| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
|
||||
| [ROCm SMI Lib](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-smi-lib/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-smi-lib/LICENSE.md) |
|
||||
| [ROCm Systems Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-systems/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-systems/LICENSE.md) |
|
||||
| [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
|
||||
| [rocPRIM](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocprim/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocprim/LICENSE.md) |
|
||||
| [ROCProfiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler/LICENSE.md) |
|
||||
| [ROCprofiler-SDK](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-sdk/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-sdk/LICENSE.md) |
|
||||
| [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
|
||||
| [rocRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocrand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocrand/LICENSE.md) |
|
||||
| [ROCr Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
|
||||
| [ROCR-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocr-runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocr-runtime/LICENSE.txt) |
|
||||
| [rocSHMEM](https://github.com/ROCm/rocSHMEM/) | [MIT](https://github.com/ROCm/rocSHMEM/blob/develop/LICENSE.md) |
|
||||
| [rocSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsolver/) | [BSD-2-Clause](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsolver/LICENSE.md) |
|
||||
| [rocSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsparse/LICENSE.md) |
|
||||
| [rocThrust](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust/) | [Apache 2.0](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocthrust/LICENSE) |
|
||||
| [ROCTracer](https://github.com/ROCm/rocm-systems/tree/develop/projects/roctracer/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/roctracer/LICENSE.md) |
|
||||
| [rocWMMA](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocwmma/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocwmma/LICENSE.md) |
|
||||
| [Tensile](https://github.com/ROCm/rocm-libraries/tree/develop/shared/tensile/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/shared/tensile/LICENSE.md) |
|
||||
| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |
|
||||
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/master/LICENSE.txt) |
|
||||
| [ROCT-Thunk-Interface](https://github.com/ROCm/ROCT-Thunk-Interface/) | [MIT](https://github.com/ROCm/ROCT-Thunk-Interface/blob/master/LICENSE.md) |
|
||||
| [ROCclr](https://github.com/ROCm/ROCclr/) | [MIT](https://github.com/ROCm/ROCclr/blob/develop/LICENSE.txt) |
|
||||
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-master/LICENSE.txt) |
|
||||
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v2.0](https://github.com/ROCm/ROCgdb/blob/amd-master/COPYING) |
|
||||
| [ROCm-CompilerSupport](https://github.com/ROCm/ROCm-CompilerSupport/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCm-CompilerSupport/blob/amd-stg-open/LICENSE.txt) |
|
||||
| [ROCm-Device-Libs](https://github.com/ROCm/ROCm-Device-Libs/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCm-Device-Libs/blob/amd-stg-open/LICENSE.TXT) |
|
||||
| [ROCm-OpenCL-Runtime/api/opencl/khronos/icd](https://github.com/KhronosGroup/OpenCL-ICD-Loader/) | [Apache 2.0](https://github.com/KhronosGroup/OpenCL-ICD-Loader/blob/main/LICENSE) |
|
||||
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/ROCm-OpenCL-Runtime/) | [MIT](https://github.com/ROCm/ROCm-OpenCL-Runtime/blob/develop/LICENSE.txt) |
|
||||
| [ROCmValidationSuite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
|
||||
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
|
||||
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
|
||||
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
|
||||
| [atmi](https://github.com/ROCm/atmi/) | [MIT](https://github.com/ROCm/atmi/blob/master/LICENSE.txt) |
|
||||
| [clang-ocl](https://github.com/ROCm/clang-ocl/) | [MIT](https://github.com/ROCm/clang-ocl/blob/master/LICENSE) |
|
||||
| [flang](https://github.com/ROCm/flang/) | [Apache 2.0](https://github.com/ROCm/flang/blob/master/LICENSE.txt) |
|
||||
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/master/LICENSE.txt) |
|
||||
| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
|
||||
| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
|
||||
| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
|
||||
| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
|
||||
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
|
||||
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
|
||||
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
|
||||
| [hipamd](https://github.com/ROCm/hipamd/) | [MIT](https://github.com/ROCm/hipamd/blob/develop/LICENSE.txt) |
|
||||
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/master/LICENSE) |
|
||||
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/main/LICENSE.TXT) |
|
||||
| [rccl](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
|
||||
| [rdc](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/master/LICENSE) |
|
||||
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
|
||||
| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
|
||||
| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
|
||||
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
|
||||
| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
|
||||
| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
|
||||
| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
|
||||
| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
|
||||
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
|
||||
| [rocm-cmake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
|
||||
| [rocm_bandwidth_test](https://github.com/ROCm/rocm_bandwidth_test/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
|
||||
| [rocm_smi_lib](https://github.com/ROCm/rocm_smi_lib/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm_smi_lib/blob/master/License.txt) |
|
||||
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/master/License.txt) |
|
||||
| [rocprofiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-master/LICENSE) |
|
||||
| [rocr_debug_agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/master/LICENSE.txt) |
|
||||
| [roctracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
|
||||
| rocm-llvm-alt | [AMD Proprietary License](https://www.amd.com/en/support/amd-software-eula)
|
||||
|
||||
Open sourced ROCm components are released via public GitHub
|
||||
repositories, packages on [https://repo.radeon.com](https://repo.radeon.com) and other distribution channels.
|
||||
Proprietary products are only available on [https://repo.radeon.com](https://repo.radeon.com).
|
||||
repositories, packages on https://repo.radeon.com and other distribution channels.
|
||||
Proprietary products are only available on https://repo.radeon.com. Currently, only
|
||||
one component of ROCm, rocm-llvm-alt is governed by a proprietary license.
|
||||
Proprietary components are organized in a proprietary subdirectory in the package
|
||||
repositories to distinguish from open sourced packages.
|
||||
|
||||
@@ -100,7 +92,7 @@ repositories to distinguish from open sourced packages.
|
||||
The following additional terms and conditions apply to your use of ROCm technical documentation.
|
||||
```
|
||||
|
||||
©2023 - 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
©2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
The information presented in this document is for informational purposes only
|
||||
and may contain technical inaccuracies, omissions, and typographical errors. The
|
||||
@@ -132,16 +124,19 @@ companies.
|
||||
### Package licensing
|
||||
|
||||
:::{attention}
|
||||
ROCprof Trace Decoder and AOCC CPU optimizations are provided in binary form, subject to the license agreement enclosed on [GitHub](https://github.com/ROCm/rocprof-trace-decoder/blob/amd-mainline/LICENSE) for ROCprof Trace Decoder, and [Developer Central](https://www.amd.com/en/developer/aocc.html) for AOCC. By using, installing,
|
||||
copying or distributing ROCprof Trace Decoder or AOCC CPU Optimizations, you agree to
|
||||
AQL Profiler and AOCC CPU optimization are both provided in binary form, each
|
||||
subject to the license agreement enclosed in the directory for the binary and is
|
||||
available here: `/opt/rocm/share/doc/rocm-llvm-alt/EULA`. By using, installing,
|
||||
copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
|
||||
the terms and conditions of this license agreement. If you do not agree to the
|
||||
terms of this agreement, do not install, copy or use ROCprof Trace Decoder or the
|
||||
terms of this agreement, do not install, copy or use the AQL Profiler and/or the
|
||||
AOCC CPU Optimizations.
|
||||
:::
|
||||
|
||||
For the rest of the ROCm packages, you can find the licensing information at the
|
||||
following location: `/opt/rocm/share/doc/<component-name>/` or in the locations
|
||||
specified in the preceding table.
|
||||
following location: `/opt/rocm/share/doc/<component-name>/`
|
||||
|
||||
For example, you can fetch the licensing information of the `amd_comgr`
|
||||
component (Code Object Manager) from the `/opt/rocm/share/doc/amd_comgr/LICENSE.txt` file.
|
||||
For example, you can fetch the licensing information of the `_amd_comgr_`
|
||||
component (Code Object Manager) from the `amd_comgr` folder. A file named
|
||||
`LICENSE.txt` contains the license details at:
|
||||
`/opt/rocm-5.4.3/share/doc/amd_comgr/LICENSE.txt`
|
||||
|
||||
@@ -1,136 +0,0 @@
|
||||
ROCm Version,7.2.0,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility-past-60]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
|
||||
,"Debian 13, 12","Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
|
||||
,,,,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
|
||||
,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
|
||||
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility-past-60]_,gfx950,gfx950,gfx950,gfx950,gfx950,,,,,,,,,,,,,,,,,,
|
||||
,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,,,,,,,,,,,,,,,
|
||||
,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,,,,,,,,,,,,,,,
|
||||
,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,,,,,,,,,,,,,,,
|
||||
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942, gfx942, gfx942, gfx942, gfx942, gfx942, gfx942
|
||||
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9.1, 2.8.0, 2.7.1","2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.8.0,0.7.1,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.2,1.23.1,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
Thrust,2.8.5,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.8.5,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.30.0, 30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.15.0,2.14.0,2.14.0,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.5.1,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.5.0,3.4.0,3.4.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocAL <rocal:index>`,2.5.0,2.4.0,2.4.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.5.0,1.4.0,1.4.0,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.3.0,1.2.0,1.2.0,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.8.0,0.7.0,0.7.0,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,2.2.0,2.1.0,2.1.0,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.2.0,3.1.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.2.0,3.1.0,3.1.0,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.2.1,1.1.0,1.1.0,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.22,1.0.21,1.0.21,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.7.1,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.2.0,3.1.0,3.1.0,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.2.0,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.6,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,4.1.0,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,5.2.0,5.1.1,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.36,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,4.2.0,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.32.0,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.2.0,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.2.0,2.1.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.2.0,4.1.0,4.1.0,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.2.0,2.0.0,2.0.0,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.2.0,4.1.0,4.1.0,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.2.0,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.2.26015,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.2.0,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.2.1,26.2.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.4.0,3.3.1,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.3.0,1.2.1,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70200,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.1.0,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70200,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,22.0.0,20.0.0,20.0.0,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.4,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,16.3.0,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,22.0.0.26014,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,22.0.0.26014,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,22.0.0.26014,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.2.26015,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.2.26015,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
|
@@ -1,221 +0,0 @@
|
||||
.. meta::
|
||||
:description: ROCm compatibility matrix
|
||||
:keywords: GPU, architecture, hardware, compatibility, system, requirements, components, libraries
|
||||
|
||||
**************************************************************************************
|
||||
Compatibility matrix
|
||||
**************************************************************************************
|
||||
|
||||
Use this matrix to view the ROCm compatibility and system requirements across successive major and minor releases.
|
||||
|
||||
You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-rocm-compatibility-matrix>`.
|
||||
|
||||
GPUs listed in the following table support compute workloads (no display
|
||||
information or graphics). If you’re using ROCm with AMD Radeon GPUs or Ryzen APUs for graphics
|
||||
workloads, see the :doc:`Use ROCm on Radeon and Ryzen <radeon:index>` to verify
|
||||
compatibility and system requirements.
|
||||
|
||||
.. |br| raw:: html
|
||||
|
||||
<br/>
|
||||
|
||||
.. container:: format-big-table
|
||||
|
||||
.. csv-table::
|
||||
:header: "ROCm Version", "7.2.0", "7.1.1", "6.4.0"
|
||||
:stub-columns: 1
|
||||
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
|
||||
,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 9.5, 9.4"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP6
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8"
|
||||
,"Debian 13, 12","Debian 13, 12",Debian 12
|
||||
,,,Azure Linux 3.0
|
||||
,Rocky Linux 9,Rocky Linux 9,
|
||||
,.. _architecture-support-compatibility-matrix:,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
|
||||
,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,
|
||||
,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix:,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility]_,gfx950,gfx950,
|
||||
,gfx1201,gfx1201,
|
||||
,gfx1200,gfx1200,
|
||||
,gfx1101,gfx1101,
|
||||
,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942
|
||||
,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908
|
||||
,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9.1, 2.8.0, 2.7.1","2.9, 2.8, 2.7","2.6, 2.5, 2.4, 2.3"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.8.0,0.7.1,0.4.35
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,b5997
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat]_,N/A,v0.2.5,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.2,1.23.1,1.20.0
|
||||
,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0
|
||||
,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
|
||||
Thrust,2.8.5,2.8.5,2.5.0
|
||||
CUB,2.8.5,2.8.5,2.5.0
|
||||
,,,
|
||||
DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.30.0, 30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.2.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.15.0,2.14.0,2.12.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.4.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.5.0,3.4.0,3.2.0
|
||||
:doc:`rocAL <rocal:index>`,2.5.0,2.4.0,2.2.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.5.0,1.4.0,0.10.0
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.3.0,1.2.0,0.8.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.8.0,0.7.0,0.3.1
|
||||
:doc:`RPP <rpp:index>`,2.2.0,2.1.0,1.9.10
|
||||
,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.22.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.2.0,3.1.0,2.0.0
|
||||
,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.2.0,3.1.0,2.4.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.2.1,1.1.0,0.12.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.22,1.0.21,1.0.18
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.6.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,2.12.0
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.2.0,3.1.0,2.4.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.2.0,4.1.0,3.2.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.6,0.2.5,0.2.3
|
||||
:doc:`rocALUTION <rocalution:index>`,4.1.0,4.0.1,3.2.2
|
||||
:doc:`rocBLAS <rocblas:index>`,5.2.0,5.1.1,4.4.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.36,1.0.35,1.0.32
|
||||
:doc:`rocRAND <rocrand:index>`,4.2.0,4.1.0,3.3.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.32.0,3.31.0,3.28.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.2.0,4.1.0,3.4.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.2.0,2.1.0,1.7.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
|
||||
,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.2.0,4.1.0,3.4.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.2.0,2.0.0,1.5.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.2.0,4.1.0,3.4.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.2.0,4.1.0,3.3.0
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.2.26015,7.1.52802,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.2.0,7.1.1,6.4.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
|
||||
,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.2.1,26.2.0,25.3.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.3.0,1.1.0
|
||||
,,,
|
||||
PERFORMANCE TOOLS,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.4.0,3.3.1,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.3.0,1.2.1,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70200,2.0.70101,2.0.60400
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.1.0,1.0.0,0.6.0
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70200,4.1.70101,4.1.60400
|
||||
,,,
|
||||
DEVELOPMENT TOOLS,,,
|
||||
:doc:`HIPIFY <hipify:index>`,22.0.0,20.0.0,19.0.0
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.2
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4
|
||||
,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix:,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
|
||||
`Flang <https://github.com/ROCm/flang>`_,22.0.0.26014,20.0.025444,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,22.0.0.26014,20.0.025444,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,22.0.0.26014,20.0.025444,19.0.0.25133
|
||||
,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.2.26015,7.1.52802,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.2.26015,7.1.52802,6.4.43482
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
|
||||
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#os-compatibility] Some operating systems are supported on specific GPUs. For detailed information about operating systems supported on ROCm 7.2.0, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
|
||||
.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information about GPUs supporting ROCm 7.2.0, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
|
||||
.. [#dgl_compat] DGL is only supported on ROCm 7.0.0, 6.4.3 and 6.4.0.
|
||||
.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat] FlashInfer is only supported on ROCm 7.1.1 and 6.4.1.
|
||||
.. [#mi325x_KVM] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
.. _OS-kernel-versions:
|
||||
|
||||
Operating systems, kernel and Glibc versions
|
||||
*********************************************
|
||||
|
||||
For detailed information on operating system supported on ROCm 7.2.0 and associated Kernel and Glibc version, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
* See `Red Hat Enterprise Linux Release Dates <https://access.redhat.com/articles/3078>`_ to learn about the specific kernel versions supported on Red Hat Enterprise Linux (RHEL).
|
||||
* See `List of SUSE Linux Enterprise Server kernel <https://www.suse.com/support/kb/doc/?id=000019587>`_ to learn about the specific kernel version supported on SUSE Linux Enterprise Server (SLES).
|
||||
..
|
||||
Footnotes and ref anchors in below historical tables should be appended with "-past-60", to differentiate from the
|
||||
footnote references in the above, latest, compatibility matrix. It also allows to easily find & replace.
|
||||
An easy way to work is to download the historical.CSV file, and update open it in excel. Then when content is ready,
|
||||
delete the columns you don't need, to build the current compatibility matrix to use in above table. Find & replace all
|
||||
instances of "-past-60" to make it ready for above table.
|
||||
|
||||
|
||||
.. _past-rocm-compatibility-matrix:
|
||||
|
||||
Past versions of ROCm compatibility matrix
|
||||
***************************************************
|
||||
|
||||
Expand for full historical view of:
|
||||
|
||||
.. dropdown:: ROCm 6.0 - Present
|
||||
|
||||
You can `download the entire .csv <../downloads/compatibility-matrix-historical-6.0.csv>`_ for offline reference.
|
||||
|
||||
.. csv-table::
|
||||
:file: compatibility-matrix-historical-6.0.csv
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#os-compatibility-past-60] Some operating systems are supported on specific GPUs. For detailed information, see :ref:`supported_distributions` and select the required ROCm version for version specific support.
|
||||
.. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see :ref:`supported_GPUs` and select the required ROCm version for version specific support.
|
||||
.. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#verl_compat-past-60] verl is only supported on ROCm 7.0.0 and 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is only supported on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is only supported on ROCm 7.0.0, 6.4.3 and 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is only supported on ROCm 6.3.0.
|
||||
.. [#ray_compat-past-60] Ray is only supported on ROCm 7.0.0 and 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is only supported on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is only supported on ROCm 7.1.1 and 6.4.1.
|
||||
.. [#mi325x_KVM-past-60] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
@@ -1,364 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Deep Graph Library (DGL) compatibility
|
||||
:keywords: GPU, CPU, deep graph library, DGL, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
DGL compatibility
|
||||
********************************************************************************
|
||||
|
||||
Deep Graph Library (`DGL <https://www.dgl.ai/>`__) is an easy-to-use, high-performance, and scalable
|
||||
Python package for deep learning on graphs. DGL is framework agnostic, meaning
|
||||
that if a deep graph model is a component in an end-to-end application, the rest of
|
||||
the logic is implemented using PyTorch.
|
||||
|
||||
DGL provides a high-performance graph object that can reside on either CPUs or GPUs.
|
||||
It bundles structural data features for better control and provides a variety of functions
|
||||
for computing with graph objects, including efficient and customizable message passing
|
||||
primitives for Graph Neural Networks.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of DGL is maintained in the official `https://github.com/ROCm/dgl
|
||||
<https://github.com/ROCm/dgl>`__ repository, which differs from the
|
||||
`https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`__ upstream repository.
|
||||
|
||||
- To get started and install DGL on ROCm, use the prebuilt :ref:`Docker images <dgl-docker-compat>`,
|
||||
which include ROCm, DGL, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__
|
||||
for additional context.
|
||||
|
||||
.. _dgl-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest available DGL version from the official Docker Hub.
|
||||
Click the |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- DGL
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.8.0/images/sha256-943698ddf54c22a7bcad2e5b4ff467752e29e4ba6d0c926789ae7b242cbd92dd"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.8.0 <https://github.com/pytorch/pytorch/releases/tag/v2.8.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-b2ec286a035eb7d0a6aab069561914d21a3cac462281e9c024501ba5ccedfbf7"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu22.04_py3.10_pytorch_2.7.1/images/sha256-d27aee16df922ccf0bcd9107bfcb6d20d34235445d456c637e33ca6f19d11a51"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.7.1 <https://github.com/pytorch/pytorch/releases/tag/v2.7.1>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm6.4.3_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-f3ba6a3c9ec9f6c1cde28449dc9780e0c4c16c4140f4b23f158565fbfd422d6b"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.3.0 <https://github.com/pytorch/pytorch/releases/tag/v2.3.0>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
|
||||
.. _dgl-key-rocm-libraries:
|
||||
|
||||
Key ROCm libraries for DGL
|
||||
================================================================================
|
||||
|
||||
DGL on ROCm depends on specific libraries that affect its features and performance.
|
||||
Using the DGL Docker container or building it with the provided Docker file or a ROCm base image is recommended.
|
||||
If you prefer to build it yourself, ensure the following dependencies are installed:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- ROCm 7.0.0 Version
|
||||
- ROCm 6.4.x Version
|
||||
- Purpose
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- 1.1.0
|
||||
- 1.1.0
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 3.0.0
|
||||
- 2.4.0
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 1.0.0
|
||||
- 0.12.0
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 4.0.0
|
||||
- 3.4.0
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.20
|
||||
- 1.0.18
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- 3.0.0
|
||||
- 2.12.0
|
||||
- Provides fast random number generation for GPUs.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 3.0.0
|
||||
- 2.4.0
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 4.0.1
|
||||
- 3.2.0
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- 0.2.4
|
||||
- 0.2.3
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- 2.0.0
|
||||
- 1.5.0
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.5.0
|
||||
- 3.4.0
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- 2.13.0
|
||||
- 2.12.0
|
||||
- Adds graph-level optimizations, ONNX models and mixed precision support
|
||||
and enable Ahead-of-Time (AOT) Compilation.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- 3.3.0
|
||||
- 3.2.0
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
- 3.3.0
|
||||
- 2.2.0
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.26.6
|
||||
- 2.22.3
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- 1.0.0
|
||||
- 0.10.0
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- 1.1.0
|
||||
- 0.8.0
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- 2.0.0
|
||||
- 1.9.10
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 4.0.0
|
||||
- 3.3.0
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- 2.0.0
|
||||
- 1.7.0
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
|
||||
.. _dgl-supported-features-latest:
|
||||
|
||||
Supported features with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
Many functions and methods available upstream are also supported in DGL on ROCm.
|
||||
Instead of listing them all, support is grouped into the following categories to provide a general overview.
|
||||
|
||||
* DGL Base
|
||||
* DGL Backend
|
||||
* DGL Data
|
||||
* DGL Dataloading
|
||||
* DGL Graph
|
||||
* DGL Function
|
||||
* DGL Ops
|
||||
* DGL Sampling
|
||||
* DGL Transforms
|
||||
* DGL Utils
|
||||
* DGL Distributed
|
||||
* DGL Geometry
|
||||
* DGL Mpops
|
||||
* DGL NN
|
||||
* DGL Optim
|
||||
* DGL Sparse
|
||||
* GraphBolt
|
||||
|
||||
.. _dgl-unsupported-features-latest:
|
||||
|
||||
Unsupported features with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
* TF32 Support (only supported for PyTorch 2.7 and above)
|
||||
* Kineto/ROCTracer integration
|
||||
|
||||
.. _dgl-unsupported-functions:
|
||||
|
||||
Unsupported functions with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
* ``bfs``
|
||||
* ``format``
|
||||
* ``multiprocess_sparse_adam_state_dict``
|
||||
* ``half_spmm``
|
||||
* ``segment_mm``
|
||||
* ``gather_mm_idx_b``
|
||||
* ``sample_labors_prob``
|
||||
* ``sample_labors_noprob``
|
||||
* ``sparse_admin``
|
||||
|
||||
.. _dgl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
DGL can be used for Graph Learning, and building popular graph models like
|
||||
GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:
|
||||
|
||||
- Recommender systems
|
||||
- Network Optimization and Analysis
|
||||
- 1D (Temporal) and 2D (Image) Classification
|
||||
- Drug Discovery
|
||||
|
||||
For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
* Although multiple use cases of DGL have been tested and verified, a few have been
|
||||
outlined in the `DGL in the Real World: Running GNNs on Real Use Cases
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog
|
||||
post, which walks through four real-world graph neural network (GNN) workloads
|
||||
implemented with the Deep Graph Library on ROCm. It covers tasks ranging from
|
||||
heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph
|
||||
regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use
|
||||
case, the authors detail: the dataset and task, how DGL is used, and their experience
|
||||
porting to ROCm. It is shown that DGL codebases often run without modification, with
|
||||
seamless integration of graph operations, message passing, sampling, and convolution.
|
||||
|
||||
* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__
|
||||
blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform,
|
||||
bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges
|
||||
the gap between dense tensor frameworks and the irregular nature of graph data through a
|
||||
graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and
|
||||
interoperability across frameworks like PyTorch and TensorFlow. AMD’s ROCm integration
|
||||
enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers
|
||||
and open-source repositories. This marks a major step in AMD's mission to advance open,
|
||||
scalable AI ecosystems beyond traditional architectures.
|
||||
|
||||
You can pre-process datasets and begin training on AMD GPUs through:
|
||||
|
||||
* Single-GPU training/inference
|
||||
* Multi-GPU training
|
||||
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/dgl-history` to find documentation for previous releases
|
||||
of the ``ROCm/dgl`` Docker image.
|
||||
@@ -1,113 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: FlashInfer compatibility
|
||||
:keywords: GPU, LLM, FlashInfer, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
FlashInfer compatibility
|
||||
********************************************************************************
|
||||
|
||||
`FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator
|
||||
for Large Language Models (LLMs) that provides a high-performance implementation of graphics
|
||||
processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well
|
||||
as advanced performance across diverse scenarios.
|
||||
|
||||
FlashInfer features highly efficient attention kernels, load-balanced scheduling, and memory-optimized
|
||||
techniques, while supporting customized attention variants. It’s compatible with ``torch.compile``, and
|
||||
offers high-performance LLM-specific operators, with easy integration through PyTorch, and C++ APIs.
|
||||
|
||||
.. note::
|
||||
|
||||
The ROCm port of FlashInfer is under active development, and some features are not yet available.
|
||||
For the latest feature compatibility matrix, refer to the ``README`` of the
|
||||
`https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of FlashInfer is maintained in the official `https://github.com/ROCm/flashinfer
|
||||
<https://github.com/ROCm/flashinfer>`__ repository, which differs from the
|
||||
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`__
|
||||
upstream repository.
|
||||
|
||||
- To get started and install FlashInfer on ROCm, use the prebuilt :ref:`Docker images <flashinfer-docker-compat>`,
|
||||
which include ROCm, FlashInfer, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
.. _flashinfer-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available FlashInfer version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- FlashInfer
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/flashinfer/flashinfer-0.2.5.amd2_rocm7.1.1_ubuntu24.04_py3.12_pytorch2.8/images/sha256-9ab6426750a11dbab9bcddeaccaf492683bfd96a1d60b21dd9fc3a609a98175b"><i class="fab fa-docker fa-lg"></i> rocm/flashinfer</a>
|
||||
- `7.1.1 <https://repo.radeon.com/rocm/apt/7.1.1/>`__
|
||||
- `v0.2.5 <https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.5>`__
|
||||
- `2.8.0 <https://github.com/ROCm/pytorch/releases/tag/v2.8.0>`__
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI325X, MI300X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/flashinfer/flashinfer-0.2.5_rocm6.4_ubuntu24.04_py3.12_pytorch2.7/images/sha256-558914838821c88c557fb6d42cfbc1bdb67d79d19759f37c764a9ee801f93313"><i class="fab fa-docker fa-lg"></i> rocm/flashinfer</a>
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- `v0.2.5 <https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.5>`__
|
||||
- `2.7.1 <https://github.com/ROCm/pytorch/releases/tag/v2.7.1>`__
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X
|
||||
|
||||
.. _flashinfer-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
FlashInfer on ROCm enables you to perform LLM inference for both prefill and decode:
|
||||
during prefill, your model efficiently processes input prompts to build KV caches
|
||||
and internal activations; during decode, it generates tokens sequentially based on
|
||||
prior outputs and context. Use the attention mode supported upstream (Multi-Head
|
||||
Attention, Grouped-Query Attention, or Multi-Query Attention) that matches your
|
||||
model configuration.
|
||||
|
||||
FlashInfer on ROCm also includes capabilities such as load balancing,
|
||||
sparse and dense attention optimizations, and single and batch decode, alongside
|
||||
prefill for high‑performance execution on MI300X GPUs.
|
||||
|
||||
For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/search.html?q=flashinfer>`__,
|
||||
where you can search for examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/flashinfer-history` to find documentation for previous releases
|
||||
of the ``ROCm/flashinfer`` Docker image.
|
||||
@@ -1,349 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: JAX compatibility
|
||||
:keywords: GPU, JAX, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
JAX compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`JAX <https://docs.jax.dev/en/latest/notebooks/thinking_in_jax.html>`__ is a library
|
||||
for array-oriented numerical computation (similar to NumPy), with automatic differentiation
|
||||
and just-in-time (JIT) compilation to enable high-performance machine learning research.
|
||||
|
||||
JAX provides an API that combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale. JAX uses composable transformations of Python and NumPy through
|
||||
JIT compilation, automatic vectorization, and parallelization.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of JAX is maintained in the official `https://github.com/ROCm/rocm-jax
|
||||
<https://github.com/ROCm/rocm-jax>`__ repository, which differs from the
|
||||
`https://github.com/jax-ml/jax <https://github.com/jax-ml/jax>`__ upstream repository.
|
||||
|
||||
- To get started and install JAX on ROCm, use the prebuilt :ref:`Docker images <jax-docker-compat>`,
|
||||
which include ROCm, JAX, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax/tags>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community/tags>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
|
||||
JAX Plugin-PJRT with JAX/JAXLIB compatibility
|
||||
================================================================================
|
||||
|
||||
Portable JIT Runtime (PJRT) is an open, stable interface for device runtime and
|
||||
compiler. The following table details the ROCm version compatibility matrix
|
||||
between JAX Plugin–PJRT and JAX/JAXLIB.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - JAX Plugin-PJRT
|
||||
- JAX/JAXLIB
|
||||
- ROCm
|
||||
* - 0.8.0
|
||||
- 0.8.0
|
||||
- 7.2.0
|
||||
* - 0.7.1
|
||||
- 0.7.1
|
||||
- 7.1.1, 7.1.0
|
||||
* - 0.6.0
|
||||
- 0.6.2, 0.6.0
|
||||
- 7.0.2, 7.0.1, 7.0.0
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
|
||||
blog explores the implementation and training of a Generative Pre-trained
|
||||
Transformer (GPT) model in JAX, inspired by Andrej Karpathy’s JAX-based
|
||||
nanoGPT. Comparing how essential GPT components—such as self-attention
|
||||
mechanisms and optimizers—are realized in JAX and JAX, also highlights
|
||||
JAX’s unique features.
|
||||
|
||||
* The `Optimize GPT Training: Enabling Mixed Precision Training in JAX using
|
||||
ROCm on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-mixed-precision/README.html>`_
|
||||
blog post provides a comprehensive guide on enhancing the training efficiency
|
||||
of GPT models by implementing mixed precision techniques in JAX, specifically
|
||||
tailored for AMD GPUs utilizing the ROCm platform.
|
||||
|
||||
* The `Supercharging JAX with Triton Kernels on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/jax-triton/README.html>`_
|
||||
blog demonstrates how to develop a custom fused dropout-activation kernel for
|
||||
matrices using Triton, integrate it with JAX, and benchmark its performance
|
||||
using ROCm.
|
||||
|
||||
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
|
||||
outlines the process of fine-tuning a Bidirectional Encoder Representations
|
||||
from Transformers (BERT)-based large language model (LLM) using JAX for a text
|
||||
classification task. The blog post discusses techniques for parallelizing the
|
||||
fine-tuning across multiple AMD GPUs and assess the model's performance on a
|
||||
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
|
||||
and the General Language Understanding Evaluation (GLUE) benchmark dataset was
|
||||
used on a multi-GPU setup.
|
||||
|
||||
* The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
|
||||
provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
|
||||
GPU using ROCm. The page is aimed at helping users achieve optimal
|
||||
performance for deep learning and other high-performance computing tasks on
|
||||
the MI300X GPU.
|
||||
|
||||
For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.blogs.amd.com/blog/tag/jax.html>`_.
|
||||
|
||||
.. _jax-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD validates and publishes `JAX images <https://hub.docker.com/r/rocm/jax/tags>`__
|
||||
with ROCm backends on Docker Hub.
|
||||
|
||||
For ``jax-community`` images, see `rocm/jax-community
|
||||
<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
|
||||
|
||||
To find the right image tag, see the :ref:`JAX on ROCm installation
|
||||
documentation <rocm-install-on-linux:jax-docker-support>` for a list of
|
||||
available ``rocm/jax`` images.
|
||||
|
||||
.. _key_rocm_libraries:
|
||||
|
||||
Key ROCm libraries for JAX
|
||||
================================================================================
|
||||
|
||||
The following ROCm libraries represent potential targets that could be utilized
|
||||
by JAX on ROCm for various computational tasks. The actual libraries used will
|
||||
depend on the specific implementation and operations performed.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of hipBLAS, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimized for deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimized for multi-GPU communication for operations like all-reduce,
|
||||
broadcast, and scatter.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
|
||||
.. note::
|
||||
|
||||
This table shows ROCm libraries that could potentially be utilized by JAX. Not
|
||||
all libraries may be used in every configuration, and the actual library usage
|
||||
will depend on the specific operations and implementation details.
|
||||
|
||||
Supported data types and modules
|
||||
===============================================================================
|
||||
|
||||
The following tables lists the supported public JAX API data types and modules.
|
||||
|
||||
Supported data types
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
ROCm supports all the JAX data types of `jax.dtypes <https://docs.jax.dev/en/latest/jax.dtypes.html>`_
|
||||
module, `jax.numpy.dtype <https://docs.jax.dev/en/latest/_autosummary/jax.numpy.dtype.html>`_
|
||||
and `default_dtype <https://docs.jax.dev/en/latest/default_dtypes.html>`_ .
|
||||
The ROCm supported data types in JAX are collected in the following table.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
|
||||
* - ``bfloat16``
|
||||
- 16-bit bfloat (brain floating point).
|
||||
|
||||
* - ``bool``
|
||||
- Boolean.
|
||||
|
||||
* - ``complex128``
|
||||
- 128-bit complex.
|
||||
|
||||
* - ``complex64``
|
||||
- 64-bit complex.
|
||||
|
||||
* - ``float16``
|
||||
- 16-bit (half precision) floating-point.
|
||||
|
||||
* - ``float32``
|
||||
- 32-bit (single precision) floating-point.
|
||||
|
||||
* - ``float64``
|
||||
- 64-bit (double precision) floating-point.
|
||||
|
||||
* - ``half``
|
||||
- 16-bit (half precision) floating-point.
|
||||
|
||||
* - ``int16``
|
||||
- Signed 16-bit integer.
|
||||
|
||||
* - ``int32``
|
||||
- Signed 32-bit integer.
|
||||
|
||||
* - ``int64``
|
||||
- Signed 64-bit integer.
|
||||
|
||||
* - ``int8``
|
||||
- Signed 8-bit integer.
|
||||
|
||||
* - ``uint16``
|
||||
- Unsigned 16-bit (word) integer.
|
||||
|
||||
* - ``uint32``
|
||||
- Unsigned 32-bit (dword) integer.
|
||||
|
||||
* - ``uint64``
|
||||
- Unsigned 64-bit (qword) integer.
|
||||
|
||||
* - ``uint8``
|
||||
- Unsigned 8-bit (byte) integer.
|
||||
|
||||
.. note::
|
||||
|
||||
JAX data type support is affected by the :ref:`key_rocm_libraries` and it's
|
||||
collected on :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
|
||||
page.
|
||||
|
||||
Supported modules
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
For a complete and up-to-date list of JAX public modules (for example, ``jax.numpy``,
|
||||
``jax.scipy``, ``jax.lax``), their descriptions, and usage, please refer directly to the
|
||||
`official JAX API documentation <https://jax.readthedocs.io/en/latest/jax.html>`_.
|
||||
|
||||
.. note::
|
||||
|
||||
Since version 0.1.56, JAX has full support for ROCm, and the
|
||||
:ref:`Known issues and important notes <jax_comp_known_issues>` section
|
||||
contains details about limitations specific to the ROCm backend. The list of
|
||||
JAX API modules are maintained by the JAX project and is subject to change.
|
||||
Refer to the official Jax documentation for the most up-to-date information.
|
||||
|
||||
Key features and enhancements for ROCm 7.1
|
||||
===============================================================================
|
||||
|
||||
- Enabled compilation of multihost HLO runner Python bindings.
|
||||
|
||||
- Backported multihost HLO runner bindings and some related changes to
|
||||
:code:`FunctionalHloRunner`.
|
||||
|
||||
- Added :code:`requirements_lock_3_12` to enable building for Python 3.12.
|
||||
|
||||
- Removed hardcoded NHWC convolution layout for ``fp16`` precision to address the performance drops for ``fp16`` precision on gfx12xx GPUs.
|
||||
|
||||
|
||||
- ROCprofiler-SDK integration:
|
||||
|
||||
- Integrated ROCprofiler-SDK (v3) to XLA to improve profiling of GPU events,
|
||||
support both time-based and step-based profiling.
|
||||
|
||||
- Added unit tests for :code:`rocm_collector` and :code:`rocm_tracer`.
|
||||
|
||||
- Added Triton unsupported conversion from ``f8E4M3FNUZ`` to ``fp16`` with
|
||||
rounding mode.
|
||||
|
||||
- Introduced :code:`CudnnFusedConvDecomposer` to revert fused convolutions
|
||||
when :code:`ConvAlgorithmPicker` fails to find a fused algorithm, and removed
|
||||
unfused fallback paths from :code:`RocmFusedConvRunner`.
|
||||
|
||||
Key features and enhancements for ROCm 7.0
|
||||
===============================================================================
|
||||
|
||||
- Upgraded XLA backend: Integrates a newer XLA version, enabling better
|
||||
optimizations, broader operator support, and potential performance gains.
|
||||
|
||||
- RNN support: Native RNN support (including LSTMs via ``jax.experimental.rnn``)
|
||||
now available on ROCm, aiding sequence model development.
|
||||
|
||||
- Comprehensive linear algebra capabilities: Offers robust ``jax.linalg``
|
||||
operations, essential for scientific and machine learning tasks.
|
||||
|
||||
- Expanded AMD GPU architecture support: Provides ongoing support for gfx1101
|
||||
GPUs and introduces support for gfx950 and gfx12xx GPUs.
|
||||
|
||||
- Mixed FP8 precision support: Enables ``lax.dot_general`` operations with mixed FP8
|
||||
types, offering pathways for memory and compute efficiency.
|
||||
|
||||
- Streamlined PyPi packaging: Provides reliable PyPi wheels for JAX on ROCm,
|
||||
simplifying the installation process.
|
||||
|
||||
- Pallas experimental kernel development: Continued Pallas framework
|
||||
enhancements for custom GPU kernels, including new intrinsics (specific
|
||||
kernel behaviors under review).
|
||||
|
||||
- Improved build system and CI: Enhanced ROCm build system and CI for greater
|
||||
reliability and maintainability.
|
||||
|
||||
- Enhanced distributed computing setup: Improved JAX setup in multi-GPU
|
||||
distributed environments.
|
||||
|
||||
.. _jax_comp_known_issues:
|
||||
|
||||
Known issues and notes for ROCm 7.0
|
||||
===============================================================================
|
||||
|
||||
- ``nn.dot_product_attention``: Certain configurations of ``jax.nn.dot_product_attention``
|
||||
may cause segmentation faults, though the majority of use cases work correctly.
|
||||
|
||||
- SVD with dynamic shapes: SVD on inputs with dynamic/symbolic shapes might result in an error.
|
||||
SVD with static shapes is unaffected.
|
||||
|
||||
- QR decomposition with symbolic shapes: QR decomposition operations may fail when using
|
||||
symbolic/dynamic shapes in shape polymorphic contexts.
|
||||
|
||||
- Pallas kernels: Specific advanced Pallas kernels may exhibit variations in
|
||||
numerical output or resource usage. These are actively reviewed as part of
|
||||
Pallas's experimental development.
|
||||
@@ -1,275 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: llama.cpp compatibility
|
||||
:keywords: GPU, GGML, llama.cpp, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
llama.cpp compatibility
|
||||
********************************************************************************
|
||||
|
||||
`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework
|
||||
for Large Language Model (LLM) inference that runs on both central processing units
|
||||
(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing
|
||||
a simple, dependency-free setup.
|
||||
|
||||
The framework supports multiple quantization options, from 1.5-bit to 8-bit integers,
|
||||
to accelerate inference and reduce memory usage. Originally built as a CPU-first library,
|
||||
llama.cpp is easy to integrate with other programming environments and is widely
|
||||
adopted across diverse platforms, including consumer devices.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of llama.cpp is maintained in the official `https://github.com/ROCm/llama.cpp
|
||||
<https://github.com/ROCm/llama.cpp>`__ repository, which differs from the
|
||||
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`__ upstream repository.
|
||||
|
||||
- To get started and install llama.cpp on ROCm, use the prebuilt :ref:`Docker images <llama-cpp-docker-compat>`,
|
||||
which include ROCm, llama.cpp, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__
|
||||
for additional context.
|
||||
|
||||
.. _llama-cpp-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `llama.cpp images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest available llama.cpp versions from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. important::
|
||||
|
||||
Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:
|
||||
|
||||
- Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` and convert into 4-bit quantization.
|
||||
- Server: This image only includes the server executable file.
|
||||
- Light: This image only includes the main executable file.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Full Docker
|
||||
- Server Docker
|
||||
- Light Docker
|
||||
- llama.cpp
|
||||
- ROCm
|
||||
- Ubuntu
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_full/images/sha256-a94f0c7a598cc6504ff9e8371c016d7a2f93e69bf54a36c870f9522567201f10g"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server/images/sha256-be175932c3c96e882dfbc7e20e0e834f58c89c2925f48b222837ee929dfc47ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_light/images/sha256-d8ba0c70603da502c879b1f8010b439c8e7fa9f6cbdac8bbbbbba97cb41ebc9e"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_full/images/sha256-37582168984f25dce636cc7288298e06d94472ea35f65346b3541e6422b678ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_server/images/sha256-7e70578e6c3530c6591cc2c26da24a9ee68a20d318e12241de93c83224f83720"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_light/images/sha256-9a5231acf88b4a229677bc2c636ea3fe78a7a80f558bd80910b919855de93ad5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- 22.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_full/images/sha256-5960fc850024a8a76451f9eaadd89b7e59981ae9f393b407310c1ddf18892577"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_server/images/sha256-1b79775d9f546065a6aaf9ca426e1dd4ed4de0b8f6ee83687758cc05af6538e6"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_light/images/sha256-8f863c4c2857ae42bebd64e4f1a0a1e7cc3ec4503f243e32b4a4dcad070ec361"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_full/images/sha256-888879b3ee208f9247076d7984524b8d1701ac72611689e89854a1588bec9867"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_server/images/sha256-90e4ff99a66743e33fd00728cd71a768588e5f5ef355aaa196669fe65ac70672"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_light/images/sha256-bd447a049939cb99054f8fbf3f2352870fe906a75e2dc3339c845c08b9c53f9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- 22.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_full/images/sha256-5b3a1bc4889c1fcade434b937fbf9cc1c22ff7dc0317c130339b0c9238bc88c4"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_server/images/sha256-5228ff99d0f627a9032d668f4381b2e80dc1e301adc3e0821f26d8354b175271"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_light/images/sha256-b12723b332a826a89b7252dddf868cbe4d1a869562fc4aa4032f59e1a683b968"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_full/images/sha256-cd6e21a6a73f59b35dd5309b09dd77654a94d783bf13a55c14eb8dbf8e9c2615"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_server/images/sha256-c2b4689ab2c47e6626e8fea22d7a63eb03d47c0fde9f5ef8c9f158d15c423e58"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_light/images/sha256-1acc28f29ed87db9cbda629cb29e1989b8219884afe05f9105522be929e94da4"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
|
||||
- 22.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_full/images/sha256-2f8ae8a44510d96d52dea6cb398b224f7edeb7802df7ec488c6f63d206b3cdc9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_server/images/sha256-fece497ff9f4a28b12f645de52766941da8ead8471aa1ea84b61d4b4568e51f2"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_light/images/sha256-3e14352fa6f8c6128b23cf9342531c20dbfb522550b626e09d83b260a1947022"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_full/images/sha256-80763062ef0bec15038c35fd01267f1fc99a5dd171d4b48583cc668b15efad69"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_server/images/sha256-db2a6c957555ed83b819bbc54aea884a93192da0fb512dae63d32e0dc4e8ab8f"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_light/images/sha256-c6dbb07cc655fb079d5216e4b77451cb64a9daa0585d23b6fb8b32cb22021197"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- 22.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- 24.04
|
||||
- MI300X, MI210
|
||||
|
||||
.. _llama-cpp-key-rocm-libraries:
|
||||
|
||||
Key ROCm libraries for llama.cpp
|
||||
================================================================================
|
||||
|
||||
llama.cpp functionality on ROCm is determined by its underlying library
|
||||
dependencies. These ROCm components affect the capabilities, performance, and
|
||||
feature set available to developers. Ensure you have the required libraries for
|
||||
your corresponding ROCm version.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- ROCm 7.0.0 version
|
||||
- ROCm 6.4.x version
|
||||
- Purpose
|
||||
- Usage
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
|
||||
- 3.0.0
|
||||
- 2.4.0
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Supports operations such as matrix multiplication, matrix-vector
|
||||
products, and tensor contractions. Utilized in both dense and batched
|
||||
linear algebra operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- 1.0.0
|
||||
- 0.12.0
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
- By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipblasLt
|
||||
kernels where possible.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
|
||||
- 2.0.0
|
||||
- 1.7.0
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
- Can be used to enhance the flash attention performance on AMD compute, by enabling
|
||||
the flag during compile time.
|
||||
|
||||
.. _llama-cpp-uses-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
|
||||
|
||||
- Plain C/C++ implementation with no external dependencies
|
||||
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
|
||||
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
|
||||
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
|
||||
|
||||
llama.cpp is also used in a range of real-world applications, including:
|
||||
|
||||
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
|
||||
A simple maze game where AI-controlled agents attempt to trick the player.
|
||||
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
|
||||
A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
|
||||
- Various other AI applications use llama.cpp as their inference engine;
|
||||
for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
|
||||
|
||||
For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
|
||||
blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
|
||||
server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
|
||||
AMD Instinct GPUs within the ROCm ecosystem.
|
||||
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/llama-cpp-history` to find documentation for previous releases
|
||||
of the ``ROCm/llama.cpp`` Docker image.
|
||||
@@ -1,104 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Megablocks compatibility
|
||||
:keywords: GPU, megablocks, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
Megablocks compatibility
|
||||
********************************************************************************
|
||||
|
||||
`Megablocks <https://github.com/databricks/megablocks>`__ is a lightweight library
|
||||
for mixture-of-experts `(MoE) <https://huggingface.co/blog/moe>`__ training.
|
||||
The core of the system is efficient "dropless-MoE" and standard MoE layers.
|
||||
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM
|
||||
<https://github.com/stanford-futuredata/Megatron-LM>`__,
|
||||
where data and pipeline parallel training of MoEs is supported.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of Megablocks is maintained in the official `https://github.com/ROCm/megablocks
|
||||
<https://github.com/ROCm/megablocks>`__ repository, which differs from the
|
||||
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
|
||||
|
||||
- To get started and install Megablocks on ROCm, use the prebuilt :ref:`Docker image <megablocks-docker-compat>`,
|
||||
which includes ROCm, Megablocks, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__
|
||||
for additional context.
|
||||
|
||||
.. _megablocks-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available Megablocks version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Megablocks
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/megablocks/megablocks-0.7.0_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-372ff89b96599019b8f5f9db469c84add2529b713456781fa62eb9a148659ab4"><i class="fab fa-docker fa-lg"></i> rocm/megablocks</a>
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `0.7.0 <https://github.com/databricks/megablocks/releases/tag/v0.7.0>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- MI300X
|
||||
|
||||
Supported models and features with ROCm 6.3.0
|
||||
================================================================================
|
||||
|
||||
This section summarizes the Megablocks features supported by ROCm.
|
||||
|
||||
* Distributed Pre-training
|
||||
* Activation Checkpointing and Recomputation
|
||||
* Distributed Optimizer
|
||||
* Mixture-of-Experts
|
||||
* dropless-Mixture-of-Experts
|
||||
|
||||
.. _megablocks-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
|
||||
blog post guides how to leverage the ROCm platform for pre-training using the
|
||||
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
|
||||
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
|
||||
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
|
||||
training. The guide provides step-by-step instructions for setting up the environment,
|
||||
including cloning the repository, building the Docker image, and running the training container.
|
||||
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
|
||||
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
|
||||
training workflows for large-scale transformer models.
|
||||
|
||||
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
@@ -1,498 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
PyTorch compatibility
|
||||
********************************************************************************
|
||||
|
||||
`PyTorch <https://pytorch.org/>`__ is an open-source tensor library designed for
|
||||
deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
|
||||
using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
|
||||
`RCCL <https://github.com/ROCm/rccl>`__ libraries.
|
||||
|
||||
PyTorch provides two high-level features:
|
||||
|
||||
- Tensor computation (like NumPy) with strong GPU acceleration
|
||||
|
||||
- Deep neural networks built on a tape-based autograd system (rapid computation
|
||||
of multiple partial derivatives or gradients)
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository.
|
||||
ROCm development is aligned with the stable release of PyTorch, while upstream
|
||||
PyTorch testing uses the stable release of ROCm to maintain consistency:
|
||||
|
||||
- The ROCm-supported version of PyTorch is maintained in the official `https://github.com/ROCm/pytorch
|
||||
<https://github.com/ROCm/pytorch>`__ repository, which differs from the
|
||||
`https://github.com/pytorch/pytorch <https://github.com/pytorch/pytorch>`__ upstream repository.
|
||||
|
||||
- To get started and install PyTorch on ROCm, use the prebuilt :ref:`Docker images <pytorch-docker-compat>`,
|
||||
which include ROCm, PyTorch, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://pytorch.org/get-started/locally/>`__ or
|
||||
`Previous versions <https://pytorch.org/get-started/previous-versions/>`__ for additional context.
|
||||
|
||||
PyTorch includes tooling that generates HIP source code from the CUDA backend.
|
||||
This approach allows PyTorch to support ROCm without requiring manual code
|
||||
modifications. For more information, see :doc:`HIPIFY <hipify:index>`.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
AMD releases official `ROCm PyTorch Docker images <https://hub.docker.com/r/rocm/pytorch/tags>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
|
||||
.. _pytorch-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* :doc:`Using ROCm for AI: training a model </how-to/rocm-for-ai/training/benchmark-docker/pytorch-training>`
|
||||
guides how to leverage the ROCm platform for training AI models. It covers the
|
||||
steps, tools, and best practices for optimizing training workflows on AMD GPUs
|
||||
using PyTorch features.
|
||||
|
||||
* :doc:`Single-GPU fine-tuning and inference </how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference>`
|
||||
describes and demonstrates how to use the ROCm platform for the fine-tuning
|
||||
and inference of machine learning models, particularly large language models
|
||||
(LLMs), on systems with a single GPU. This topic provides a detailed guide for
|
||||
setting up, optimizing, and executing fine-tuning and inference workflows in
|
||||
such environments.
|
||||
|
||||
* :doc:`Multi-GPU fine-tuning and inference optimization </how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference>`
|
||||
describes and demonstrates the fine-tuning and inference of machine learning
|
||||
models on systems with multiple GPUs.
|
||||
|
||||
* The :doc:`Instinct MI300X workload optimization guide </how-to/rocm-for-ai/inference-optimization/workload>`
|
||||
provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
|
||||
GPU using ROCm. This guide helps users achieve optimal performance for
|
||||
deep learning and other high-performance computing tasks on the MI300X
|
||||
GPU.
|
||||
|
||||
* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
|
||||
describes how PyTorch integrates with ROCm for AI workloads. It outlines the
|
||||
use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
|
||||
GPU hardware for training and inference tasks in AI applications.
|
||||
|
||||
For more use cases and recommendations, see `ROCm PyTorch blog posts <https://rocm.blogs.amd.com/blog/tag/pytorch.html>`__.
|
||||
|
||||
.. _pytorch-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch/tags>`__
|
||||
with ROCm backends on Docker Hub.
|
||||
|
||||
To find the right image tag, see the :ref:`PyTorch on ROCm installation
|
||||
documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
|
||||
available ``rocm/pytorch`` images.
|
||||
|
||||
Key ROCm libraries for PyTorch
|
||||
================================================================================
|
||||
|
||||
PyTorch functionality on ROCm is determined by its underlying library
|
||||
dependencies. These ROCm components affect the capabilities, performance, and
|
||||
feature set available to developers.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`__
|
||||
- :version-ref:`"Composable Kernel" rocm_version`
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
- Speeds up ``torch.permute``, ``torch.view``, ``torch.matmul``,
|
||||
``torch.mm``, ``torch.bmm``, ``torch.nn.Conv2d``, ``torch.nn.Conv3d``
|
||||
and ``torch.nn.MultiheadAttention``.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Supports operations such as matrix multiplication, matrix-vector
|
||||
products, and tensor contractions. Utilized in both dense and batched
|
||||
linear algebra operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
- Accelerates operations such as ``torch.matmul``, ``torch.mm``, and the
|
||||
matrix multiplications used in convolutional and linear layers.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`__
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations such as ``torch.sum``, ``torch.cumsum``,
|
||||
``torch.sort`` irregular shapes often involve scanning, sorting, and
|
||||
filtering, which hipCUB handles efficiently.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`__
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
- Used in functions like the ``torch.fft`` module.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`__
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
- The ``torch.rand``, ``torch.randn``, and stochastic layers like
|
||||
``torch.nn.Dropout`` rely on hipRAND.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`__
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
- Supports functions like ``torch.linalg.solve``,
|
||||
``torch.linalg.eig``, and ``torch.linalg.svd``.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`__
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`__
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
- Sparse tensor operations ``torch.sparse``.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`__
|
||||
- :version-ref:`hipTensor rocm_version`
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
- Accelerates tensor algebra, especially in deep learning and scientific
|
||||
computing.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`__
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs), recurrent neural
|
||||
networks (RNNs), and other layers. Used in operations like
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`__
|
||||
- :version-ref:`MIGraphX rocm_version`
|
||||
- Adds graph-level optimizations, ONNX models and mixed precision support
|
||||
and enable Ahead-of-Time (AOT) Compilation.
|
||||
- Speeds up inference models and executes ONNX models for
|
||||
compatibility with other frameworks.
|
||||
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`__
|
||||
- :version-ref:`MIVisionX rocm_version`
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
- Faster data preprocessing and augmentation pipelines for datasets like
|
||||
ImageNet or COCO and easy to integrate into PyTorch's ``torch.utils.data``
|
||||
and ``torchvision`` workflows.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`__
|
||||
- :version-ref:`rocAL rocm_version`
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`__
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``torch.nn.parallel.DistributedDataParallel``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`__
|
||||
- :version-ref:`rocDecode rocm_version`
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
- Can be integrated in ``torch.utils.data``, ``torchvision.transforms``
|
||||
and ``torch.distributed``.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`__
|
||||
- :version-ref:`rocJPEG rocm_version`
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
- GPU accelerated ``torchvision.io.decode_jpeg`` and
|
||||
``torchvision.io.encode_jpeg`` and can be integrated in
|
||||
``torch.utils.data`` and ``torchvision``.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`__
|
||||
- :version-ref:`RPP rocm_version`
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
- Easy to integrate into PyTorch's ``torch.utils.data`` and
|
||||
``torchvision`` data load workloads to speed up data processing.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`__
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Utilized in backend operations for tensor computations requiring
|
||||
parallel processing.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
- Linear layers (``torch.nn.Linear``), convolutional layers
|
||||
(``torch.nn.Conv2d``), attention layers, general tensor operations that
|
||||
involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
|
||||
more.
|
||||
|
||||
Supported modules and data types
|
||||
================================================================================
|
||||
|
||||
The following section outlines the supported data types, modules, and domain
|
||||
libraries available in PyTorch on ROCm.
|
||||
|
||||
Supported data types
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The tensor data type is specified using the ``dtype`` attribute or argument.
|
||||
PyTorch supports many data types for different use cases.
|
||||
|
||||
The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`__
|
||||
single data types:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
* - ``torch.float8_e4m3fn``
|
||||
- 8-bit floating point, e4m3
|
||||
* - ``torch.float8_e5m2``
|
||||
- 8-bit floating point, e5m2
|
||||
* - ``torch.float16`` or ``torch.half``
|
||||
- 16-bit floating point
|
||||
* - ``torch.bfloat16``
|
||||
- 16-bit floating point
|
||||
* - ``torch.float32`` or ``torch.float``
|
||||
- 32-bit floating point
|
||||
* - ``torch.float64`` or ``torch.double``
|
||||
- 64-bit floating point
|
||||
* - ``torch.complex32`` or ``torch.chalf``
|
||||
- 32-bit complex numbers
|
||||
* - ``torch.complex64`` or ``torch.cfloat``
|
||||
- 64-bit complex numbers
|
||||
* - ``torch.complex128`` or ``torch.cdouble``
|
||||
- 128-bit complex numbers
|
||||
* - ``torch.uint8``
|
||||
- 8-bit integer (unsigned)
|
||||
* - ``torch.uint16``
|
||||
- 16-bit integer (unsigned);
|
||||
Not natively supported in ROCm
|
||||
* - ``torch.uint32``
|
||||
- 32-bit integer (unsigned);
|
||||
Not natively supported in ROCm
|
||||
* - ``torch.uint64``
|
||||
- 64-bit integer (unsigned);
|
||||
Not natively supported in ROCm
|
||||
* - ``torch.int8``
|
||||
- 8-bit integer (signed)
|
||||
* - ``torch.int16`` or ``torch.short``
|
||||
- 16-bit integer (signed)
|
||||
* - ``torch.int32`` or ``torch.int``
|
||||
- 32-bit integer (signed)
|
||||
* - ``torch.int64`` or ``torch.long``
|
||||
- 64-bit integer (signed)
|
||||
* - ``torch.bool``
|
||||
- Boolean
|
||||
* - ``torch.quint8``
|
||||
- Quantized 8-bit integer (unsigned)
|
||||
* - ``torch.qint8``
|
||||
- Quantized 8-bit integer (signed)
|
||||
* - ``torch.qint32``
|
||||
- Quantized 32-bit integer (signed)
|
||||
* - ``torch.quint4x2``
|
||||
- Quantized 4-bit integer (unsigned)
|
||||
|
||||
.. note::
|
||||
|
||||
Unsigned types, except ``uint8``, have limited support in eager mode. They
|
||||
primarily exist to assist usage with ``torch.compile``.
|
||||
|
||||
See :doc:`ROCm precision support <rocm:reference/precision-support>` for the
|
||||
native hardware support of data types.
|
||||
|
||||
Supported modules
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
For a complete and up-to-date list of PyTorch core modules (for example., ``torch``,
|
||||
``torch.nn``, ``torch.cuda``, ``torch.backends.cuda`` and
|
||||
``torch.backends.cudnn``), their descriptions, and usage, please refer directly
|
||||
to the `official PyTorch documentation <https://pytorch.org/docs/stable/index.html>`_.
|
||||
|
||||
Core PyTorch functionality on ROCm includes tensor operations, neural network
|
||||
layers, automatic differentiation, distributed training, mixed-precision
|
||||
training, compilation features, and domain-specific libraries for audio, vision,
|
||||
text processing, and more.
|
||||
|
||||
Supported domain libraries
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
PyTorch offers specialized `domain libraries <https://pytorch.org/domains/>`_ with
|
||||
GPU acceleration that build on its core features to support specific application
|
||||
areas. The table below lists the PyTorch domain libraries that are compatible
|
||||
with ROCm.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Description
|
||||
|
||||
* - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
|
||||
- Audio and signal processing library for PyTorch. Provides utilities for
|
||||
audio I/O, signal and data processing functions, datasets, model
|
||||
implementations, and application components for audio and speech
|
||||
processing tasks.
|
||||
|
||||
**Note:** To ensure GPU-acceleration with ``torchaudio.transforms``,
|
||||
you need to explicitly move audio data (waveform tensor) to GPU using
|
||||
``.to('cuda')``.
|
||||
|
||||
* - `torchtune <https://meta-pytorch.org/torchtune/stable/index.html>`_
|
||||
- PyTorch-native library designed for fine-tuning large language models
|
||||
(LLMs). Provides supports the full fine-tuning workflow and offers
|
||||
compatibility with popular production inference systems.
|
||||
|
||||
**Note:** Only official release exists.
|
||||
|
||||
* - `torchvision <https://docs.pytorch.org/vision/stable/index.html>`_
|
||||
- Computer vision library that is part of the PyTorch project. Provides
|
||||
popular datasets, model architectures, and common image transformations
|
||||
for computer vision applications.
|
||||
|
||||
* - `torchdata <https://meta-pytorch.org/data/beta/index.html#torchdata>`_
|
||||
- Beta library of common modular data loading primitives for easily
|
||||
constructing flexible and performant data pipelines, with features still
|
||||
in prototype stage.
|
||||
|
||||
* - `torchrec <https://meta-pytorch.org/torchrec/>`_
|
||||
- PyTorch domain library for common sparsity and parallelism primitives
|
||||
needed for large-scale recommender systems, enabling authors to train
|
||||
models with large embedding tables shared across many GPUs.
|
||||
|
||||
**Note:** ``torchrec`` does not implement ROCm-specific kernels. ROCm
|
||||
acceleration is provided through the underlying PyTorch framework and
|
||||
ROCm library integration.
|
||||
|
||||
* - `torchserve <https://docs.pytorch.org/serve/>`_
|
||||
- Performant, flexible and easy-to-use tool for serving PyTorch models in
|
||||
production, providing features for model management, batch processing,
|
||||
and scalable deployment.
|
||||
|
||||
**Note:** `torchserve <https://docs.pytorch.org/serve/>`_ is no longer
|
||||
actively maintained. Last official release is sent out with PyTorch 2.4.
|
||||
|
||||
* - `torchrl <https://docs.pytorch.org/rl/stable/index.html>`_
|
||||
- Open-source, Python-first Reinforcement Learning library for PyTorch
|
||||
with a focus on high modularity and good runtime performance, providing
|
||||
low and high-level RL abstractions and reusable functionals for cost
|
||||
functions, returns, and data processing.
|
||||
|
||||
**Note:** Only official release exists.
|
||||
|
||||
* - `tensordict <https://docs.pytorch.org/tensordict/stable/index.html>`_
|
||||
- Dictionary-like class that simplifies operations on batches of tensors,
|
||||
enhancing code readability, compactness, and modularity by abstracting
|
||||
tailored operations and reducing errors through automatic operation
|
||||
dispatching.
|
||||
|
||||
**Note:** Only official release exists.
|
||||
|
||||
Key features and enhancements for PyTorch 2.9 with ROCm 7.1.1
|
||||
================================================================================
|
||||
- Scaled Dot Product Attention (SDPA) upgraded to use AOTriton version 0.11b.
|
||||
|
||||
- Default hipBLASLt support enabled for gfx908 architecture on ROCm 6.3 and later.
|
||||
|
||||
- MIOpen now supports channels last memory format for 3D convolutions and batch normalization.
|
||||
|
||||
- NHWC convolution operations in MIOpen optimized by eliminating unnecessary transpose operations.
|
||||
|
||||
- Improved tensor.item() performance by removing redundant synchronization.
|
||||
|
||||
- Enhanced performance for element-wise operations and reduction kernels.
|
||||
|
||||
- Added support for grouped GEMM operations through fbgemm_gpu generative AI components.
|
||||
|
||||
- Resolved device error in Inductor when using CUDA graph trees with HIP.
|
||||
|
||||
- Corrected logsumexp scaling in AOTriton-based SDPA implementation.
|
||||
|
||||
- Added stream graph capture status validation in memory copy synchronization functions.
|
||||
|
||||
Key features and enhancements for PyTorch 2.8 with ROCm 7.1
|
||||
================================================================================
|
||||
|
||||
- MIOpen deep learning optimizations: Further optimized NHWC BatchNorm feature.
|
||||
|
||||
- Added float8 support for the DeepSpeed extension, allowing for decreased
|
||||
memory footprint and increased throughput in training and inference workloads.
|
||||
|
||||
- ``torch.nn.functional.scaled_dot_product_attention`` now calling optimized
|
||||
flash attention kernel automatically.
|
||||
|
||||
Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
================================================================================
|
||||
|
||||
- Enhanced TunableOp framework: Introduces ``tensorfloat32`` support for
|
||||
TunableOp operations, improved offline tuning for ScaledGEMM operations,
|
||||
submatrix offline tuning capabilities, and better logging for BLAS operations
|
||||
without bias vectors.
|
||||
|
||||
- Expanded GPU architecture support: Provides optimized support for newer GPU
|
||||
architectures, including gfx1200 and gfx1201 with preferred hipBLASLt backend
|
||||
selection, along with improvements for gfx950 and gfx1100 Series GPUs.
|
||||
|
||||
- Advanced Triton Integration: AOTriton 0.10b introduces official support for
|
||||
gfx950 and gfx1201, along with experimental support for gfx1101, gfx1151,
|
||||
gfx1150, and gfx1200.
|
||||
|
||||
- Improved element-wise kernel performance: Delivers enhanced vectorized
|
||||
element-wise kernels with better support for heterogeneous tensor types and
|
||||
optimized input vectorization for tensors with mixed data types.
|
||||
|
||||
- MIOpen deep learning optimizations: Enables NHWC BatchNorm by default on
|
||||
ROCm 7.0+, provides ``maxpool`` forward and backward performance improvements
|
||||
targeting ResNet scenarios, and includes updated launch configurations for
|
||||
better performance.
|
||||
|
||||
- Enhanced memory and tensor operations: Features fixes for in-place ``aten``
|
||||
sum operations with specialized templated kernels, improved 3D tensor
|
||||
performance with NHWC format, and better handling of memory-bound matrix
|
||||
multiplication operations.
|
||||
|
||||
- Robust testing and quality improvements: Includes comprehensive test suite
|
||||
updates with improved tolerance handling for Navi3x architectures, generalized
|
||||
ROCm-specific test conditions, and enhanced unit test coverage for Flash
|
||||
Attention and Memory Efficient operations.
|
||||
|
||||
- Composable Kernel (CK) updates: Features updated CK submodule integration with
|
||||
the latest optimizations and performance improvements for core mathematical
|
||||
operations.
|
||||
|
||||
- Development and debugging enhancements: Includes improved source handling for
|
||||
dynamic compilation, better error handling for atomic operations, and enhanced
|
||||
state checking for trace operations.
|
||||
|
||||
- Integrate APEX fused layer normalization, which can have positive impact on
|
||||
text-to-video models.
|
||||
|
||||
- Integrate APEX distributed fused LAMB and distributed fused ADAM, which can
|
||||
have positive impact on BERT-L and Llama2-SFT.
|
||||
|
||||
- FlashAttention v3 has been integrated for AMD GPUs.
|
||||
|
||||
- `Pytorch C++ extensions <https://pytorch.org/tutorials/advanced/cpp_extension.html>`_
|
||||
provide a mechanism for compiling custom operations that can be used during
|
||||
network training or inference. For AMD platforms, ``amdclang++`` has been
|
||||
validated as the supported compiler for building these extensions.
|
||||
|
||||
Known issues and notes for PyTorch 2.7/2.8 with ROCm 7.0 and ROCm 7.1
|
||||
================================================================================
|
||||
|
||||
- The ``matmul.allow_fp16_reduced_precision_reduction`` and
|
||||
``matmul.allow_bf16_reduced_precision_reduction`` options under
|
||||
``torch.backends.cuda`` are not supported. As a result,
|
||||
reduced-precision reductions using FP16 or BF16 accumulation types are not
|
||||
available.
|
||||
@@ -1,114 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Ray compatibility
|
||||
:keywords: GPU, Ray, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
Ray compatibility
|
||||
*******************************************************************************
|
||||
|
||||
Ray is a unified framework for scaling AI and Python applications from your laptop
|
||||
to a full cluster, without changing your code. Ray consists of `a core distributed
|
||||
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`__ and a set of
|
||||
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`__ for
|
||||
simplifying machine learning computations.
|
||||
|
||||
Ray is a general-purpose framework that runs many types of workloads efficiently.
|
||||
Any Python application can be scaled with Ray, without extra infrastructure.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of Ray is maintained in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`__ repository, which differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`__ upstream repository.
|
||||
|
||||
- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`,
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
.. _ray-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Ray
|
||||
- Pytorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.51.1_rocm7.0.0_ubuntu22.04_py3.12_pytorch2.9.0/images/sha256-a02f6766b4ba406f88fd7e85707ec86c04b569834d869a08043ec9bcbd672168"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.51.1 <https://github.com/ROCm/ray/tree/release/2.51.1>`__
|
||||
- 2.9.0a0+git1c57644
|
||||
- 22.04
|
||||
- `3.12.12 <https://www.python.org/downloads/release/python-31212/>`__
|
||||
- MI300X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`__
|
||||
- 2.6.0+git684f6f2
|
||||
- 24.04
|
||||
- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- MI300X, MI210
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm
|
||||
Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog provides an overview of Volcano Engine Reinforcement Learning (verl)
|
||||
for large language models (LLMs) and discusses its benefits in large-scale
|
||||
reinforcement learning from human feedback (RLHF). It uses Ray as part of a
|
||||
hybrid orchestration engine to schedule and coordinate training and inference
|
||||
tasks in parallel, enabling optimized resource utilization and potential overlap
|
||||
between these phases. This dynamic resource allocation strategy significantly
|
||||
improves overall system efficiency. The blog presents verl’s performance results,
|
||||
focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X
|
||||
GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and
|
||||
accelerate your RLHF training with ROCm-optimized performance.
|
||||
|
||||
* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
|
||||
blog post describes key use cases such as training and inference for large language models (LLMs),
|
||||
model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale
|
||||
workloads using Ray in the ROCm environment.
|
||||
|
||||
For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support
|
||||
topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__
|
||||
of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/ray-history` to find documentation for previous releases
|
||||
of the ``ROCm/ray`` Docker image.
|
||||
@@ -1,116 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Stanford Megatron-LM compatibility
|
||||
:keywords: Stanford, Megatron-LM, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
********************************************************************************
|
||||
Stanford Megatron-LM compatibility
|
||||
********************************************************************************
|
||||
|
||||
Stanford Megatron-LM is a large-scale language model training framework developed
|
||||
by NVIDIA at `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_.
|
||||
It is designed to train massive transformer-based language models efficiently by model
|
||||
and data parallelism.
|
||||
|
||||
It provides efficient tensor, pipeline, and sequence-based model parallelism for
|
||||
pre-training transformer-based language models such as GPT (Decoder Only), BERT
|
||||
(Encoder Only), and T5 (Encoder-Decoder).
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of Stanford Megatron-LM is maintained in the official `https://github.com/ROCm/Stanford-Megatron-LM
|
||||
<https://github.com/ROCm/Stanford-Megatron-LM>`__ repository, which differs from the
|
||||
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
|
||||
|
||||
- To get started and install Stanford Megatron-LM on ROCm, use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>`,
|
||||
which includes ROCm, Stanford Megatron-LM, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__
|
||||
for additional context.
|
||||
|
||||
.. _megatron-lm-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Stanford Megatron-LM
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i> rocm/stanford-megatron-lm</a>
|
||||
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- MI300X
|
||||
|
||||
Supported models and features with ROCm 6.3.0
|
||||
================================================================================
|
||||
|
||||
This section details models & features that are supported by the ROCm version on Stanford Megatron-LM.
|
||||
|
||||
Models:
|
||||
|
||||
* BERT
|
||||
* GPT
|
||||
* T5
|
||||
* ICT
|
||||
|
||||
Features:
|
||||
|
||||
* Distributed Pre-training
|
||||
* Activation Checkpointing and Recomputation
|
||||
* Distributed Optimizer
|
||||
* Mixture-of-Experts
|
||||
|
||||
.. _megatron-lm-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The following blog post mentions Megablocks, but you can run Stanford Megatron-LM with the same steps to pre-process datasets on AMD GPUs:
|
||||
|
||||
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
|
||||
blog post guides how to leverage the ROCm platform for pre-training using the
|
||||
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
|
||||
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
|
||||
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
|
||||
training. The guide provides step-by-step instructions for setting up the environment,
|
||||
including cloning the repository, building the Docker image, and running the training container.
|
||||
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
|
||||
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
|
||||
training workflows for large-scale transformer models.
|
||||
|
||||
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
@@ -1,435 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
TensorFlow compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
|
||||
solving machine learning, deep learning, and AI problems. It can solve many
|
||||
problems across different sectors and industries, but primarily focuses on
|
||||
neural network training and inference. It is one of the most popular deep
|
||||
learning frameworks and is very active in open-source development.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of TensorFlow is maintained in the official `https://github.com/ROCm/tensorflow-upstream
|
||||
<https://github.com/ROCm/tensorflow-upstream>`__ repository, which differs from the
|
||||
`https://github.com/tensorflow/tensorflow <https://github.com/tensorflow/tensorflow>`__ upstream repository.
|
||||
|
||||
- To get started and install TensorFlow on ROCm, use the prebuilt :ref:`Docker images <tensorflow-docker-compat>`,
|
||||
which include ROCm, TensorFlow, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
|
||||
includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
|
||||
<http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
|
||||
fixes, updates, and support for the latest ROCm versions.
|
||||
|
||||
.. _tensorflow-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD provides preconfigured Docker images with TensorFlow and the ROCm backend.
|
||||
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/tensorflow>`__ and are the
|
||||
recommended way to get started with deep learning with TensorFlow on ROCm.
|
||||
|
||||
To find the right image tag, see the :ref:`TensorFlow on ROCm installation
|
||||
documentation <rocm-install-on-linux:tensorflow-docker-support>` for a list of
|
||||
available ``rocm/tensorflow`` images.
|
||||
|
||||
|
||||
Critical ROCm libraries for TensorFlow
|
||||
===============================================================================
|
||||
|
||||
TensorFlow depends on multiple components and the supported features of those
|
||||
components can affect the TensorFlow ROCm supported feature set. The versions
|
||||
in the following table refer to the first TensorFlow version where the ROCm
|
||||
library was introduced as a dependency. The versions described
|
||||
are available in ROCm :version:`rocm_version`.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25, 10, 35, 30
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- Purpose
|
||||
- Used in
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
- Accelerates operations like ``tf.matmul``, ``tf.linalg.matmul``, and
|
||||
other matrix multiplications commonly used in neural network layers.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- Extends hipBLAS with additional optimizations like fused kernels and
|
||||
integer tensor cores.
|
||||
- Optimizes matrix multiplications and linear algebra operations used in
|
||||
layers like dense, convolutional, and RNNs in TensorFlow.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`__
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
- Supports operations like ``tf.reduce_sum``, ``tf.cumsum``, ``tf.sort``
|
||||
and other tensor operations in TensorFlow, especially those involving
|
||||
scanning, sorting, and filtering.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`__
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Accelerates Fast Fourier Transforms (FFT) for signal processing tasks.
|
||||
- Used for operations like signal processing, image filtering, and
|
||||
certain types of neural networks requiring FFT-based transformations.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`__
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated direct linear solvers for dense and sparse
|
||||
systems.
|
||||
- Optimizes linear algebra functions such as solving systems of linear
|
||||
equations, often used in optimization and training tasks.
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`__
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Optimizes sparse matrix operations for efficient computations on sparse
|
||||
data.
|
||||
- Accelerates sparse matrix operations in models with sparse weight
|
||||
matrices or activations, commonly used in neural networks.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`__
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Provides optimized deep learning primitives such as convolutions,
|
||||
pooling,
|
||||
normalization, and activation functions.
|
||||
- Speeds up convolutional neural networks (CNNs) and other layers. Used
|
||||
in TensorFlow for layers like ``tf.nn.conv2d``, ``tf.nn.relu``, and
|
||||
``tf.nn.lstm_cell``.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`__
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
- Distributed data parallel training (``tf.distribute.MirroredStrategy``).
|
||||
Handles communication in multi-GPU setups.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`__
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Reduction operations like ``tf.reduce_sum``, ``tf.cumsum`` for computing
|
||||
the cumulative sum of elements along a given axis or ``tf.unique`` to
|
||||
finds unique elements in a tensor can use rocThrust.
|
||||
|
||||
Supported and unsupported features
|
||||
===============================================================================
|
||||
|
||||
The following section maps supported data types and GPU-accelerated TensorFlow
|
||||
features to their minimum supported ROCm and TensorFlow versions.
|
||||
|
||||
Data types
|
||||
---------------
|
||||
|
||||
The data type of a tensor is specified using the ``dtype`` attribute or
|
||||
argument, and TensorFlow supports a wide range of data types for different use
|
||||
cases.
|
||||
|
||||
The basic, single data types of `tf.dtypes <https://www.tensorflow.org/api_docs/python/tf/dtypes>`__
|
||||
are as follows:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Data type
|
||||
- Description
|
||||
- Since TensorFlow
|
||||
- Since ROCm
|
||||
* - ``bfloat16``
|
||||
- 16-bit bfloat (brain floating point).
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``bool``
|
||||
- Boolean.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``complex128``
|
||||
- 128-bit complex.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``complex64``
|
||||
- 64-bit complex.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``double``
|
||||
- 64-bit (double precision) floating-point.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``float16``
|
||||
- 16-bit (half precision) floating-point.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``float32``
|
||||
- 32-bit (single precision) floating-point.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``float64``
|
||||
- 64-bit (double precision) floating-point.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``half``
|
||||
- 16-bit (half precision) floating-point.
|
||||
- 2.0.0
|
||||
- 2.0
|
||||
* - ``int16``
|
||||
- Signed 16-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``int32``
|
||||
- Signed 32-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``int64``
|
||||
- Signed 64-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``int8``
|
||||
- Signed 8-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``qint16``
|
||||
- Signed quantized 16-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``qint32``
|
||||
- Signed quantized 32-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``qint8``
|
||||
- Signed quantized 8-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``quint16``
|
||||
- Unsigned quantized 16-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``quint8``
|
||||
- Unsigned quantized 8-bit integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``resource``
|
||||
- Handle to a mutable, dynamically allocated resource.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``string``
|
||||
- Variable-length string, represented as byte array.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``uint16``
|
||||
- Unsigned 16-bit (word) integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``uint32``
|
||||
- Unsigned 32-bit (dword) integer.
|
||||
- 1.5.0
|
||||
- 1.7
|
||||
* - ``uint64``
|
||||
- Unsigned 64-bit (qword) integer.
|
||||
- 1.5.0
|
||||
- 1.7
|
||||
* - ``uint8``
|
||||
- Unsigned 8-bit (byte) integer.
|
||||
- 1.0.0
|
||||
- 1.7
|
||||
* - ``variant``
|
||||
- Data of arbitrary type (known at runtime).
|
||||
- 1.4.0
|
||||
- 1.7
|
||||
|
||||
Features
|
||||
---------------
|
||||
|
||||
This table provides an overview of key features in TensorFlow and their
|
||||
availability in ROCm.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Description
|
||||
- Since TensorFlow
|
||||
- Since ROCm
|
||||
* - ``tf.linalg`` (Linear Algebra)
|
||||
- Operations for matrix and tensor computations, such as
|
||||
``tf.linalg.matmul`` (matrix multiplication), ``tf.linalg.inv``
|
||||
(matrix inversion) and ``tf.linalg.cholesky`` (Cholesky decomposition).
|
||||
These leverage GPUs for high-performance linear algebra operations.
|
||||
- 1.4
|
||||
- 1.8.2
|
||||
* - ``tf.nn`` (Neural Network Operations)
|
||||
- GPU-accelerated building blocks for deep learning models, such as 2D
|
||||
convolutions with ``tf.nn.conv2d``, max pooling operations with
|
||||
``tf.nn.max_pool``, activation functions like ``tf.nn.relu`` or softmax
|
||||
for output layers with ``tf.nn.softmax``.
|
||||
- 1.0
|
||||
- 1.8.2
|
||||
* - ``tf.image`` (Image Processing)
|
||||
- GPU-accelerated functions for image preprocessing and augmentations,
|
||||
such as resize images with ``tf.image.resize``, flip images horizontally
|
||||
with ``tf.image.flip_left_right`` and adjust image brightness randomly
|
||||
with ``tf.image.random_brightness``.
|
||||
- 1.1
|
||||
- 1.8.2
|
||||
* - ``tf.keras`` (High-Level API)
|
||||
- GPU acceleration for Keras layers and models, including dense layers
|
||||
(``tf.keras.layers.Dense``), convolutional layers
|
||||
(``tf.keras.layers.Conv2D``) and recurrent layers
|
||||
(``tf.keras.layers.LSTM``).
|
||||
- 1.4
|
||||
- 1.8.2
|
||||
* - ``tf.math`` (Mathematical Operations)
|
||||
- GPU-accelerated mathematical operations, such as sum across dimensions
|
||||
with ``tf.math.reduce_sum``, elementwise exponentiation with
|
||||
``tf.math.exp`` and sigmoid activation (``tf.math.sigmoid``).
|
||||
- 1.5
|
||||
- 1.8.2
|
||||
* - ``tf.signal`` (Signal Processing)
|
||||
- Functions for spectral analysis and signal transformations.
|
||||
- 1.13
|
||||
- 2.1
|
||||
* - ``tf.data`` (Data Input Pipeline)
|
||||
- GPU-accelerated data preprocessing for efficient input pipelines,
|
||||
Prefetching with ``tf.data.experimental.AUTOTUNE``. GPU-enabled
|
||||
transformations like map and batch.
|
||||
- 1.4
|
||||
- 1.8.2
|
||||
* - ``tf.distribute`` (Distributed Training)
|
||||
- Enabling to scale computations across multiple devices on a single
|
||||
machine or across multiple machines.
|
||||
- 1.13
|
||||
- 2.1
|
||||
* - ``tf.random`` (Random Number Generation)
|
||||
- GPU-accelerated random number generation
|
||||
- 1.12
|
||||
- 1.9.2
|
||||
* - ``tf.TensorArray`` (Dynamic Array Operations)
|
||||
- Enables dynamic tensor manipulation on GPUs.
|
||||
- 1.0
|
||||
- 1.8.2
|
||||
* - ``tf.sparse`` (Sparse Tensor Operations)
|
||||
- GPU-accelerated sparse matrix manipulations.
|
||||
- 1.9
|
||||
- 1.9.0
|
||||
* - ``tf.experimental.numpy``
|
||||
- GPU-accelerated NumPy-like API for numerical computations.
|
||||
- 2.4
|
||||
- 4.1.1
|
||||
* - ``tf.RaggedTensor``
|
||||
- Handling of variable-length sequences and ragged tensors with GPU
|
||||
support.
|
||||
- 1.13
|
||||
- 2.1
|
||||
* - ``tf.function`` with XLA (Accelerated Linear Algebra)
|
||||
- Enable GPU-accelerated functions in optimization.
|
||||
- 1.14
|
||||
- 2.4
|
||||
* - ``tf.quantization``
|
||||
- Quantized operations for inference, accelerated on GPUs.
|
||||
- 1.12
|
||||
- 1.9.2
|
||||
|
||||
Distributed library features
|
||||
-----------------------------------
|
||||
|
||||
Enables developers to scale computations across multiple devices on a single machine or
|
||||
across multiple machines.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Feature
|
||||
- Description
|
||||
- Since TensorFlow
|
||||
- Since ROCm
|
||||
* - ``MultiWorkerMirroredStrategy``
|
||||
- Synchronous training across multiple workers using mirrored variables.
|
||||
- 2.0
|
||||
- 3.0
|
||||
* - ``MirroredStrategy``
|
||||
- Synchronous training across multiple GPUs on one machine.
|
||||
- 1.5
|
||||
- 2.5
|
||||
* - ``TPUStrategy``
|
||||
- Efficiently trains models on Google TPUs.
|
||||
- 1.9
|
||||
- ❌
|
||||
* - ``ParameterServerStrategy``
|
||||
- Asynchronous training using parameter servers for variable management.
|
||||
- 2.1
|
||||
- 4.0
|
||||
* - ``CentralStorageStrategy``
|
||||
- Keeps variables on a single device and performs computation on multiple
|
||||
devices.
|
||||
- 2.3
|
||||
- 4.1
|
||||
* - ``CollectiveAllReduceStrategy``
|
||||
- Synchronous training across multiple devices and hosts.
|
||||
- 1.14
|
||||
- 3.5
|
||||
* - Distribution Strategies API
|
||||
- High-level API to simplify distributed training configuration and
|
||||
execution.
|
||||
- 1.10
|
||||
- 3.0
|
||||
|
||||
Unsupported TensorFlow features
|
||||
===============================================================================
|
||||
|
||||
The following are GPU-accelerated TensorFlow features not currently supported by
|
||||
ROCm.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Feature
|
||||
- Description
|
||||
- Since TensorFlow
|
||||
* - Mixed Precision with TF32
|
||||
- Mixed precision with TF32 is used for matrix multiplications,
|
||||
convolutions, and other linear algebra operations, particularly in
|
||||
deep learning workloads like CNNs and transformers.
|
||||
- 2.4
|
||||
* - ``tf.distribute.TPUStrategy``
|
||||
- Efficiently trains models on Google TPUs.
|
||||
- 1.9
|
||||
|
||||
Use cases and recommendations
|
||||
===============================================================================
|
||||
|
||||
* The `Training a Neural Collaborative Filtering (NCF) Recommender on an AMD
|
||||
GPU <https://rocm.blogs.amd.com/artificial-intelligence/ncf/README.html>`__
|
||||
blog post discusses training an NCF recommender system using TensorFlow. It
|
||||
explains how NCF improves traditional collaborative filtering methods by
|
||||
leveraging neural networks to model non-linear user-item interactions. The
|
||||
post outlines the implementation using the recommenders library, focusing on
|
||||
the use of implicit data (for example, user interactions like viewing or
|
||||
purchasing) and how it addresses challenges like the lack of negative values.
|
||||
|
||||
* The `Creating a PyTorch/TensorFlow code environment on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/software-tools-optimization/pytorch-tensorflow-env/README.html>`__
|
||||
blog post provides instructions for creating a machine learning environment
|
||||
for PyTorch and TensorFlow on AMD GPUs using ROCm. It covers steps like
|
||||
installing the libraries, cloning code repositories, installing dependencies,
|
||||
and troubleshooting potential issues with CUDA-based code. Additionally, it
|
||||
explains how to HIPify code (port CUDA code to HIP) and manage Docker images
|
||||
for a better experience on AMD GPUs. This guide aims to help data scientists
|
||||
and ML practitioners adapt their code for AMD GPUs.
|
||||
|
||||
For more use cases and recommendations, see the `ROCm Tensorflow blog posts <https://rocm.blogs.amd.com/blog/tag/tensorflow.html>`__.
|
||||
@@ -1,118 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: verl compatibility
|
||||
:keywords: GPU, verl, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
verl compatibility
|
||||
*******************************************************************************
|
||||
|
||||
Volcano Engine Reinforcement Learning for LLMs (`verl <https://verl.readthedocs.io/en/latest/>`__)
|
||||
is a reinforcement learning framework designed for large language models (LLMs).
|
||||
verl offers a scalable, open-source fine-tuning solution by using a hybrid programming model
|
||||
that makes it easy to define and run complex post-training dataflows efficiently.
|
||||
|
||||
Its modular APIs separate computation from data, allowing smooth integration with other frameworks.
|
||||
It also supports flexible model placement across GPUs for efficient scaling on different cluster sizes.
|
||||
verl achieves high training and generation throughput by building on existing LLM frameworks.
|
||||
Its 3D-HybridEngine reduces memory use and communication overhead when switching between training
|
||||
and inference, improving overall performance.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of verl is maintained in the official `https://github.com/ROCm/verl
|
||||
<https://github.com/ROCm/verl>`__ repository, which differs from the
|
||||
`https://github.com/volcengine/verl <https://github.com/volcengine/verl>`__ upstream repository.
|
||||
|
||||
- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`,
|
||||
which includes ROCm, verl, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__
|
||||
for additional context.
|
||||
|
||||
.. _verl-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest verl version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- verl
|
||||
- Ubuntu
|
||||
- PyTorch
|
||||
- Python
|
||||
- vllm
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.6.0.amd0_rocm7.0_vllm0.11.0.dev/images/sha256-f70a3ebc94c1f66de42a2fcc3f8a6a8d6d0881eb0e65b6958d7d6d24b3eecb0d"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `0.6.0 <https://github.com/volcengine/verl/releases/tag/v0.6.0>`__
|
||||
- 22.04
|
||||
- `2.9.0 <https://github.com/ROCm/pytorch/tree/release/2.9-rocm7.x-gfx115x>`__
|
||||
- `3.12.11 <https://www.python.org/downloads/release/python-31211/>`__
|
||||
- `0.11.0 <https://github.com/vllm-project/vllm/releases/tag/v0.11.0>`__
|
||||
- MI300X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__
|
||||
- `0.3.0.post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`__
|
||||
- 20.04
|
||||
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
|
||||
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`__
|
||||
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`__
|
||||
- MI300X
|
||||
|
||||
.. _verl-supported_features:
|
||||
|
||||
Supported modules with verl on ROCm
|
||||
===============================================================================
|
||||
|
||||
The following GPU-accelerated modules are supported with verl on ROCm:
|
||||
|
||||
- ``FSDP``: Training engine
|
||||
- ``vllm``: Inference engine
|
||||
|
||||
.. _verl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The benefits of verl in large-scale reinforcement learning from human feedback
|
||||
(RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD
|
||||
GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog. The blog post outlines how the Volcano Engine Reinforcement Learning
|
||||
(verl) framework integrates with the AMD ROCm platform to optimize training on
|
||||
AMD Instinct™ GPUs. The guide details the process of building a Docker image,
|
||||
setting up single-node and multi-node training environments, and highlights
|
||||
performance benchmarks demonstrating improved throughput and convergence accuracy.
|
||||
This resource serves as a comprehensive starting point for deploying verl on AMD GPUs,
|
||||
facilitating efficient RLHF training workflows.
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/verl-history` to find documentation for previous releases
|
||||
of the ``ROCm/verl`` Docker image.
|
||||
156
docs/conceptual/More-about-how-ROCm-uses-PCIe-Atomics.rst
Normal file
@@ -0,0 +1,156 @@
|
||||
.. meta::
|
||||
:description: How ROCm uses PCIe atomics
|
||||
:keywords: PCIe, PCIe atomics, atomics, BAR memory, AMD, ROCm
|
||||
|
||||
*****************************************************************************
|
||||
How ROCm uses PCIe atomics
|
||||
*****************************************************************************
|
||||
|
||||
ROCm PCIe feature and overview of BAR memory
|
||||
================================================================
|
||||
|
||||
ROCm is an extension of HSA platform architecture, so it shares the queuing model, memory model,
|
||||
signaling and synchronization protocols. Platform atomics are integral to perform queuing and
|
||||
signaling memory operations where there may be multiple-writers across CPU and GPU agents.
|
||||
|
||||
The full list of HSA system architecture platform requirements are here:
|
||||
`HSA Sys Arch Features <http://hsafoundation.com/wp-content/uploads/2021/02/HSA-SysArch-1.2.pdf>`_.
|
||||
|
||||
AMD ROCm Software uses the new PCI Express 3.0 (Peripheral Component Interconnect Express [PCIe]
|
||||
3.0) features for atomic read-modify-write transactions which extends inter-processor synchronization
|
||||
mechanisms to IO to support the defined set of HSA capabilities needed for queuing and signaling
|
||||
memory operations.
|
||||
|
||||
The new PCIe atomic operations operate as completers for ``CAS`` (Compare and Swap), ``FetchADD``,
|
||||
``SWAP`` atomics. The atomic operations are initiated by the I/O device which support 32-bit, 64-bit and
|
||||
128-bit operand which target address have to be naturally aligned to operation sizes.
|
||||
|
||||
For ROCm the Platform atomics are used in ROCm in the following ways:
|
||||
|
||||
* Update HSA queue's read_dispatch_id: 64 bit atomic add used by the command processor on the
|
||||
GPU agent to update the packet ID it processed.
|
||||
* Update HSA queue's write_dispatch_id: 64 bit atomic add used by the CPU and GPU agent to
|
||||
support multi-writer queue insertions.
|
||||
* Update HSA Signals -- 64bit atomic ops are used for CPU & GPU synchronization.
|
||||
|
||||
The PCIe 3.0 atomic operations feature allows atomic transactions to be requested by, routed through
|
||||
and completed by PCIe components. Routing and completion does not require software support.
|
||||
Component support for each is detectable via the Device Capabilities 2 (DevCap2) register. Upstream
|
||||
bridges need to have atomic operations routing enabled or the atomic operations will fail even though
|
||||
PCIe endpoint and PCIe I/O devices has the capability to atomic operations.
|
||||
|
||||
To do atomic operations routing capability between two or more Root Ports, each associated Root Port
|
||||
must indicate that capability via the atomic operations routing supported bit in the DevCap2 register.
|
||||
|
||||
If your system has a PCIe Express Switch it needs to support atomic operations routing. Atomic
|
||||
operations requests are permitted only if a component's ``DEVCTL2.ATOMICOP_REQUESTER_ENABLE``
|
||||
field is set. These requests can only be serviced if the upstream components support atomic operation
|
||||
completion and/or routing to a component which does. Atomic operations routing support=1, routing
|
||||
is supported; atomic operations routing support=0, routing is not supported.
|
||||
|
||||
An atomic operation is a non-posted transaction supporting 32-bit and 64-bit address formats, there
|
||||
must be a response for Completion containing the result of the operation. Errors associated with the
|
||||
operation (uncorrectable error accessing the target location or carrying out the atomic operation) are
|
||||
signaled to the requester by setting the Completion Status field in the completion descriptor, they are
|
||||
set to to Completer Abort (CA) or Unsupported Request (UR).
|
||||
|
||||
To understand more about how PCIe atomic operations work, see
|
||||
`PCIe atomics <https://pcisig.com/specifications/pciexpress/specifications/ECN_Atomic_Ops_080417.pdf>`_
|
||||
|
||||
`Linux Kernel Patch to pci_enable_atomic_request <https://patchwork.kernel.org/project/linux-pci/patch/1443110390-4080-1-git-send-email-jay@jcornwall.me/>`_
|
||||
|
||||
There are also a number of papers which talk about these new capabilities:
|
||||
|
||||
* `Atomic Read Modify Write Primitives by Intel <https://www.intel.es/content/dam/doc/white-paper/atomic-read-modify-write-primitives-i-o-devices-paper.pdf>`_
|
||||
* `PCI express 3 Accelerator White paper by Intel <https://www.intel.sg/content/dam/doc/white-paper/pci-express3-accelerator-white-paper.pdf>`_
|
||||
* `PCIe Generation 4 Base Specification includes atomic operations <https://astralvx.com/storage/2020/11/PCI_Express_Base_4.0_Rev0.3_February19-2014.pdf>`_
|
||||
* `Xilinx PCIe Ultrascale White paper <https://docs.xilinx.com/v/u/8OZSA2V1b1LLU2rRCDVGQw>`_
|
||||
|
||||
Other I/O devices with PCIe atomics support:
|
||||
|
||||
* Mellanox ConnectX-5 InfiniBand Card
|
||||
* Cray Aries Interconnect
|
||||
* Xilinx 7 Series Devices
|
||||
|
||||
Future bus technology with richer I/O atomics operation Support
|
||||
|
||||
* GenZ
|
||||
|
||||
New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPUs
|
||||
with PCIe Generation 3.0 support.
|
||||
|
||||
* Mellanox Bluefield SOC
|
||||
* Cavium Thunder X2
|
||||
|
||||
In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU
|
||||
originates two writes to two different targets:
|
||||
|
||||
* Write to another GPU memory
|
||||
* Write to system memory to indicate transfer complete
|
||||
|
||||
They are routed off to different ends of the computer but we want to make sure the write to system
|
||||
memory to indicate transfer complete occurs AFTER P2P write to GPU has complete.
|
||||
|
||||
BAR memory overview
|
||||
----------------------------------------------------------------------------------------------------
|
||||
On a Xeon E5 based system in the BIOS we can turn on above 4GB PCIe addressing, if so he need to set
|
||||
memory-mapped input/output (MMIO) base address (MMIOH base) and range (MMIO high size) in the BIOS.
|
||||
|
||||
In the Supermicro system in the system bios you need to see the following
|
||||
|
||||
* Advanced->PCIe/PCI/PnP configuration-\> Above 4G Decoding = Enabled
|
||||
* Advanced->PCIe/PCI/PnP Configuration-\>MMIOH Base = 512G
|
||||
* Advanced->PCIe/PCI/PnP Configuration-\>MMIO High Size = 256G
|
||||
|
||||
When we support Large Bar Capability there is a Large Bar VBIOS which also disable the IO bar.
|
||||
|
||||
For GFX9 and Vega10 which have Physical Address up 44 bit and 48 bit Virtual address.
|
||||
|
||||
* BAR0-1 registers: 64bit, prefetchable, GPU memory. 8GB or 16GB depending on Vega10 SKU. Must
|
||||
be placed < 2^44 to support P2P access from other Vega10.
|
||||
* BAR2-3 registers: 64bit, prefetchable, Doorbell. Must be placed \< 2^44 to support P2P access from
|
||||
other Vega10.
|
||||
* BAR4 register: Optional, not a boot device.
|
||||
* BAR5 register: 32bit, non-prefetchable, MMIO. Must be placed \< 4GB.
|
||||
|
||||
Here is how our base address register (BAR) works on GFX 8 GPUs with 40 bit Physical Address Limit ::
|
||||
|
||||
11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO
|
||||
Series] (rev c1)
|
||||
|
||||
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0b35
|
||||
|
||||
Flags: bus master, fast devsel, latency 0, IRQ 119
|
||||
|
||||
Memory at bf40000000 (64-bit, prefetchable) [size=256M]
|
||||
|
||||
Memory at bf50000000 (64-bit, prefetchable) [size=2M]
|
||||
|
||||
I/O ports at 3000 [size=256]
|
||||
|
||||
Memory at c7400000 (32-bit, non-prefetchable) [size=256K]
|
||||
|
||||
Expansion ROM at c7440000 [disabled] [size=128K]
|
||||
|
||||
Legend:
|
||||
|
||||
1 : GPU Frame Buffer BAR -- In this example it happens to be 256M, but typically this will be size of the
|
||||
GPU memory (typically 4GB+). This BAR has to be placed \< 2^40 to allow peer-to-peer access from
|
||||
other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed \< 2^44 to allow peer-to-peer
|
||||
access from other GFX9 AMD GPUs.
|
||||
|
||||
2 : Doorbell BAR -- The size of the BAR is typically will be \< 10MB (currently fixed at 2MB) for this
|
||||
generation GPUs. This BAR has to be placed \< 2^40 to allow peer-to-peer access from other current
|
||||
generation AMD GPUs.
|
||||
|
||||
3 : IO BAR -- This is for legacy VGA and boot device support, but since this the GPUs in this project are
|
||||
not VGA devices (headless), this is not a concern even if the SBIOS does not setup.
|
||||
|
||||
4 : MMIO BAR -- This is required for the AMD Driver SW to access the configuration registers. Since the
|
||||
reminder of the BAR available is only 1 DWORD (32bit), this is placed \< 4GB. This is fixed at 256KB.
|
||||
|
||||
5 : Expansion ROM -- This is required for the AMD Driver SW to access the GPU video-bios. This is
|
||||
currently fixed at 128KB.
|
||||
|
||||
For more information, you can review
|
||||
`Overview of Changes to PCI Express 3.0 <https://www.mindshare.com/files/resources/PCIe%203-0.pdf>`_.
|
||||
333
docs/conceptual/ai-migraphx-optimization.md
Normal file
@@ -0,0 +1,333 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="Inference optimization with MIGraphX">
|
||||
<meta name="keywords" content="Inference optimization, MIGraphX, deep-learning, MIGraphX
|
||||
installation, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# Inference optimization with MIGraphX
|
||||
|
||||
The following sections cover inferencing and introduces [MIGraphX](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/).
|
||||
|
||||
## Inference
|
||||
|
||||
The inference is where capabilities learned during deep-learning training are put to work. It refers to using a fully trained neural network to make conclusions (predictions) on unseen data that the model has never interacted with before. Deep-learning inferencing is achieved by feeding new data, such as new images, to the network, giving the Deep Neural Network a chance to classify the image.
|
||||
|
||||
Taking our previous example of MNIST, the DNN can be fed new images of handwritten digit images, allowing the neural network to classify digits. A fully trained DNN should make accurate predictions about what an image represents, and inference cannot happen without training.
|
||||
|
||||
## MIGraphX introduction
|
||||
|
||||
MIGraphX is a graph compiler focused on accelerating the machine-learning inference that can target AMD GPUs and CPUs. MIGraphX accelerates the machine-learning models by leveraging several graph-level transformations and optimizations. These optimizations include:
|
||||
|
||||
* Operator fusion
|
||||
* Arithmetic simplifications
|
||||
* Dead-code elimination
|
||||
* Common subexpression elimination (CSE)
|
||||
* Constant propagation
|
||||
|
||||
After doing all these transformations, MIGraphX emits code for the AMD GPU by calling to MIOpen or rocBLAS or creating HIP kernels for a particular operator. MIGraphX can also target CPUs using DNNL or ZenDNN libraries.
|
||||
|
||||
MIGraphX provides easy-to-use APIs in C++ and Python to import machine models in ONNX or TensorFlow. Users can compile, save, load, and run these models using the MIGraphX C++ and Python APIs. Internally, MIGraphX parses ONNX or TensorFlow models into internal graph representation where each operator in the model gets mapped to an operator within MIGraphX. Each of these operators defines various attributes such as:
|
||||
|
||||
* Number of arguments
|
||||
* Type of arguments
|
||||
* Shape of arguments
|
||||
|
||||
After optimization passes, all these operators get mapped to different kernels on GPUs or CPUs.
|
||||
|
||||
After importing a model into MIGraphX, the model is represented as `migraphx::program`. `migraphx::program` is made up of `migraphx::module`. The program can consist of several modules, but it always has one main_module. Modules are made up of `migraphx::instruction_ref`. Instructions contain the `migraphx::op` and arguments to the operator.
|
||||
|
||||
## Installing MIGraphX
|
||||
|
||||
There are three options to get started with MIGraphX installation. MIGraphX depends on ROCm libraries; assume that the machine has ROCm installed.
|
||||
|
||||
### Option 1: installing binaries
|
||||
|
||||
To install MIGraphX on Debian-based systems like Ubuntu, use the following command:
|
||||
|
||||
```bash
|
||||
sudo apt update && sudo apt install -y migraphx
|
||||
```
|
||||
|
||||
The header files and libraries are installed under `/opt/rocm-\<version\>`, where \<version\> is the ROCm version.
|
||||
|
||||
### Option 2: building from source
|
||||
|
||||
There are two ways to build the MIGraphX sources.
|
||||
|
||||
* [Use the ROCm build tool](https://github.com/ROCm/AMDMIGraphX#use-the-rocm-build-tool-rbuild) - This approach uses `[rbuild](https://github.com/ROCm/rbuild)` to install the prerequisites and build the libraries with just one command.
|
||||
|
||||
or
|
||||
|
||||
* [Use CMake](https://github.com/ROCm/AMDMIGraphX#use-cmake-to-build-migraphx) - This approach uses a script to install the prerequisites, then uses CMake to build the source.
|
||||
|
||||
For detailed steps on building from source and installing dependencies, refer to the following `README` file:
|
||||
|
||||
[https://github.com/ROCm/AMDMIGraphX#building-from-source](https://github.com/ROCm/AMDMIGraphX#building-from-source)
|
||||
|
||||
### Option 3: use docker
|
||||
|
||||
To use Docker, follow these steps:
|
||||
|
||||
1. The easiest way to set up the development environment is to use Docker. To build Docker from scratch, first clone the MIGraphX repository by running:
|
||||
|
||||
```bash
|
||||
git clone --recursive https://github.com/ROCm/AMDMIGraphX
|
||||
```
|
||||
|
||||
2. The repository contains a Dockerfile from which you can build a Docker image as:
|
||||
|
||||
```bash
|
||||
docker build -t migraphx .
|
||||
```
|
||||
|
||||
3. Then to enter the development environment, use Docker run:
|
||||
|
||||
```bash
|
||||
docker run --device='/dev/kfd' --device='/dev/dri' -v=`pwd`:/code/AMDMIGraphX -w /code/AMDMIGraphX --group-add video -it migraphx
|
||||
```
|
||||
|
||||
The Docker image contains all the prerequisites required for the installation, so users can go to the folder `/code/AMDMIGraphX` and follow the steps mentioned in [Option 2: Building from Source](#option-2-building-from-source).
|
||||
|
||||
## MIGraphX example
|
||||
|
||||
MIGraphX provides both C++ and Python APIs. The following sections show examples of both using the Inception v3 model. To walk through the examples, fetch the Inception v3 ONNX model by running the following:
|
||||
|
||||
```py
|
||||
import torch
|
||||
import torchvision.models as models
|
||||
inception = models.inception_v3(pretrained=True)
|
||||
torch.onnx.export(inception,torch.randn(1,3,299,299), "inceptioni1.onnx")
|
||||
```
|
||||
|
||||
This will create `inceptioni1.onnx`, which can be imported in MIGraphX using C++ or Python API.
|
||||
|
||||
### MIGraphX Python API
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. To import the MIGraphX module in Python script, set `PYTHONPATH` to the MIGraphX libraries installation. If binaries are installed using steps mentioned in [Option 1: Installing Binaries](#option-1-installing-binaries), perform the following action:
|
||||
|
||||
```bash
|
||||
export PYTHONPATH=$PYTHONPATH:/opt/rocm/
|
||||
```
|
||||
|
||||
2. The following script shows the usage of Python API to import the ONNX model, compile it, and run inference on it. Set `LD_LIBRARY_PATH` to `/opt/rocm/` if required.
|
||||
|
||||
```py
|
||||
# import migraphx and numpy
|
||||
import migraphx
|
||||
import numpy as np
|
||||
# import and parse inception model
|
||||
model = migraphx.parse_onnx("inceptioni1.onnx")
|
||||
# compile model for the GPU target
|
||||
model.compile(migraphx.get_target("gpu"))
|
||||
# optionally print compiled model
|
||||
model.print()
|
||||
# create random input image
|
||||
input_image = np.random.rand(1, 3, 299, 299).astype('float32')
|
||||
# feed image to model, 'x.1` is the input param name
|
||||
results = model.run({'x.1': input_image})
|
||||
# get the results back
|
||||
result_np = np.array(results[0])
|
||||
# print the inferred class of the input image
|
||||
print(np.argmax(result_np))
|
||||
```
|
||||
|
||||
Find additional examples of Python API in the `/examples` directory of the MIGraphX repository.
|
||||
|
||||
## MIGraphX C++ API
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. The following is a minimalist example that shows the usage of MIGraphX C++ API to load ONNX file, compile it for the GPU, and run inference on it. To use MIGraphX C++ API, you only need to load the `migraphx.hpp` file. This example runs inference on the Inception v3 model.
|
||||
|
||||
```c++
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
#include <migraphx/migraphx.hpp>
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
migraphx::program prog;
|
||||
migraphx::onnx_options onnx_opts;
|
||||
// import and parse onnx file into migraphx::program
|
||||
prog = parse_onnx("inceptioni1.onnx", onnx_opts);
|
||||
// print imported model
|
||||
prog.print();
|
||||
migraphx::target targ = migraphx::target("gpu");
|
||||
migraphx::compile_options comp_opts;
|
||||
comp_opts.set_offload_copy();
|
||||
// compile for the GPU
|
||||
prog.compile(targ, comp_opts);
|
||||
// print the compiled program
|
||||
prog.print();
|
||||
// randomly generate input image
|
||||
// of shape (1, 3, 299, 299)
|
||||
std::srand(unsigned(std::time(nullptr)));
|
||||
std::vector<float> input_image(1*299*299*3);
|
||||
std::generate(input_image.begin(), input_image.end(), std::rand);
|
||||
// users need to provide data for the input
|
||||
// parameters in order to run inference
|
||||
// you can query into migraph program for the parameters
|
||||
migraphx::program_parameters prog_params;
|
||||
auto param_shapes = prog.get_parameter_shapes();
|
||||
auto input = param_shapes.names().front();
|
||||
// create argument for the parameter
|
||||
prog_params.add(input, migraphx::argument(param_shapes[input], input_image.data()));
|
||||
// run inference
|
||||
auto outputs = prog.eval(prog_params);
|
||||
// read back the output
|
||||
float* results = reinterpret_cast<float*>(outputs[0].data());
|
||||
float* max = std::max_element(results, results + 1000);
|
||||
int answer = max - results;
|
||||
std::cout << "answer: " << answer << std::endl;
|
||||
}
|
||||
```
|
||||
|
||||
2. To compile this program, you can use CMake and you only need to link the `migraphx::c` library to use MIGraphX's C++ API. The following is the `CMakeLists.txt` file that can build the earlier example:
|
||||
|
||||
```cmake
|
||||
cmake_minimum_required(VERSION 3.5)
|
||||
project (CAI)
|
||||
|
||||
set (CMAKE_CXX_STANDARD 14)
|
||||
set (EXAMPLE inception_inference)
|
||||
|
||||
list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
find_package (migraphx)
|
||||
|
||||
message("source file: " ${EXAMPLE}.cpp " ---> bin: " ${EXAMPLE})
|
||||
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
|
||||
|
||||
target_link_libraries(${EXAMPLE} migraphx::c)
|
||||
```
|
||||
|
||||
3. To build the executable file, run the following from the directory containing the `inception_inference.cpp` file:
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j$(nproc)
|
||||
./inception_inference
|
||||
```
|
||||
|
||||
:::{note}
|
||||
Set `LD_LIBRARY_PATH` to `/opt/rocm/lib` if required during the build. Additional examples can be found in the MIGraphX repository under the `/examples/` directory.
|
||||
:::
|
||||
|
||||
## Tuning MIGraphX
|
||||
|
||||
MIGraphX uses MIOpen kernels to target AMD GPU. For the model compiled with MIGraphX, tune MIOpen to pick the best possible kernel implementation. The MIOpen tuning results in a significant performance boost. Tuning can be done by setting the environment variable `MIOPEN_FIND_ENFORCE=3`.
|
||||
|
||||
:::{note}
|
||||
The tuning process can take a long time to finish.
|
||||
:::
|
||||
|
||||
**Example:** The average inference time of the inception model example shown previously over 100 iterations using untuned kernels is 0.01383ms. After tuning, it reduces to 0.00459ms, which is a 3x improvement. This result is from ROCm v4.5 on a MI100 GPU.
|
||||
|
||||
:::{note}
|
||||
The results may vary depending on the system configurations.
|
||||
:::
|
||||
|
||||
For reference, the following code snippet shows inference runs for only the first 10 iterations for both tuned and untuned kernels:
|
||||
|
||||
```console
|
||||
### UNTUNED ###
|
||||
iterator : 0
|
||||
Inference complete
|
||||
Inference time: 0.063ms
|
||||
iterator : 1
|
||||
Inference complete
|
||||
Inference time: 0.008ms
|
||||
iterator : 2
|
||||
Inference complete
|
||||
Inference time: 0.007ms
|
||||
iterator : 3
|
||||
Inference complete
|
||||
Inference time: 0.007ms
|
||||
iterator : 4
|
||||
Inference complete
|
||||
Inference time: 0.007ms
|
||||
iterator : 5
|
||||
Inference complete
|
||||
Inference time: 0.008ms
|
||||
iterator : 6
|
||||
Inference complete
|
||||
Inference time: 0.007ms
|
||||
iterator : 7
|
||||
Inference complete
|
||||
Inference time: 0.028ms
|
||||
iterator : 8
|
||||
Inference complete
|
||||
Inference time: 0.029ms
|
||||
iterator : 9
|
||||
Inference complete
|
||||
Inference time: 0.029ms
|
||||
|
||||
### TUNED ###
|
||||
iterator : 0
|
||||
Inference complete
|
||||
Inference time: 0.063ms
|
||||
iterator : 1
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
iterator : 2
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
iterator : 3
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
iterator : 4
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
iterator : 5
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
iterator : 6
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
iterator : 7
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
iterator : 8
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
iterator : 9
|
||||
Inference complete
|
||||
Inference time: 0.004ms
|
||||
```
|
||||
|
||||
### YModel
|
||||
|
||||
The best inference performance through MIGraphX is conditioned upon having tuned kernel configurations stored in a `/home` local User Database (DB). If a user were to move their model to a different server or allow a different user to use it, they would have to run through the MIOpen tuning process again to populate the next User DB with the best kernel configurations and corresponding solvers.
|
||||
|
||||
Tuning is time consuming, and if the users have not performed tuning, they would see discrepancies between expected or claimed inference performance and actual inference performance. This has led to repetitive and time-consuming tuning tasks for each user.
|
||||
|
||||
MIGraphX introduces a feature, known as YModel, that stores the kernel config parameters found during tuning into a `.mxr` file. This ensures the same level of expected performance, even when a model is copied to a different user/system.
|
||||
|
||||
The YModel feature is available starting from ROCm 5.4.1 and UIF 1.1.
|
||||
|
||||
#### YModel example
|
||||
|
||||
Through the `migraphx-driver` functionality, you can generate `.mxr` files with tuning information stored inside it by passing additional `--binary --output model.mxr` to `migraphx-driver` along with the rest of the necessary flags.
|
||||
|
||||
For example, to generate `.mxr` file from the ONNX model, use the following:
|
||||
|
||||
```bash
|
||||
./path/to/migraphx-driver compile --onnx resnet50.onnx --enable-offload-copy --binary --output resnet50.mxr
|
||||
```
|
||||
|
||||
To run generated `.mxr` files through `migraphx-driver`, use the following:
|
||||
|
||||
```bash
|
||||
./path/to/migraphx-driver run --migraphx resnet50.mxr --enable-offload-copy
|
||||
```
|
||||
|
||||
Alternatively, you can use the MIGraphX C++ or Python API to generate `.mxr` files.
|
||||
|
||||

|
||||
@@ -53,7 +53,7 @@ The following sections contain case studies for the Inception V3 model.
|
||||
|
||||
### Inception V3 with PyTorch
|
||||
|
||||
Convolution Neural Networks are forms of artificial neural networks commonly used for image processing. One of the core layers of such a network is the convolutional layer, which convolves the input with a weight tensor and passes the result to the next layer. Inception V3 is an architectural development over the ImageNet competition-winning entry, AlexNet, using more profound and broader networks while attempting to meet computational and memory budgets.
|
||||
Convolution Neural Networks are forms of artificial neural networks commonly used for image processing. One of the core layers of such a network is the convolutional layer, which convolves the input with a weight tensor and passes the result to the next layer. Inception V3[^inception_arch] is an architectural development over the ImageNet competition-winning entry, AlexNet, using more profound and broader networks while attempting to meet computational and memory budgets.
|
||||
|
||||
The implementation uses PyTorch as a framework. This case study utilizes [TorchVision](https://pytorch.org/vision/stable/index.html), a repository of popular datasets and model architectures, for obtaining the model. TorchVision also provides pre-trained weights as a starting point to develop new models or fine-tune the model for a new task.
|
||||
|
||||
@@ -65,7 +65,7 @@ This example is adapted from the PyTorch research hub page on [Inception V3](htt
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. Run the PyTorch ROCm-based Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:install/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
|
||||
1. Run the PyTorch ROCm-based Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
|
||||
|
||||
```dockerfile
|
||||
docker run -it -v $HOME:/data --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G rocm/pytorch:latest
|
||||
@@ -155,14 +155,14 @@ The previous section focused on downloading and using the Inception V3 model for
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. Run the PyTorch ROCm Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:install/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
|
||||
1. Run the PyTorch ROCm Docker image or refer to the section {doc}`Installing PyTorch <rocm-install-on-linux:how-to/3rd-party/pytorch-install>` for setting up a PyTorch environment on ROCm.
|
||||
|
||||
```dockerfile
|
||||
docker pull rocm/pytorch:latest
|
||||
docker run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G rocm/pytorch:latest
|
||||
```
|
||||
|
||||
2. Download an ImageNet database. For this example, the `tiny-imagenet-200`, a smaller ImageNet variant with 200 image classes and a training dataset with 100,000 images, was downsized to 64x64 color images.
|
||||
2. Download an ImageNet database. For this example, the `tiny-imagenet-200`[^Stanford_deep_learning], a smaller ImageNet variant with 200 image classes and a training dataset with 100,000 images, was downsized to 64x64 color images.
|
||||
|
||||
```bash
|
||||
wget http://cs231n.stanford.edu/tiny-imagenet-200.zip
|
||||
@@ -366,7 +366,7 @@ Follow these steps:
|
||||
model.to(device)
|
||||
```
|
||||
|
||||
13. Set the loss criteria. For this example, Cross Entropy Loss is used.
|
||||
13. Set the loss criteria. For this example, Cross Entropy Loss[^cross_entropy] is used.
|
||||
|
||||
```py
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
@@ -586,7 +586,7 @@ Follow these steps:
|
||||
import torch.optim as optim
|
||||
```
|
||||
|
||||
10. Set the loss criteria. For this example, Cross Entropy Loss is used.
|
||||
10. Set the loss criteria. For this example, Cross Entropy Loss[^cross_entropy] is used.
|
||||
|
||||
```py
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
|
||||
21
docs/conceptual/compiler-disambiguation.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="ROCm compilers disambiguation">
|
||||
<meta name="keywords" content="compilers, compiler naming, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# ROCm compilers disambiguation
|
||||
|
||||
ROCm ships multiple compilers of varying origins and purposes. This article
|
||||
disambiguates compiler naming used throughout the documentation.
|
||||
|
||||
## Compiler terms
|
||||
|
||||
| Term | Description |
|
||||
| - | - |
|
||||
| `amdclang++` | Clang/LLVM-based compiler that is part of `rocm-llvm` package. The source code is available at <a href="https://github.com/ROCm/llvm-project" target="_blank">https://github.com/ROCm/llvm-project</a>. |
|
||||
| AOCC | Closed-source clang-based compiler that includes additional CPU optimizations. Offered as part of ROCm via the `rocm-llvm-alt` package. See for details, <a href="https://developer.amd.com/amd-aocc/" target="_blank">https://developer.amd.com/amd-aocc/</a>. |
|
||||
| HIP-Clang | Informal term for the `amdclang++` compiler |
|
||||
| HIPIFY | Tools including `hipify-clang` and `hipify-perl`, used to automatically translate CUDA source code into portable HIP C++. The source code is available at <a href="https://github.com/ROCm/HIPIFY" target="_blank">https://github.com/ROCm/HIPIFY</a> |
|
||||
| `hipcc` | HIP compiler driver. A utility that invokes `clang` or `nvcc` depending on the target and passes the appropriate include and library options for the target compiler and HIP infrastructure. The source code is available at <a href="https://github.com/ROCm/HIPCC" target="_blank">https://github.com/ROCm/HIPCC</a>. |
|
||||
| ROCmCC | Clang/LLVM-based compiler. ROCmCC in itself is not a binary but refers to the overall compiler. |
|
||||
@@ -1,14 +0,0 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="AMD ROCm documentation">
|
||||
<meta name="keywords" content="documentation, guides, installation, compatibility, support,
|
||||
reference, ROCm, AMD">
|
||||
</head>
|
||||
|
||||
# Using compiler features
|
||||
|
||||
The following topics describe using specific features of the compilation tools:
|
||||
|
||||
* [ROCm compiler infrastructure](https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html)
|
||||
* [Using AddressSanitizer](https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html)
|
||||
* [OpenMP support](https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/openmp.html)
|
||||
@@ -13,27 +13,26 @@
|
||||
:gutter: 1
|
||||
|
||||
:::{grid-item-card}
|
||||
**AMD Instinct MI300 Series**
|
||||
**AMD Instinct MI300 series**
|
||||
|
||||
Review hardware aspects of the AMD Instinct™ MI300 Series GPUs and the CDNA™ 3
|
||||
Review hardware aspects of the AMD Instinct™ MI300 series of GPU accelerators and the CDNA™ 3
|
||||
architecture.
|
||||
|
||||
* [AMD Instinct™ MI300 microarchitecture](./gpu-arch/mi300.md)
|
||||
* [AMD Instinct MI300/CDNA3 ISA](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf)
|
||||
* [MI300 performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
|
||||
* [MI350 Series performance counters](./gpu-arch/mi350-performance-counters.rst)
|
||||
* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
|
||||
:::
|
||||
|
||||
:::{grid-item-card}
|
||||
**AMD Instinct MI200 Series**
|
||||
**AMD Instinct MI200 series**
|
||||
|
||||
Review hardware aspects of the AMD Instinct™ MI200 Series GPUs and the CDNA™ 2
|
||||
Review hardware aspects of the AMD Instinct™ MI200 series of GPU accelerators and the CDNA™ 2
|
||||
architecture.
|
||||
|
||||
* [AMD Instinct™ MI250 microarchitecture](./gpu-arch/mi250.md)
|
||||
* [AMD Instinct MI200/CDNA2 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf)
|
||||
* [White paper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf)
|
||||
* [Performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
|
||||
|
||||
:::
|
||||
@@ -41,22 +40,22 @@ architecture.
|
||||
:::{grid-item-card}
|
||||
**AMD Instinct MI100**
|
||||
|
||||
Review hardware aspects of the AMD Instinct™ MI100 Series GPUs and the CDNA™ 1
|
||||
Review hardware aspects of the AMD Instinct™ MI100 series of GPU accelerators and the CDNA™ 1
|
||||
architecture.
|
||||
|
||||
* [AMD Instinct™ MI100 microarchitecture](./gpu-arch/mi100.md)
|
||||
* [AMD Instinct MI100/CDNA1 ISA](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
|
||||
* [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf)
|
||||
* [White paper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf)
|
||||
|
||||
:::
|
||||
|
||||
:::{grid-item-card}
|
||||
**RDNA**
|
||||
|
||||
* [AMD RDNA4 ISA](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna4-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA3 ISA](https://www.amd.com/system/files/TechDocs/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf)
|
||||
* [AMD RDNA2 ISA](https://www.amd.com/system/files/TechDocs/rdna2-shader-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA ISA](https://www.amd.com/system/files/TechDocs/rdna-shader-instruction-set-architecture.pdf)
|
||||
* [AMD RDNA Architecture White Paper](https://www.amd.com/system/files/documents/rdna-whitepaper.pdf)
|
||||
|
||||
:::
|
||||
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
---
|
||||
myst:
|
||||
html_meta:
|
||||
"description lang=en": "Learn about the AMD Instinct MI100 Series architecture."
|
||||
"keywords": "Instinct, MI100, microarchitecture, AMD, ROCm"
|
||||
---
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="AMD Instinct MI100 microarchitecture">
|
||||
<meta name="keywords" content="Instinct, MI100, microarchitecture, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# AMD Instinct™ MI100 microarchitecture
|
||||
|
||||
The following image shows the node-level architecture of a system that
|
||||
comprises two AMD EPYC™ processors and (up to) eight AMD Instinct™ GPUs.
|
||||
comprises two AMD EPYC™ processors and (up to) eight AMD Instinct™ accelerators.
|
||||
The two EPYC processors are connected to each other with the AMD Infinity™
|
||||
fabric which provides a high-bandwidth (up to 18 GT/sec) and coherent links such
|
||||
that each processor can access the available node memory as a single
|
||||
@@ -18,29 +17,29 @@ available to connect the processors plus one PCIe Gen 4 x16 link per processor
|
||||
can attach additional I/O devices such as the host adapters for the network
|
||||
fabric.
|
||||
|
||||

|
||||

|
||||
|
||||
In a typical node configuration, each processor can host up to four AMD
|
||||
Instinct™ GPUs that are attached using PCIe Gen 4 links at 16 GT/sec,
|
||||
Instinct™ accelerators that are attached using PCIe Gen 4 links at 16 GT/sec,
|
||||
which corresponds to a peak bidirectional link bandwidth of 32 GB/sec. Each hive
|
||||
of four GPUs can participate in a fully connected, coherent AMD
|
||||
Instinct™ fabric that connects the four GPUs using 23 GT/sec AMD
|
||||
of four accelerators can participate in a fully connected, coherent AMD
|
||||
Instinct™ fabric that connects the four accelerators using 23 GT/sec AMD
|
||||
Infinity fabric links that run at a higher frequency than the inter-processor
|
||||
links. This inter-GPU link can be established in certified server systems if the
|
||||
GPUs are mounted in neighboring PCIe slots by installing the AMD Infinity
|
||||
Fabric™ bridge for the AMD Instinct™ GPUs.
|
||||
Fabric™ bridge for the AMD Instinct™ accelerators.
|
||||
|
||||
## Microarchitecture
|
||||
|
||||
The microarchitecture of the AMD Instinct GPUs is based on the AMD CDNA
|
||||
The microarchitecture of the AMD Instinct accelerators is based on the AMD CDNA
|
||||
architecture, which targets compute applications such as high-performance
|
||||
computing (HPC) and AI & machine learning (ML) that run on everything from
|
||||
individual servers to the world's largest exascale supercomputers. The overall
|
||||
system architecture is designed for extreme scalability and compute performance.
|
||||
|
||||
")
|
||||
")
|
||||
|
||||
The above image shows the AMD Instinct GPU with its PCIe Gen 4 x16
|
||||
The above image shows the AMD Instinct accelerator with its PCIe Gen 4 x16
|
||||
link (16 GT/sec, at the bottom) that connects the GPU to (one of) the host
|
||||
processor(s). It also shows the three AMD Infinity Fabric ports that provide
|
||||
high-speed links (23 GT/sec, also at the bottom) to the other GPUs of the local
|
||||
@@ -48,7 +47,7 @@ hive.
|
||||
|
||||
On the left and right of the floor plan, the High Bandwidth Memory (HBM)
|
||||
attaches via the GPU memory controller. The MI100 generation of the AMD
|
||||
Instinct GPU offers four stacks of HBM generation 2 (HBM2) for a total
|
||||
Instinct accelerator offers four stacks of HBM generation 2 (HBM2) for a total
|
||||
of 32GB with a 4,096bit-wide memory interface. The peak memory bandwidth of the
|
||||
attached HBM2 is 1.228 TB/sec at a memory clock frequency of 1.2 GHz.
|
||||
|
||||
@@ -64,7 +63,7 @@ Therefore, the theoretical maximum FP64 peak performance is 11.5 TFLOPS
|
||||

|
||||
|
||||
The preceding image shows the block diagram of a single CU of an AMD Instinct™
|
||||
MI100 GPU and summarizes how instructions flow through the execution
|
||||
MI100 accelerator and summarizes how instructions flow through the execution
|
||||
engines. The CU fetches the instructions via a 32KB instruction cache and moves
|
||||
them forward to execution via a dispatcher. The CU can handle up to ten
|
||||
wavefronts at a time and feed their instructions into the execution unit. The
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
---
|
||||
myst:
|
||||
html_meta:
|
||||
"description lang=en": "Learn about the AMD Instinct MI250 Series architecture."
|
||||
"keywords": "Instinct, MI250, microarchitecture, AMD, ROCm"
|
||||
---
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="AMD Instinct MI250 microarchitecture">
|
||||
<meta name="keywords" content="Instinct, MI250, microarchitecture, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# AMD Instinct™ MI250 microarchitecture
|
||||
|
||||
The microarchitecture of the AMD Instinct MI250 GPU is based on the
|
||||
The microarchitecture of the AMD Instinct MI250 accelerators is based on the
|
||||
AMD CDNA 2 architecture that targets compute applications such as HPC,
|
||||
artificial intelligence (AI), and machine learning (ML) and that run on
|
||||
everything from individual servers to the world’s largest exascale
|
||||
@@ -34,13 +33,13 @@ Units (CU). The MI250 GCD has 104 active CUs. Each compute unit is further
|
||||
subdivided into four SIMD units that process SIMD instructions of 16 data
|
||||
elements per instruction (for the FP64 data type). This enables the CU to
|
||||
process 64 work items (a so-called “wavefront”) at a peak clock frequency of 1.7
|
||||
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 22.6
|
||||
TFLOPS for vector instructions. This equates to 45.3 TFLOPS for vector instructions for both GCDs together. The MI250 compute units also provide specialized
|
||||
GHz. Therefore, the theoretical maximum FP64 peak performance per GCD is 45.3
|
||||
TFLOPS for vector instructions. The MI250 compute units also provide specialized
|
||||
execution units (also called matrix cores), which are geared toward executing
|
||||
matrix operations like matrix-matrix multiplications. For FP64, the peak
|
||||
performance of these units amounts to 90.5 TFLOPS.
|
||||
|
||||

|
||||

|
||||
|
||||
```{list-table} Peak-performance capabilities of the MI250 OAM for different data types.
|
||||
:header-rows: 1
|
||||
@@ -84,9 +83,16 @@ performance of these units amounts to 90.5 TFLOPS.
|
||||
- 362.1
|
||||
```
|
||||
|
||||
The above table summarizes the aggregated peak performance of the AMD Instinct MI250 Open Compute Platform (OCP) Open Accelerator Modules (OAMs) and its two GCDs for different data types and execution units. The middle column lists the peak performance (number of data elements processed in a single instruction) of a single compute unit if a SIMD (or matrix) instruction is being retired in each clock cycle. The third column lists the theoretical peak performance of the OAM module. The theoretical aggregated peak memory bandwidth of the GPU is 3.2 TB/sec (1.6 TB/sec per GCD).
|
||||
The above table summarizes the aggregated peak performance of the AMD
|
||||
Instinct MI250 OCP Open Accelerator Modules (OAM, OCP is short for Open Compute
|
||||
Platform) and its two GCDs for different data types and execution units. The
|
||||
middle column lists the peak performance (number of data elements processed in a
|
||||
single instruction) of a single compute unit if a SIMD (or matrix) instruction
|
||||
is being retired in each clock cycle. The third column lists the theoretical
|
||||
peak performance of the OAM module. The theoretical aggregated peak memory
|
||||
bandwidth of the GPU is 3.2 TB/sec (1.6 TB/sec per GCD).
|
||||
|
||||

|
||||

|
||||
|
||||
The following image shows the block diagram of an OAM package that consists
|
||||
of two GCDs, each of which constitutes one GPU device in the system. The two
|
||||
@@ -98,18 +104,18 @@ between the two GCDs of an OAM, or a bidirectional peak transfer bandwidth of
|
||||
## Node-level architecture
|
||||
|
||||
The following image shows the node-level architecture of a system that is
|
||||
based on the AMD Instinct MI250 GPU. The MI250 OAMs attach to the host
|
||||
based on the AMD Instinct MI250 accelerator. The MI250 OAMs attach to the host
|
||||
system via PCIe Gen 4 x16 links (yellow lines). Each GCD maintains its own PCIe
|
||||
x16 link to the host part of the system. Depending on the server platform, the
|
||||
GCD can attach to the AMD EPYC processor directly or via an optional PCIe switch
|
||||
. Note that some platforms may offer an x8 interface to the GCDs, which reduces
|
||||
the available host-to-GPU bandwidth.
|
||||
|
||||

|
||||

|
||||
|
||||
The preceding image shows the node-level architecture of a system with AMD
|
||||
EPYC processors in a dual-socket configuration and four AMD Instinct MI250
|
||||
GPUs. The MI250 OAMs attach to the host processors system via PCIe Gen 4
|
||||
accelerators. The MI250 OAMs attach to the host processors system via PCIe Gen 4
|
||||
x16 links (yellow lines). Depending on the system design, a PCIe switch may
|
||||
exist to make more PCIe lanes available for additional components like network
|
||||
interfaces and/or storage devices. Each GCD maintains its own PCIe x16 link to
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
.. meta::
|
||||
:description: MI300 and MI200 Series performance counters and metrics
|
||||
:description: MI300 and MI200 series performance counters and metrics
|
||||
:keywords: MI300, MI200, performance counters, command processor counters
|
||||
|
||||
***************************************************************************************************
|
||||
MI300 and MI200 Series performance counters and metrics
|
||||
MI300 and MI200 series performance counters and metrics
|
||||
***************************************************************************************************
|
||||
|
||||
This document lists and describes the hardware performance counters and derived metrics available
|
||||
for the AMD Instinct™ MI300 and MI200 GPU. You can also access this information using the
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.
|
||||
:doc:`ROCProfiler tool <rocprofiler:rocprofv1>`.
|
||||
|
||||
MI300 and MI200 Series performance counters
|
||||
MI300 and MI200 series performance counters
|
||||
===============================================================
|
||||
|
||||
Series performance counters include the following categories:
|
||||
@@ -27,7 +27,7 @@ The following sections provide additional details for each category.
|
||||
|
||||
.. note::
|
||||
|
||||
Preliminary validation of all MI300 and MI200 Series performance counters is in progress. Those with
|
||||
Preliminary validation of all MI300 and MI200 series performance counters is in progress. Those with
|
||||
an asterisk (*) require further evaluation.
|
||||
|
||||
.. _command-processor-counters:
|
||||
@@ -171,7 +171,7 @@ Instruction mix
|
||||
"``SQ_INSTS_SMEM``", "Instr", "Number of scalar memory instructions issued"
|
||||
"``SQ_INSTS_SMEM_NORM``", "Instr", "Number of scalar memory instructions normalized to match ``smem_level`` issued"
|
||||
"``SQ_INSTS_FLAT``", "Instr", "Number of flat instructions issued"
|
||||
"``SQ_INSTS_FLAT_LDS_ONLY``", "Instr", "**MI200 Series only** Number of FLAT instructions that read/write only from/to LDS issued. Works only if ``EARLY_TA_DONE`` is enabled."
|
||||
"``SQ_INSTS_FLAT_LDS_ONLY``", "Instr", "**MI200 series only** Number of FLAT instructions that read/write only from/to LDS issued. Works only if ``EARLY_TA_DONE`` is enabled."
|
||||
"``SQ_INSTS_LDS``", "Instr", "Number of LDS instructions issued **(MI200: includes flat; MI300: does not include flat)**"
|
||||
"``SQ_INSTS_GDS``", "Instr", "Number of global data share instructions issued"
|
||||
"``SQ_INSTS_EXP_GDS``", "Instr", "Number of EXP and global data share instructions excluding skipped export instructions issued"
|
||||
@@ -396,9 +396,9 @@ Texture cache per pipe counters
|
||||
"``TCP_UTCL1_TRANSLATION_MISS[n]``", "Req", "Number of unified translation cache (L1) translation misses", "0-15"
|
||||
"``TCP_UTCL1_PERMISSION_MISS[n]``", "Req", "Number of unified translation cache (L1) permission misses", "0-15"
|
||||
"``TCP_TOTAL_CACHE_ACCESSES[n]``", "Req", "Number of vector L1d cache accesses including hits and misses", "0-15"
|
||||
"``TCP_TCP_LATENCY[n]``", "Cycles", "**MI200 Series only** Accumulated wave access latency to vL1D over all wavefronts", "0-15"
|
||||
"``TCP_TCC_READ_REQ_LATENCY[n]``", "Cycles", "**MI200 Series only** Total vL1D to L2 request latency over all wavefronts for reads and atomics with return", "0-15"
|
||||
"``TCP_TCC_WRITE_REQ_LATENCY[n]``", "Cycles", "**MI200 Series only** Total vL1D to L2 request latency over all wavefronts for writes and atomics without return", "0-15"
|
||||
"``TCP_TCP_LATENCY[n]``", "Cycles", "**MI200 series only** Accumulated wave access latency to vL1D over all wavefronts", "0-15"
|
||||
"``TCP_TCC_READ_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for reads and atomics with return", "0-15"
|
||||
"``TCP_TCC_WRITE_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for writes and atomics without return", "0-15"
|
||||
"``TCP_TCC_READ_REQ[n]``", "Req", "Number of read requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_WRITE_REQ[n]``", "Req", "Number of write requests to L2 cache", "0-15"
|
||||
"``TCP_TCC_ATOMIC_WITH_RET_REQ[n]``", "Req", "Number of atomic requests to L2 cache with return", "0-15"
|
||||
@@ -560,7 +560,7 @@ Note the following:
|
||||
``TCC_TAG_STALL[n]``, probes can stall the pipeline at a variety of places. There is no single point that
|
||||
can accurately measure the total stalls
|
||||
|
||||
MI300 and MI200 Series derived metrics list
|
||||
MI300 and MI200 series derived metrics list
|
||||
==============================================================
|
||||
|
||||
.. csv-table::
|
||||
@@ -615,6 +615,7 @@ The following table shows the hardware counters *by* all texture addressing unit
|
||||
"``TA_FLAT_READ_WAVEFRONTS_sum``", "Sum of flat opcode reads processed"
|
||||
"``TA_FLAT_WRITE_WAVEFRONTS_sum``", "Sum of flat opcode writes processed"
|
||||
"``TA_FLAT_WAVEFRONTS_sum``", "Total number of flat opcode wavefronts processed"
|
||||
"``TA_FLAT_READ_WAVEFRONTS_sum``", "Total number of flat opcode read wavefronts processed"
|
||||
"``TA_FLAT_ATOMIC_WAVEFRONTS_sum``", "Total number of flat opcode atomic wavefronts processed"
|
||||
"``TA_TOTAL_WAVEFRONTS_sum``", "Total number of wavefronts processed"
|
||||
|
||||
|
||||
@@ -1,23 +1,16 @@
|
||||
---
|
||||
myst:
|
||||
html_meta:
|
||||
"description lang=en": "Learn about the AMD Instinct MI300 Series architecture."
|
||||
"keywords": "Instinct, MI300X, MI300A, microarchitecture, AMD, ROCm"
|
||||
---
|
||||
# AMD Instinct™ MI300 series microarchitecture
|
||||
|
||||
# AMD Instinct™ MI300 Series microarchitecture
|
||||
|
||||
The AMD Instinct MI300 Series GPUs are based on the AMD CDNA 3
|
||||
The AMD Instinct MI300 series accelerators are based on the AMD CDNA 3
|
||||
architecture which was designed to deliver leadership performance for HPC, artificial intelligence (AI), and machine
|
||||
learning (ML) workloads. The AMD Instinct MI300 Series GPUs are well-suited for extreme scalability and compute performance, running
|
||||
learning (ML) workloads. The AMD Instinct MI300 series accelerators are well-suited for extreme scalability and compute performance, running
|
||||
on everything from individual servers to the world’s largest exascale supercomputers.
|
||||
|
||||
With the MI300 Series, AMD is introducing the Accelerator Complex Die (XCD), which contains the
|
||||
With the MI300 series, AMD is introducing the Accelerator Complex Die (XCD), which contains the
|
||||
GPU computational elements of the processor along with the lower levels of the cache hierarchy.
|
||||
|
||||
The following image depicts the structure of a single XCD in the AMD Instinct MI300 GPU Series.
|
||||
The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.
|
||||
|
||||
```{figure} ../../data/shared/xcd-sys-arch.png
|
||||
```{figure} ../../data/conceptual/gpu-arch/image007.png
|
||||
---
|
||||
name: mi300-xcd
|
||||
align: center
|
||||
@@ -39,7 +32,7 @@ infrastructure) using the AMD Infinity Fabric™ technology as interconnect.
|
||||
The Matrix Cores inside the CDNA 3 CUs have significant improvements, emphasizing AI and machine
|
||||
learning, enhancing throughput of existing data types while adding support for new data types.
|
||||
CDNA 2 Matrix Cores support FP16 and BF16, while offering INT8 for inference. Compared to MI250X
|
||||
GPUs, CDNA 3 Matrix Cores triple the performance for FP16 and BF16, while providing a
|
||||
accelerators, CDNA 3 Matrix Cores triple the performance for FP16 and BF16, while providing a
|
||||
performance gain of 6.8 times for INT8. FP8 has a performance gain of 16 times compared to FP32,
|
||||
while TF32 has a gain of 4 times compared to FP32.
|
||||
|
||||
@@ -105,22 +98,22 @@ name: mi300-arch
|
||||
alt:
|
||||
align: center
|
||||
---
|
||||
MI300 Series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, while the MI300X (right) has 8 XCDs.
|
||||
MI300 series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, while the MI300X (right) has 8 XCDs.
|
||||
```
|
||||
|
||||
## Node-level architecture
|
||||
|
||||
```{figure} ../../data/shared/mi300-node-level-arch.png
|
||||
```{figure} ../../data/conceptual/gpu-arch/image009.png
|
||||
---
|
||||
name: mi300-node
|
||||
|
||||
align: center
|
||||
---
|
||||
MI300 Series node-level architecture showing 8 fully interconnected MI300X OAM modules connected to (optional) PCIEe switches via retimers and HGX connectors.
|
||||
MI300 series node-level architecture showing 8 fully interconnected MI300X OAM modules connected to (optional) PCIEe switches via retimers and HGX connectors.
|
||||
```
|
||||
|
||||
The image above shows the node-level architecture of a system with AMD EPYC processors in a
|
||||
dual-socket configuration and eight AMD Instinct MI300X GPUs. The MI300X OAMs attach to the
|
||||
dual-socket configuration and eight AMD Instinct MI300X accelerators. The MI300X OAMs attach to the
|
||||
host system via PCIe Gen 5 x16 links (yellow lines). The GPUs are using seven high-bandwidth,
|
||||
low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected 8-GPU system.
|
||||
|
||||
|
||||
@@ -1,530 +0,0 @@
|
||||
.. meta::
|
||||
:description: MI355 Series performance counters and metrics
|
||||
:keywords: MI355, MI355X, MI3XX
|
||||
|
||||
***********************************
|
||||
MI350 Series performance counters
|
||||
***********************************
|
||||
|
||||
This topic lists and describes the hardware performance counters and derived metrics available on the AMD Instinct MI350 and MI355 GPUs. These counters are available for profiling using `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `ROCm Compute Profiler <https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/>`_.
|
||||
|
||||
The following sections list the performance counters based on the IP blocks.
|
||||
|
||||
Command processor packet processor counters (CPC)
|
||||
==================================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - CPC_ALWAYS_COUNT
|
||||
- Always count.
|
||||
|
||||
* - CPC_ADC_VALID_CHUNK_NOT_AVAIL
|
||||
- ADC valid chunk is not available when dispatch walking is in progress in the multi-xcc mode.
|
||||
|
||||
* - CPC_ADC_DISPATCH_ALLOC_DONE
|
||||
- ADC dispatch allocation is done.
|
||||
|
||||
* - CPC_ADC_VALID_CHUNK_END
|
||||
- ADC crawler's valid chunk end in the multi-xcc mode.
|
||||
|
||||
* - CPC_SYNC_FIFO_FULL_LEVEL
|
||||
- SYNC FIFO full last cycles.
|
||||
|
||||
* - CPC_SYNC_FIFO_FULL
|
||||
- SYNC FIFO full times.
|
||||
|
||||
* - CPC_GD_BUSY
|
||||
- ADC busy.
|
||||
|
||||
* - CPC_TG_SEND
|
||||
- ADC thread group send.
|
||||
|
||||
* - CPC_WALK_NEXT_CHUNK
|
||||
- ADC walking next valid chunk in the multi-xcc mode.
|
||||
|
||||
* - CPC_STALLED_BY_SE0_SPI
|
||||
- ADC CSDATA stalled by SE0SPI.
|
||||
|
||||
* - CPC_STALLED_BY_SE1_SPI
|
||||
- ADC CSDATA stalled by SE1SPI.
|
||||
|
||||
* - CPC_STALLED_BY_SE2_SPI
|
||||
- ADC CSDATA stalled by SE2SPI.
|
||||
|
||||
* - CPC_STALLED_BY_SE3_SPI
|
||||
- ADC CSDATA stalled by SE3SPI.
|
||||
|
||||
* - CPC_LTE_ALL
|
||||
- CPC sync counter LteAll. Only Master XCD manages LteAll.
|
||||
|
||||
* - CPC_SYNC_WRREQ_FIFO_BUSY
|
||||
- CPC sync counter request FIFO is not empty.
|
||||
|
||||
* - CPC_CANE_BUSY
|
||||
- CPC CANE bus is busy, which indicates the presence of inflight sync counter requests.
|
||||
|
||||
* - CPC_CANE_STALL
|
||||
- CPC sync counter sending is stalled by CANE.
|
||||
|
||||
Shader pipe interpolators (SPI) counters
|
||||
=========================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - SPI_CS0_WINDOW_VALID
|
||||
- Clock count enabled by PIPE0 perfcounter_start event.
|
||||
|
||||
* - SPI_CS0_BUSY
|
||||
- Number of clocks with outstanding waves for PIPE0 (SPI or SH).
|
||||
|
||||
* - SPI_CS0_NUM_THREADGROUPS
|
||||
- Number of thread groups launched for PIPE0.
|
||||
|
||||
* - SPI_CS0_CRAWLER_STALL
|
||||
- Number of clocks when PIPE0 event or wave order FIFO is full.
|
||||
|
||||
* - SPI_CS0_EVENT_WAVE
|
||||
- Number of PIPE0 events and waves.
|
||||
|
||||
* - SPI_CS0_WAVE
|
||||
- Number of PIPE0 waves.
|
||||
|
||||
* - SPI_CS1_WINDOW_VALID
|
||||
- Clock count enabled by PIPE1 perfcounter_start event.
|
||||
|
||||
* - SPI_CS1_BUSY
|
||||
- Number of clocks with outstanding waves for PIPE1 (SPI or SH).
|
||||
|
||||
* - SPI_CS1_NUM_THREADGROUPS
|
||||
- Number of thread groups launched for PIPE1.
|
||||
|
||||
* - SPI_CS1_CRAWLER_STALL
|
||||
- Number of clocks when PIPE1 event or wave order FIFO is full.
|
||||
|
||||
* - SPI_CS1_EVENT_WAVE
|
||||
- Number of PIPE1 events and waves.
|
||||
|
||||
* - SPI_CS1_WAVE
|
||||
- Number of PIPE1 waves.
|
||||
|
||||
* - SPI_CS2_WINDOW_VALID
|
||||
- Clock count enabled by PIPE2 perfcounter_start event.
|
||||
|
||||
* - SPI_CS2_BUSY
|
||||
- Number of clocks with outstanding waves for PIPE2 (SPI or SH).
|
||||
|
||||
* - SPI_CS2_NUM_THREADGROUPS
|
||||
- Number of thread groups launched for PIPE2.
|
||||
|
||||
* - SPI_CS2_CRAWLER_STALL
|
||||
- Number of clocks when PIPE2 event or wave order FIFO is full.
|
||||
|
||||
* - SPI_CS2_EVENT_WAVE
|
||||
- Number of PIPE2 events and waves.
|
||||
|
||||
* - SPI_CS2_WAVE
|
||||
- Number of PIPE2 waves.
|
||||
|
||||
* - SPI_CS3_WINDOW_VALID
|
||||
- Clock count enabled by PIPE3 perfcounter_start event.
|
||||
|
||||
* - SPI_CS3_BUSY
|
||||
- Number of clocks with outstanding waves for PIPE3 (SPI or SH).
|
||||
|
||||
* - SPI_CS3_NUM_THREADGROUPS
|
||||
- Number of thread groups launched for PIPE3.
|
||||
|
||||
* - SPI_CS3_CRAWLER_STALL
|
||||
- Number of clocks when PIPE3 event or wave order FIFO is full.
|
||||
|
||||
* - SPI_CS3_EVENT_WAVE
|
||||
- Number of PIPE3 events and waves.
|
||||
|
||||
* - SPI_CS3_WAVE
|
||||
- Number of PIPE3 waves.
|
||||
|
||||
* - SPI_CSQ_P0_Q0_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue0.
|
||||
|
||||
* - SPI_CSQ_P0_Q1_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue1.
|
||||
|
||||
* - SPI_CSQ_P0_Q2_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue2.
|
||||
|
||||
* - SPI_CSQ_P0_Q3_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue3.
|
||||
|
||||
* - SPI_CSQ_P0_Q4_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue4.
|
||||
|
||||
* - SPI_CSQ_P0_Q5_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue5.
|
||||
|
||||
* - SPI_CSQ_P0_Q6_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue6.
|
||||
|
||||
* - SPI_CSQ_P0_Q7_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE0 Queue7.
|
||||
|
||||
* - SPI_CSQ_P1_Q0_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue0.
|
||||
|
||||
* - SPI_CSQ_P1_Q1_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue1.
|
||||
|
||||
* - SPI_CSQ_P1_Q2_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue2.
|
||||
|
||||
* - SPI_CSQ_P1_Q3_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue3.
|
||||
|
||||
* - SPI_CSQ_P1_Q4_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue4.
|
||||
|
||||
* - SPI_CSQ_P1_Q5_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue5.
|
||||
|
||||
* - SPI_CSQ_P1_Q6_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue6.
|
||||
|
||||
* - SPI_CSQ_P1_Q7_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE1 Queue7.
|
||||
|
||||
* - SPI_CSQ_P2_Q0_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue0.
|
||||
|
||||
* - SPI_CSQ_P2_Q1_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue1.
|
||||
|
||||
* - SPI_CSQ_P2_Q2_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue2.
|
||||
|
||||
* - SPI_CSQ_P2_Q3_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue3.
|
||||
|
||||
* - SPI_CSQ_P2_Q4_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue4.
|
||||
|
||||
* - SPI_CSQ_P2_Q5_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue5.
|
||||
|
||||
* - SPI_CSQ_P2_Q6_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue6.
|
||||
|
||||
* - SPI_CSQ_P2_Q7_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE2 Queue7.
|
||||
|
||||
* - SPI_CSQ_P3_Q0_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue0.
|
||||
|
||||
* - SPI_CSQ_P3_Q1_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue1.
|
||||
|
||||
* - SPI_CSQ_P3_Q2_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue2.
|
||||
|
||||
* - SPI_CSQ_P3_Q3_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue3.
|
||||
|
||||
* - SPI_CSQ_P3_Q4_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue4.
|
||||
|
||||
* - SPI_CSQ_P3_Q5_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue5.
|
||||
|
||||
* - SPI_CSQ_P3_Q6_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue6.
|
||||
|
||||
* - SPI_CSQ_P3_Q7_OCCUPANCY
|
||||
- Sum of occupancy info for PIPE3 Queue7.
|
||||
|
||||
* - SPI_CSQ_P0_OCCUPANCY
|
||||
- Sum of occupancy info for all PIPE0 queues.
|
||||
|
||||
* - SPI_CSQ_P1_OCCUPANCY
|
||||
- Sum of occupancy info for all PIPE1 queues.
|
||||
|
||||
* - SPI_CSQ_P2_OCCUPANCY
|
||||
- Sum of occupancy info for all PIPE2 queues.
|
||||
|
||||
* - SPI_CSQ_P3_OCCUPANCY
|
||||
- Sum of occupancy info for all PIPE3 queues.
|
||||
|
||||
* - SPI_VWC0_VDATA_VALID_WR
|
||||
- Number of clocks VGPR bus_0 writes VGPRs.
|
||||
|
||||
* - SPI_VWC1_VDATA_VALID_WR
|
||||
- Number of clocks VGPR bus_1 writes VGPRs.
|
||||
|
||||
* - SPI_CSC_WAVE_CNT_BUSY
|
||||
- Number of cycles when there is any wave in the pipe.
|
||||
|
||||
Compute unit (SQ) counters
|
||||
===========================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - SQ_INSTS_VALU_MFMA_F6F4
|
||||
- Number of VALU V_MFMA_*_F6F4 instructions.
|
||||
|
||||
* - SQ_INSTS_VALU_MFMA_MOPS_F6F4
|
||||
- Number of VALU matrix with the performed math operations (add or mul) divided by 512, assuming a full EXEC mask of F6 or F4 data type.
|
||||
|
||||
* - SQ_ACTIVE_INST_VALU2
|
||||
- Number of quad-cycles when two VALU instructions are issued (per-simd, nondeterministic).
|
||||
|
||||
* - SQ_INSTS_LDS_LOAD
|
||||
- Number of LDS load instructions issued (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_STORE
|
||||
- Number of LDS store instructions issued (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_ATOMIC
|
||||
- Number of LDS atomic instructions issued (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_LOAD_BANDWIDTH
|
||||
- Total number of 64-bytes loaded (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_STORE_BANDWIDTH
|
||||
- Total number of 64-bytes written (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_LDS_ATOMIC_BANDWIDTH
|
||||
- Total number of 64-bytes atomic (instrSize * CountOnes(EXEC))/64 (per-simd, emulated).
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP16
|
||||
- Counts FLOPS per instruction on float 16 excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP32
|
||||
- Counts FLOPS per instruction on float 32 excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP64
|
||||
- Counts FLOPS per instruction on float 64 excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP16_TRANS
|
||||
- Counts FLOPS per instruction on float 16 trans excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP32_TRANS
|
||||
- Counts FLOPS per instruction on float 32 trans excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_FLOPS_FP64_TRANS
|
||||
- Counts FLOPS per instruction on float 64 trans excluding MFMA/SMFMA.
|
||||
|
||||
* - SQ_INSTS_VALU_IOPS
|
||||
- Counts OPS per instruction on integer or unsigned or bit data (per-simd, emulated).
|
||||
|
||||
* - SQ_LDS_DATA_FIFO_FULL
|
||||
- Number of cycles LDS data FIFO is full (nondeterministic, unwindowed).
|
||||
|
||||
* - SQ_LDS_CMD_FIFO_FULL
|
||||
- Number of cycles LDS command FIFO is full (nondeterministic, unwindowed).
|
||||
|
||||
* - SQ_VMEM_TA_ADDR_FIFO_FULL
|
||||
- Number of cycles texture requests are stalled due to full address FIFO in TA (nondeterministic, unwindowed).
|
||||
|
||||
* - SQ_VMEM_TA_CMD_FIFO_FULL
|
||||
- Number of cycles texture requests are stalled due to full cmd FIFO in TA (nondeterministic, unwindowed).
|
||||
|
||||
* - SQ_VMEM_WR_TA_DATA_FIFO_FULL
|
||||
- Number of cycles texture writes are stalled due to full data FIFO in TA (nondeterministic, unwindowed).
|
||||
|
||||
* - SQC_ICACHE_MISSES_DUPLICATE
|
||||
- Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
|
||||
|
||||
* - SQC_DCACHE_MISSES_DUPLICATE
|
||||
- Number of duplicate misses (access to a non-resident, miss pending CL) (per-SQ, per-Bank, nondeterministic).
|
||||
|
||||
Texture addressing (TA) unit counters
|
||||
======================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - TA_BUFFER_READ_LDS_WAVEFRONTS
|
||||
- Number of buffer read wavefronts for LDS return processed by the TA.
|
||||
|
||||
* - TA_FLAT_READ_LDS_WAVEFRONTS
|
||||
- Number of flat opcode reads for LDS return processed by the TA.
|
||||
|
||||
Texture data (TD) unit counters
|
||||
================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - TD_WRITE_ACKT_WAVEFRONT
|
||||
- Number of write acknowledgments, sent to SQ and not to SP.
|
||||
|
||||
* - TD_TD_SP_TRAFFIC
|
||||
- Number of times this TD sends data to the SP.
|
||||
|
||||
Texture cache per pipe (TCP) counters
|
||||
======================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - TCP_TCP_TA_ADDR_STALL_CYCLES
|
||||
- TCP stalls TA addr interface.
|
||||
|
||||
* - TCP_TCP_TA_DATA_STALL_CYCLES
|
||||
- TCP stalls TA data interface. Now windowed.
|
||||
|
||||
* - TCP_LFIFO_STALL_CYCLES
|
||||
- Memory latency FIFOs full stall.
|
||||
|
||||
* - TCP_RFIFO_STALL_CYCLES
|
||||
- Memory Request FIFOs full stall.
|
||||
|
||||
* - TCP_TCR_RDRET_STALL
|
||||
- Write into cache stalled by read return from TCR.
|
||||
|
||||
* - TCP_PENDING_STALL_CYCLES
|
||||
- Stall due to data pending from L2.
|
||||
|
||||
* - TCP_UTCL1_SERIALIZATION_STALL
|
||||
- Total number of stalls caused due to serializing translation requests through the UTCL1.
|
||||
|
||||
* - TCP_UTCL1_THRASHING_STALL
|
||||
- Stall caused by thrashing feature in any probe. Lacks accuracy when the stall signal overlaps between probe0 and probe1, which is worse with MECO of thrashing deadlock. Some probe0 events could miss being counted in with MECO on. This perf count provides a rough thrashing estimate.
|
||||
|
||||
* - TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS
|
||||
- Translation miss_under_miss.
|
||||
|
||||
* - TCP_UTCL1_STALL_INFLIGHT_MAX
|
||||
- Total UTCL1 stalls due to inflight counter saturation.
|
||||
|
||||
* - TCP_UTCL1_STALL_LRU_INFLIGHT
|
||||
- Total UTCL1 stalls due to LRU cache line with inflight traffic.
|
||||
|
||||
* - TCP_UTCL1_STALL_MULTI_MISS
|
||||
- Total UTCL1 stalls due to arbitrated multiple misses.
|
||||
|
||||
* - TCP_UTCL1_LFIFO_FULL
|
||||
- Total UTCL1 and UTCL2 latency, which hides FIFO full cycles.
|
||||
|
||||
* - TCP_UTCL1_STALL_LFIFO_NOT_RES
|
||||
- Total UTCL1 stalls due to UTCL2 latency, which hides FIFO output (not resident).
|
||||
|
||||
* - TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS
|
||||
- Total UTCL1 stalls due to UTCL2_req being out of credits.
|
||||
|
||||
* - TCP_CLIENT_UTCL1_INFLIGHT
|
||||
- The sum of inflight client to UTCL1 requests per cycle.
|
||||
|
||||
* - TCP_TAGRAM0_REQ
|
||||
- Total L2 requests mapping to TagRAM 0 from this TCP to all TCCs.
|
||||
|
||||
* - TCP_TAGRAM1_REQ
|
||||
- Total L2 requests mapping to TagRAM 1 from this TCP to all TCCs.
|
||||
|
||||
* - TCP_TAGRAM2_REQ
|
||||
- Total L2 requests mapping to TagRAM 2 from this TCP to all TCCs.
|
||||
|
||||
* - TCP_TAGRAM3_REQ
|
||||
- Total L2 requests mapping to TagRAM 3 from this TCP to all TCCs.
|
||||
|
||||
* - TCP_TCP_LATENCY
|
||||
- Total TCP wave latency (from the first clock of wave entering to the first clock of wave leaving). Divide by TA_TCP_STATE_READ to find average wave latency.
|
||||
|
||||
* - TCP_TCC_READ_REQ_LATENCY
|
||||
- Total TCP to TCC request latency for reads and atomics with return. Not Windowed.
|
||||
|
||||
* - TCP_TCC_WRITE_REQ_LATENCY
|
||||
- Total TCP to TCC request latency for writes and atomics without return. Not Windowed.
|
||||
|
||||
* - TCP_TCC_WRITE_REQ_HOLE_LATENCY
|
||||
- Total TCP req to TCC hole latency for writes and atomics. Not Windowed.
|
||||
|
||||
Texture cache per channel (TCC) counters
|
||||
=========================================
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Hardware counter
|
||||
- Definition
|
||||
|
||||
* - TCC_READ_SECTORS
|
||||
- Total number of 32B data sectors in read requests.
|
||||
|
||||
* - TCC_WRITE_SECTORS
|
||||
- Total number of 32B data sectors in write requests.
|
||||
|
||||
* - TCC_ATOMIC_SECTORS
|
||||
- Total number of 32B data sectors in atomic requests.
|
||||
|
||||
* - TCC_BYPASS_REQ
|
||||
- Number of bypass requests. This is measured at the tag block.
|
||||
|
||||
* - TCC_LATENCY_FIFO_FULL
|
||||
- Number of cycles when the latency FIFO is full.
|
||||
|
||||
* - TCC_SRC_FIFO_FULL
|
||||
- Number of cycles when the SRC FIFO is assumed to be full as measured at the IB block.
|
||||
|
||||
* - TCC_EA0_RDREQ_64B
|
||||
- Number of 64-byte TCC/EA read requests.
|
||||
|
||||
* - TCC_EA0_RDREQ_128B
|
||||
- Number of 128-byte TCC/EA read requests.
|
||||
|
||||
* - TCC_IB_REQ
|
||||
- Number of requests through the IB. This measures the number of raw requests from graphics clients to this TCC.
|
||||
|
||||
* - TCC_IB_STALL
|
||||
- Number of cycles when the IB output is stalled.
|
||||
|
||||
* - TCC_EA0_WRREQ_WRITE_DRAM
|
||||
- Number of TCC/EA write requests (32-byte or 64-byte) destined for DRAM (MC).
|
||||
|
||||
* - TCC_EA0_WRREQ_ATOMIC_DRAM
|
||||
- Number of TCC/EA atomic requests (32-byte or 64-byte) destined for DRAM (MC).
|
||||
|
||||
* - TCC_EA0_RDREQ_DRAM_32B
|
||||
- Number of 32-byte TCC/EA read requests due to DRAM traffic. One 64-byte request is counted as two and one 128-byte as four.
|
||||
|
||||
* - TCC_EA0_RDREQ_GMI_32B
|
||||
- Number of 32-byte TCC/EA read requests due to GMI traffic. One 64-byte request is counted as two and one 128-byte as four.
|
||||
|
||||
* - TCC_EA0_RDREQ_IO_32B
|
||||
- Number of 32-byte TCC/EA read requests due to IO traffic. One 64-byte request is counted as two and one 128-byte as four.
|
||||
|
||||
* - TCC_EA0_WRREQ_WRITE_DRAM_32B
|
||||
- Number of 32-byte TCC/EA write requests due to DRAM traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_ATOMIC_DRAM_32B
|
||||
- Number of 32-byte TCC/EA atomic requests due to DRAM traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_WRITE_GMI_32B
|
||||
- Number of 32-byte TCC/EA write requests due to GMI traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_ATOMIC_GMI_32B
|
||||
- Number of 32-byte TCC/EA atomic requests due to GMI traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_WRITE_IO_32B
|
||||
- Number of 32-byte TCC/EA write requests due to IO traffic. One 64-byte request is counted as two.
|
||||
|
||||
* - TCC_EA0_WRREQ_ATOMIC_IO_32B
|
||||
- Number of 32-byte TCC/EA atomic requests due to IO traffic. One 64-byte request is counted as two.
|
||||
@@ -34,7 +34,7 @@ Runtime
|
||||
|
||||
```{code-block} shell
|
||||
:caption: Example to expose the 1. device and a device based on UUID.
|
||||
export ROCR_VISIBLE_DEVICES="0,GPU-4b2c1a9f-8d3e-6f7a-b5c9-2e4d8a1f6c3b"
|
||||
export ROCR_VISIBLE_DEVICES="0,GPU-DEADBEEFDEADBEEF"
|
||||
```
|
||||
|
||||
### `GPU_DEVICE_ORDINAL`
|
||||
@@ -42,7 +42,7 @@ export ROCR_VISIBLE_DEVICES="0,GPU-4b2c1a9f-8d3e-6f7a-b5c9-2e4d8a1f6c3b"
|
||||
Devices indices exposed to OpenCL and HIP applications.
|
||||
|
||||
Runtime
|
||||
: ROCm Compute Language Runtime (`ROCclr`). Applies to applications and runtimes
|
||||
: ROCm Common Language Runtime (`ROCclr`). Applies to applications and runtimes
|
||||
using the `ROCclr` abstraction layer including HIP and OpenCL applications.
|
||||
|
||||
```{code-block} shell
|
||||
|
||||
241
docs/conceptual/gpu-memory.md
Normal file
@@ -0,0 +1,241 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="GPU memory">
|
||||
<meta name="keywords" content="GPU memory, VRAM, video random access memory, pageable
|
||||
memory, pinned memory, managed memory, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# GPU memory
|
||||
|
||||
For the HIP reference documentation, see:
|
||||
|
||||
* {doc}`hip:doxygen/html/group___memory`
|
||||
* {doc}`hip:doxygen/html/group___memory_m`
|
||||
|
||||
Host memory exists on the host (e.g. CPU) of the machine in random access memory (RAM).
|
||||
|
||||
Device memory exists on the device (e.g. GPU) of the machine in video random access memory (VRAM).
|
||||
Recent architectures use graphics double data rate (GDDR) synchronous dynamic random-access memory (SDRAM)such as GDDR6, or high-bandwidth memory (HBM) such as HBM2e.
|
||||
|
||||
## Memory allocation
|
||||
|
||||
Memory can be allocated in two ways: pageable memory, and pinned memory.
|
||||
The following API calls with result in these allocations:
|
||||
|
||||
| API | Data location | Allocation |
|
||||
|--------------------|---------------|------------|
|
||||
| System allocated | Host | Pageable |
|
||||
| `hipMallocManaged` | Host | Managed |
|
||||
| `hipHostMalloc` | Host | Pinned |
|
||||
| `hipMalloc` | Device | Pinned |
|
||||
|
||||
:::{tip}
|
||||
`hipMalloc` and `hipFree` are blocking calls, however, HIP recently added non-blocking versions `hipMallocAsync` and `hipFreeAsync` which take in a stream as an additional argument.
|
||||
:::
|
||||
|
||||
### Pageable memory
|
||||
|
||||
Pageable memory is usually gotten when calling `malloc` or `new` in a C++ application.
|
||||
It is unique in that it exists on "pages" (blocks of memory), which can be migrated to other memory storage.
|
||||
For example, migrating memory between CPU sockets on a motherboard, or a system that runs out of space in RAM and starts dumping pages of RAM into the swap partition of your hard drive.
|
||||
|
||||
### Pinned memory
|
||||
|
||||
Pinned memory (or page-locked memory, or non-pageable memory) is host memory that is mapped into the address space of all GPUs, meaning that the pointer can be used on both host and device.
|
||||
Accessing host-resident pinned memory in device kernels is generally not recommended for performance, as it can force the data to traverse the host-device interconnect (e.g. PCIe), which is much slower than the on-device bandwidth (>40x on MI200).
|
||||
|
||||
Pinned host memory can be allocated with one of two types of coherence support:
|
||||
|
||||
:::{note}
|
||||
In HIP, pinned memory allocations are coherent by default (`hipHostMallocDefault`).
|
||||
There are additional pinned memory flags (e.g. `hipHostMallocMapped` and `hipHostMallocPortable`).
|
||||
On MI200 these options do not impact performance.
|
||||
<!-- TODO: link to programming_manual#memory-allocation-flags -->
|
||||
For more information, see the section *memory allocation flags* in the HIP Programming Guide: {doc}`hip:user_guide/programming_manual`.
|
||||
:::
|
||||
|
||||
Much like how a process can be locked to a CPU core by setting affinity, a pinned memory allocator does this with the memory storage system.
|
||||
On multi-socket systems it is important to ensure that pinned memory is located on the same socket as the owning process, or else each cache line will be moved through the CPU-CPU interconnect, thereby increasing latency and potentially decreasing bandwidth.
|
||||
|
||||
In practice, pinned memory is used to improve transfer times between host and device.
|
||||
For transfer operations, such as `hipMemcpy` or `hipMemcpyAsync`, using pinned memory instead of pageable memory on host can lead to a ~3x improvement in bandwidth.
|
||||
|
||||
:::{tip}
|
||||
If the application needs to move data back and forth between device and host (separate allocations), use pinned memory on the host side.
|
||||
:::
|
||||
|
||||
### Managed memory
|
||||
|
||||
Managed memory refers to universally addressable, or unified memory available on the MI200 series of GPUs.
|
||||
Much like pinned memory, managed memory shares a pointer between host and device and (by default) supports fine-grained coherence, however, managed memory can also automatically migrate pages between host and device.
|
||||
The allocation will be managed by AMD GPU driver using the Linux HMM (Heterogeneous Memory Management) mechanism.
|
||||
|
||||
If heterogenous memory management (HMM) is not available, then `hipMallocManaged` will default back to using system memory and will act like pinned host memory.
|
||||
Other managed memory API calls will have undefined behavior.
|
||||
It is therefore recommended to check for managed memory capability with: `hipDeviceGetAttribute` and `hipDeviceAttributeManagedMemory`.
|
||||
|
||||
HIP supports additional calls that work with page migration:
|
||||
|
||||
* `hipMemAdvise`
|
||||
* `hipMemPrefetchAsync`
|
||||
|
||||
:::{tip}
|
||||
If the application needs to use data on both host and device regularly, does not want to deal with separate allocations, and is not worried about maxing out the VRAM on MI200 GPUs (64 GB per GCD), use managed memory.
|
||||
:::
|
||||
|
||||
:::{tip}
|
||||
If managed memory performance is poor, check to see if managed memory is supported on your system and if page migration (XNACK) is enabled.
|
||||
:::
|
||||
|
||||
## Access behavior
|
||||
|
||||
Memory allocations for GPUs behave as follow:
|
||||
|
||||
| API | Data location | Host access | Device access |
|
||||
|--------------------|---------------|--------------|----------------------|
|
||||
| System allocated | Host | Local access | Unhandled page fault |
|
||||
| `hipMallocManaged` | Host | Local access | Zero-copy |
|
||||
| `hipHostMalloc` | Host | Local access | Zero-copy* |
|
||||
| `hipMalloc` | Device | Zero-copy | Local access |
|
||||
|
||||
Zero-copy accesses happen over the Infinity Fabric interconnect or PCI-E lanes on discrete GPUs.
|
||||
|
||||
:::{note}
|
||||
While `hipHostMalloc` allocated memory is accessible by a device, the host pointer must be converted to a device pointer with `hipHostGetDevicePointer`.
|
||||
|
||||
Memory allocated through standard system allocators such as `malloc`, can be accessed a device by registering the memory via `hipHostRegister`.
|
||||
The device pointer to be used in kernels can be retrieved with `hipHostGetDevicePointer`.
|
||||
Registered memory is treated like `hipHostMalloc` and will have similar performance.
|
||||
|
||||
On devices that support and have [](#xnack) enabled, such as the MI250X, `hipHostRegister` is not required as memory accesses are handled via automatic page migration.
|
||||
:::
|
||||
|
||||
### XNACK
|
||||
|
||||
Normally, host and device memory are separate and data has to be transferred manually via `hipMemcpy`.
|
||||
|
||||
On a subset of GPUs, such as the MI200, there is an option to automatically migrate pages of memory between host and device.
|
||||
This is important for managed memory, where the locality of the data is important for performance.
|
||||
Depending on the system, page migration may be disabled by default in which case managed memory will act like pinned host memory and suffer degraded performance.
|
||||
|
||||
*XNACK* describes the GPUs ability to retry memory accesses that failed due a page fault (which normally would lead to a memory access error), and instead retrieve the missing page.
|
||||
|
||||
This also affects memory allocated by the system as indicated by the following table:
|
||||
|
||||
| API | Data location | Host after device access | Device after host access |
|
||||
|--------------------|---------------|--------------------------|--------------------------|
|
||||
| System allocated | Host | Migrate page to host | Migrate page to device |
|
||||
| `hipMallocManaged` | Host | Migrate page to host | Migrate page to device |
|
||||
| `hipHostMalloc` | Host | Local access | Zero-copy |
|
||||
| `hipMalloc` | Device | Zero-copy | Local access |
|
||||
|
||||
To check if page migration is available on a platform, use `rocminfo`:
|
||||
|
||||
```sh
|
||||
$ rocminfo | grep xnack
|
||||
Name: amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-
|
||||
```
|
||||
|
||||
Here, `xnack-` means that XNACK is available but is disabled by default.
|
||||
Turning on XNACK by setting the environment variable `HSA_XNACK=1` and gives the expected result, `xnack+`:
|
||||
|
||||
```sh
|
||||
$ HSA_XNACK=1 rocminfo | grep xnack
|
||||
Name: amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack+
|
||||
```
|
||||
|
||||
`hipcc`by default will generate code that runs correctly with both XNACK enabled or disabled.
|
||||
Setting the `--offload-arch=`-option with `xnack+` or `xnack-` forces code to be only run with XNACK enabled or disabled respectively.
|
||||
|
||||
```sh
|
||||
# Compiled kernels will run regardless if XNACK is enabled or is disabled.
|
||||
hipcc --offload-arch=gfx90a
|
||||
|
||||
# Compiled kernels will only be run if XNACK is enabled with XNACK=1.
|
||||
hipcc --offload-arch=gfx90a:xnack+
|
||||
|
||||
# Compiled kernels will only be run if XNACK is disabled with XNACK=0.
|
||||
hipcc --offload-arch=gfx90a:xnack-
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
If you want to make use of page migration, use managed memory. While pageable memory will migrate correctly, it is not a portable solution and can have performance issues if the accessed data isn't page aligned.
|
||||
:::
|
||||
|
||||
### Coherence
|
||||
|
||||
* *Coarse-grained coherence* means that memory is only considered up to date at kernel boundaries, which can be enforced through `hipDeviceSynchronize`, `hipStreamSynchronize`, or any blocking operation that acts on the null stream (e.g. `hipMemcpy`).
|
||||
For example, cacheable memory is a type of coarse-grained memory where an up-to-date copy of the data can be stored elsewhere (e.g. in an L2 cache).
|
||||
* *Fine-grained coherence* means the coherence is supported while a CPU/GPU kernel is running.
|
||||
This can be useful if both host and device are operating on the same dataspace using system-scope atomic operations (e.g. updating an error code or flag to a buffer).
|
||||
Fine-grained memory implies that up-to-date data may be made visible to others regardless of kernel boundaries as discussed above.
|
||||
|
||||
| API | Flag | Coherence |
|
||||
|-------------------------|------------------------------|----------------|
|
||||
| `hipHostMalloc` | `hipHostMallocDefault` | Fine-grained |
|
||||
| `hipHostMalloc` | `hipHostMallocNonCoherent` | Coarse-grained |
|
||||
|
||||
| API | Flag | Coherence |
|
||||
|-------------------------|------------------------------|----------------|
|
||||
| `hipExtMallocWithFlags` | `hipDeviceMallocDefault` | Coarse-grained |
|
||||
| `hipExtMallocWithFlags` | `hipDeviceMallocFinegrained` | Fine-grained |
|
||||
|
||||
| API | `hipMemAdvise` argument | Coherence |
|
||||
|-------------------------|------------------------------|----------------|
|
||||
| `hipMallocManaged` | | Fine-grained |
|
||||
| `hipMallocManaged` | `hipMemAdviseSetCoarseGrain` | Coarse-grained |
|
||||
| `malloc` | | Fine-grained |
|
||||
| `malloc` | `hipMemAdviseSetCoarseGrain` | Coarse-grained |
|
||||
|
||||
:::{tip}
|
||||
Try to design your algorithms to avoid host-device memory coherence (e.g. system scope atomics). While it can be a useful feature in very specific cases, it is not supported on all systems, and can negatively impact performance by introducing the host-device interconnect bottleneck.
|
||||
:::
|
||||
|
||||
The availability of fine- and coarse-grained memory pools can be checked with `rocminfo`:
|
||||
|
||||
```sh
|
||||
$ rocminfo
|
||||
...
|
||||
*******
|
||||
Agent 1
|
||||
*******
|
||||
Name: AMD EPYC 7742 64-Core Processor
|
||||
...
|
||||
Pool Info:
|
||||
Pool 1
|
||||
Segment: GLOBAL; FLAGS: FINE GRAINED
|
||||
...
|
||||
Pool 3
|
||||
Segment: GLOBAL; FLAGS: COARSE GRAINED
|
||||
...
|
||||
*******
|
||||
Agent 9
|
||||
*******
|
||||
Name: gfx90a
|
||||
...
|
||||
Pool Info:
|
||||
Pool 1
|
||||
Segment: GLOBAL; FLAGS: COARSE GRAINED
|
||||
...
|
||||
```
|
||||
|
||||
## System direct memory access
|
||||
|
||||
In most cases, the default behavior for HIP in transferring data from a pinned host allocation to device will run at the limit of the interconnect.
|
||||
However, there are certain cases where the interconnect is not the bottleneck.
|
||||
|
||||
The primary way to transfer data onto and off of a GPU, such as the MI200, is to use the onboard System Direct Memory Access engine, which is used to feed blocks of memory to the off-device interconnect (either GPU-CPU or GPU-GPU).
|
||||
Each GCD has a separate SDMA engine for host-to-device and device-to-host memory transfers.
|
||||
Importantly, SDMA engines are separate from the computing infrastructure, meaning that memory transfers to and from a device will not impact kernel compute performance, though they do impact memory bandwidth to a limited extent.
|
||||
The SDMA engines are mainly tuned for PCIe-4.0 x16, which means they are designed to operate at bandwidths up to 32 GB/s.
|
||||
|
||||
:::{note}
|
||||
An important feature of the MI250X platform is the Infinity Fabric™ interconnect between host and device.
|
||||
The Infinity Fabric interconnect supports improved performance over standard PCIe-4.0 (usually ~50% more bandwidth); however, since the SDMA engine does not run at this speed, it will not max out the bandwidth of the faster interconnect.
|
||||
:::
|
||||
|
||||
The bandwidth limitation can be countered by bypassing the SDMA engine and replacing it with a type of copy kernel known as a "blit" kernel.
|
||||
Blit kernels will use the compute units on the GPU, thereby consuming compute resources, which may not always be beneficial.
|
||||
The easiest way to enable blit kernels is to set an environment variable `HSA_ENABLE_SDMA=0`, which will disable the SDMA engine.
|
||||
On systems where the GPU uses a PCIe interconnect instead of an Infinity Fabric interconnect, blit kernels will not impact bandwidth, but will still consume compute resources.
|
||||
The use of SDMA vs blit kernels also applies to MPI data transfers and GPU-GPU transfers.
|
||||
250
docs/conceptual/using-gpu-sanitizer.md
Normal file
@@ -0,0 +1,250 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="description" content="Using the LLVM ASan on a GPU">
|
||||
<meta name="keywords" content="LLVM, ASan, address sanitizer, AddressSanitizer, instrumented
|
||||
libraries, instrumented applications, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# Using the LLVM ASan on a GPU (beta release)
|
||||
|
||||
The LLVM AddressSanitizer (ASan) provides a process that allows developers to detect runtime addressing errors in applications and libraries. The detection is achieved using a combination of compiler-added instrumentation and runtime techniques, including function interception and replacement.
|
||||
|
||||
Until now, the LLVM ASan process was only available for traditional purely CPU applications. However, ROCm has extended this mechanism to additionally allow the detection of some addressing errors on the GPU in heterogeneous applications. Ideally, developers should treat heterogeneous HIP and OpenMP applications exactly like pure CPU applications. However, this simplicity has not been achieved yet.
|
||||
|
||||
This document provides documentation on using ROCm ASan.
|
||||
For information about LLVM ASan, see the [LLVM documentation](https://clang.llvm.org/docs/AddressSanitizer.html).
|
||||
|
||||
:::{note}
|
||||
The beta release of LLVM ASan for ROCm is currently tested and validated on Ubuntu 20.04.
|
||||
:::
|
||||
|
||||
## Compiling for ASan
|
||||
|
||||
The ASan process begins by compiling the application of interest with the ASan instrumentation.
|
||||
|
||||
Recommendations for doing this are:
|
||||
|
||||
* Compile as many application and dependent library sources as possible using an AMD-built clang-based compiler such as `amdclang++`.
|
||||
* Add the following options to the existing compiler and linker options:
|
||||
* `-fsanitize=address` - enables instrumentation
|
||||
* `-shared-libsan` - use shared version of runtime
|
||||
* `-g` - add debug info for improved reporting
|
||||
* Explicitly use `xnack+` in the offload architecture option. For example, `--offload-arch=gfx90a:xnack+`
|
||||
Other architectures are allowed, but their device code will not be instrumented and a warning will be emitted.
|
||||
|
||||
It is not an error to compile some files without ASan instrumentation, but doing so reduces the ability of the process to detect addressing errors. However, if the main program "`a.out`" does not directly depend on the ASan runtime (`libclang_rt.asan-x86_64.so`) after the build completes (check by running `ldd` (List Dynamic Dependencies) or `readelf`), the application will immediately report an error at runtime as described in the next section.
|
||||
|
||||
### About compilation time
|
||||
|
||||
When `-fsanitize=address` is used, the LLVM compiler adds instrumentation code around every memory operation. This added code must be handled by all of the downstream components of the compiler toolchain and results in increased overall compilation time. This increase is especially evident in the AMDGPU device compiler and has in a few instances raised the compile time to an unacceptable level.
|
||||
|
||||
There are a few options if the compile time becomes unacceptable:
|
||||
|
||||
* Avoid instrumentation of the files which have the worst compile times. This will reduce the effectiveness of the ASan process.
|
||||
* Add the option `-fsanitize-recover=address` to the compiles with the worst compile times. This option simplifies the added instrumentation resulting in faster compilation. See below for more information.
|
||||
* Disable instrumentation on a per-function basis by adding `__attribute__`((no_sanitize("address"))) to functions found to be responsible for the large compile time. Again, this will reduce the effectiveness of the process.
|
||||
|
||||
## Installing ROCm GPU ASan packages
|
||||
|
||||
For a complete ROCm GPU Sanitizer installation, including packages, instrumented HSA and HIP runtimes, tools, and math libraries, use the following instruction,
|
||||
|
||||
```bash
|
||||
sudo apt-get install rocm-ml-sdk-asan
|
||||
|
||||
```
|
||||
|
||||
## Using AMD-supplied ASan instrumented libraries
|
||||
|
||||
ROCm releases have optional packages that contain additional ASan instrumented builds of the ROCm libraries (usually found in `/opt/rocm-<version>/lib`). The instrumented libraries have identical names to the regular uninstrumented libraries, and are located in `/opt/rocm-<version>/lib/asan`.
|
||||
These additional libraries are built using the `amdclang++` and `hipcc` compilers, while some uninstrumented libraries are built with g++. The preexisting build options are used but, as described above, additional options are used: `-fsanitize=address`, `-shared-libsan` and `-g`.
|
||||
|
||||
These additional libraries avoid additional developer effort to locate repositories, identify the correct branch, check out the correct tags, and other efforts needed to build the libraries from the source. And they extend the ability of the process to detect addressing errors into the ROCm libraries themselves.
|
||||
|
||||
When adjusting an application build to add instrumentation, linking against these instrumented libraries is unnecessary. For example, any `-L` `/opt/rocm-<version>/lib` compiler options need not be changed. However, the instrumented libraries should be used when the application is run. It is particularly important that the instrumented language runtimes, like `libamdhip64.so` and `librocm-core.so`, are used; otherwise, device invalid access detections may not be reported.
|
||||
|
||||
## Running ASan instrumented applications
|
||||
|
||||
### Preparing to run an instrumented application
|
||||
|
||||
Here are a few recommendations to consider before running an ASan instrumented heterogeneous application.
|
||||
|
||||
* Ensure the Linux kernel running on the system has Heterogeneous Memory Management (HMM) support. A kernel version of 5.6 or higher should be sufficient.
|
||||
* Ensure XNACK is enabled
|
||||
* For `gfx90a` (MI-2X0) or `gfx940` (MI-3X0) use environment `HSA_XNACK = 1`.
|
||||
* For `gfx906` (MI-50) or `gfx908` (MI-100) use environment `HSA_XNACK = 1` but also ensure the amdgpu kernel module is loaded with module argument `noretry=0`.
|
||||
This requirement is due to the fact that the XNACK setting for these GPUs is system-wide.
|
||||
|
||||
* Ensure that the application will use the instrumented libraries when it runs. The output from the shell command `ldd <application name>` can be used to see which libraries will be used.
|
||||
If the instrumented libraries are not listed by `ldd`, the environment variable `LD_LIBRARY_PATH` may need to be adjusted, or in some cases an `RPATH` compiled into the application may need to be changed and the application recompiled.
|
||||
|
||||
* Ensure that the application depends on the ASan runtime. This can be checked by running the command `readelf -d <application name> | grep NEEDED` and verifying that shared library: `libclang_rt.asan-x86_64.so` appears in the output.
|
||||
If it does not appear, when executed the application will quickly output an ASan error that looks like:
|
||||
|
||||
```bash
|
||||
==3210==ASan runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD.
|
||||
```
|
||||
|
||||
* Ensure that the application `llvm-symbolizer` can be executed, and that it is located in `/opt/rocm-<version>/llvm/bin`. This executable is not strictly required, but if found is used to translate ("symbolize") a host-side instruction address into a more useful function name, file name, and line number (assuming the application has been built to include debug information).
|
||||
|
||||
There is an environment variable, `ASAN_OPTIONS`, that can be used to adjust the runtime behavior of the ASAN runtime itself. There are more than a hundred "flags" that can be adjusted (see an old list at [flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)) but the default settings are correct and should be used in most cases. It must be noted that these options only affect the host ASAN runtime. The device runtime only currently supports the default settings for the few relevant options.
|
||||
|
||||
There are two `ASAN_OPTION` flags of particular note.
|
||||
|
||||
* `halt_on_error=0/1 default 1`.
|
||||
|
||||
This tells the ASAN runtime to halt the application immediately after detecting and reporting an addressing error. The default makes sense because the application has entered the realm of undefined behavior. If the developer wishes to have the application continue anyway, this option can be set to zero. However, the application and libraries should then be compiled with the additional option `-fsanitize-recover=address`. Note that the ROCm optional ASan instrumented libraries are not compiled with this option and if an error is detected within one of them, but halt_on_error is set to 0, more undefined behavior will occur.
|
||||
|
||||
* `detect_leaks=0/1 default 1`.
|
||||
This option directs the ASan runtime to enable the [Leak Sanitizer](https://clang.llvm.org/docs/LeakSanitizer.html) (LSAN). Unfortunately, for heterogeneous applications, this default will result in significant output from the leak sanitizer when the application exits due to allocations made by the language runtime which are not considered to be to be leaks. This output can be avoided by adding `detect_leaks=0` to the `ASAN_OPTIONS`, or alternatively by producing an LSAN suppression file (syntax described [here](https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer)) and activating it with environment variable `LSAN_OPTIONS=suppressions=/path/to/suppression/file`. When using a suppression file, a suppression report is printed by default. The suppression report can be disabled by using the `LSAN_OPTIONS` flag `print_suppressions=0`.
|
||||
|
||||
## Runtime overhead
|
||||
|
||||
Running an ASan instrumented application incurs
|
||||
overheads which may result in unacceptably long runtimes
|
||||
or failure to run at all.
|
||||
|
||||
### Higher execution time
|
||||
|
||||
ASan detection works by checking each address at runtime
|
||||
before the address is actually accessed by a load, store, or atomic
|
||||
instruction.
|
||||
This checking involves an additional load to "shadow" memory which
|
||||
records whether the address is "poisoned" or not, and additional logic
|
||||
that decides whether to produce an detection report or not.
|
||||
|
||||
This extra runtime work can cause the application to slow down by
|
||||
a factor of three or more, depending on how many memory accesses are
|
||||
executed.
|
||||
For heterogeneous applications, the shadow memory must be accessible by all devices
|
||||
and this can mean that shadow accesses from some devices may be more costly
|
||||
than non-shadow accesses.
|
||||
|
||||
### Higher memory use
|
||||
|
||||
The address checking described above relies on the compiler to surround
|
||||
each program variable with a red zone and on ASan
|
||||
runtime to surround each runtime memory allocation with a red zone and
|
||||
fill the shadow corresponding to each red zone with poison.
|
||||
The added memory for the red zones is additional overhead on top
|
||||
of the 13% overhead for the shadow memory itself.
|
||||
|
||||
Applications which consume most one or more available memory pools when
|
||||
run normally are likely to encounter allocation failures when run with
|
||||
instrumentation.
|
||||
|
||||
## Runtime reporting
|
||||
|
||||
It is not the intention of this document to provide a detailed explanation of all of the types of reports that can be output by the ASan runtime. Instead, the focus is on the differences between the standard reports for CPU issues, and reports for GPU issues.
|
||||
|
||||
An invalid address detection report for the CPU always starts with
|
||||
|
||||
```bash
|
||||
==<PID>==ERROR: AddressSanitizer: <problem type> on address <memory address> at pc <pc> bp <bp> sp <sp> <access> of size <N> at <memory address> thread T0
|
||||
```
|
||||
|
||||
and continues with a stack trace for the access, a stack trace for the allocation and deallocation, if relevant, and a dump of the shadow near the <memory address>.
|
||||
|
||||
In contrast, an invalid address detection report for the GPU always starts with
|
||||
|
||||
```bash
|
||||
==<PID>==ERROR: AddressSanitizer: <problem type> on amdgpu device <device> at pc <pc> <access> of size <n> in workgroup id (<X>,<Y>,<Z>)
|
||||
```
|
||||
|
||||
Above, `<device>` is the integer device ID, and `(<X>, <Y>, <Z>)` is the ID of the workgroup or block where the invalid address was detected.
|
||||
|
||||
While the CPU report include a call stack for the thread attempting the invalid access, the GPU is currently to a call stack of size one, i.e. the (symbolized) of the invalid access, e.g.
|
||||
|
||||
```bash
|
||||
#0 <pc> in <fuction signature> at /path/to/file.hip:<line>:<column>
|
||||
```
|
||||
|
||||
This short call stack is followed by a GPU unique section that looks like
|
||||
|
||||
```bash
|
||||
Thread ids and accessed addresses:
|
||||
<lid0> <maddr 0> : <lid1> <maddr1> : ...
|
||||
```
|
||||
|
||||
where each `<lid j> <maddr j>` indicates the lane ID and the invalid memory address held by lane `j` of the wavefront attempting the invalid access.
|
||||
|
||||
Additionally, reports for invalid GPU accesses to memory allocated by GPU code via `malloc` or new starting with, for example,
|
||||
|
||||
```bash
|
||||
==1234==ERROR: AddressSanitizer: heap-buffer-overflow on amdgpu device 0 at pc 0x7fa9f5c92dcc
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
==5678==ERROR: AddressSanitizer: heap-use-after-free on amdgpu device 3 at pc 0x7f4c10062d74
|
||||
```
|
||||
|
||||
currently may include one or two surprising CPU side tracebacks mentioning :`hostcall`". This is due to how `malloc` and `free` are implemented for GPU code and these call stacks can be ignored.
|
||||
|
||||
### Running with `rocgdb`
|
||||
|
||||
`rocgdb` can be used to further investigate ASan detected errors, with some preparation.
|
||||
|
||||
Currently, the ASan runtime complains when starting `rocgdb` without preparation.
|
||||
|
||||
```bash
|
||||
$ rocgdb my_app
|
||||
==1122==ASan` runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD.
|
||||
```
|
||||
|
||||
This is solved by setting environment variable `LD_PRELOAD` to the path to the ASan runtime, whose path can be obtained using the command
|
||||
|
||||
```bash
|
||||
amdclang++ -print-file-name=libclang_rt.asan-x86_64.so
|
||||
```
|
||||
|
||||
It is also recommended to set the environment variable `HIP_ENABLE_DEFERRED_LOADING=0` before debugging HIP applications.
|
||||
|
||||
After starting `rocgdb` breakpoints can be set on the ASan runtime error reporting entry points of interest. For example, if an ASan error report includes
|
||||
|
||||
```bash
|
||||
WRITE of size 4 in workgroup id (10,0,0)
|
||||
```
|
||||
|
||||
the `rocgdb` command needed to stop the program before the report is printed is
|
||||
|
||||
```bash
|
||||
(gdb) break __asan_report_store4
|
||||
```
|
||||
|
||||
Similarly, the appropriate command for a report including
|
||||
|
||||
```bash
|
||||
READ of size <N> in workgroup ID (1,2,3)
|
||||
```
|
||||
|
||||
is
|
||||
|
||||
```bash
|
||||
(gdb) break __asan_report_load<N>
|
||||
```
|
||||
|
||||
It is possible to set breakpoints on all ASan report functions using these commands:
|
||||
|
||||
```bash
|
||||
$ rocgdb <path to application>
|
||||
(gdb) start <commmand line arguments>
|
||||
(gdb) rbreak ^__asan_report
|
||||
(gdb) c
|
||||
```
|
||||
|
||||
### Using ASan with a short HIP application
|
||||
|
||||
Refer to the following example to use ASan with a short HIP application,
|
||||
|
||||
https://github.com/Rmalavally/rocm-examples/blob/Rmalavally-patch-1/LLVM_ASAN/Using-Address-Sanitizer-with-a-Short-HIP-Application.md
|
||||
|
||||
### Known issues with using GPU sanitizer
|
||||
|
||||
* Red zones must have limited size and it is possible for an invalid access to completely miss a red zone and not be detected.
|
||||
|
||||
* Lack of detection or false reports can be caused by the runtime not properly maintaining red zone shadows.
|
||||
|
||||
* Lack of detection on the GPU might also be due to the implementation not instrumenting accesses to all GPU specific address spaces. For example, in the current implementation accesses to "private" or "stack" variables on the GPU are not instrumented, and accesses to HIP shared variables (also known as "local data store" or "LDS") are also not instrumented.
|
||||
|
||||
* It can also be the case that a memory fault is hit for an invalid address even with the instrumentation. This is usually caused by the invalid address being so wild that its shadow address is outside of any memory region, and the fault actually occurs on the access to the shadow address. It is also possible to hit a memory fault for the `NULL` pointer. While address 0 does have a shadow location, it is not poisoned by the runtime.
|
||||
288
docs/conf.py
@@ -4,72 +4,26 @@
|
||||
# list see the documentation:
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from subprocess import run
|
||||
import jinja2
|
||||
import os
|
||||
|
||||
gh_release_path = os.path.join("..", "RELEASE.md")
|
||||
gh_changelog_path = os.path.join("..", "CHANGELOG.md")
|
||||
sphinx_release_path = os.path.join("about", "release-notes.md")
|
||||
sphinx_changelog_path = os.path.join("release", "changelog.md")
|
||||
shutil.copy2(gh_release_path, sphinx_release_path)
|
||||
shutil.copy2(gh_changelog_path, sphinx_changelog_path)
|
||||
# Environment to process Jinja templates.
|
||||
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
|
||||
|
||||
# Mark the consolidated changelog as orphan to prevent Sphinx from warning about missing toctree entries
|
||||
with open(sphinx_changelog_path, "r+", encoding="utf-8") as file:
|
||||
content = file.read()
|
||||
file.seek(0)
|
||||
file.write(":orphan:\n" + content)
|
||||
# Jinja templates to render out.
|
||||
templates = []
|
||||
|
||||
# Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
|
||||
with open(sphinx_changelog_path, "r", encoding="utf-8") as file:
|
||||
lines = file.readlines()
|
||||
# Render templates and output files without the last extension.
|
||||
# For example: 'install.md.jinja' becomes 'install.md'.
|
||||
for template in templates:
|
||||
rendered = jinja_env.get_template(template).render()
|
||||
with open(os.path.splitext(template)[0], 'w') as file:
|
||||
file.write(rendered)
|
||||
|
||||
modified_lines = []
|
||||
in_admonition_section = False
|
||||
|
||||
# Map for matching the specific admonition type to its corresponding Sphinx markdown syntax
|
||||
admonition_types = {
|
||||
'> [!NOTE]': '```{note}',
|
||||
'> [!TIP]': '```{tip}',
|
||||
'> [!IMPORTANT]': '```{important}',
|
||||
'> [!WARNING]': '```{warning}',
|
||||
'> [!CAUTION]': '```{caution}'
|
||||
}
|
||||
|
||||
for line in lines:
|
||||
if any(line.startswith(k) for k in admonition_types):
|
||||
for key in admonition_types:
|
||||
if(line.startswith(key)):
|
||||
modified_lines.append(admonition_types[key] + '\n')
|
||||
break
|
||||
in_admonition_section = True
|
||||
elif in_admonition_section:
|
||||
if line.strip() == '':
|
||||
# If we encounter an empty line, close the admonition section
|
||||
modified_lines.append('```\n\n') # Close the admonition block
|
||||
in_admonition_section = False
|
||||
else:
|
||||
modified_lines.append(line.lstrip('> '))
|
||||
else:
|
||||
modified_lines.append(line)
|
||||
|
||||
# In case the file ended while still in a admonition section, close it
|
||||
if in_admonition_section:
|
||||
modified_lines.append('```')
|
||||
|
||||
file.close()
|
||||
|
||||
with open(sphinx_changelog_path, "w", encoding="utf-8") as file:
|
||||
file.writelines(modified_lines)
|
||||
|
||||
matrix_path = os.path.join("compatibility", "compatibility-matrix-historical-6.0.csv")
|
||||
rtd_path = os.path.join("..", "_readthedocs", "html", "downloads")
|
||||
if not os.path.exists(rtd_path):
|
||||
os.makedirs(rtd_path)
|
||||
shutil.copy2(matrix_path, rtd_path)
|
||||
shutil.copy2('../RELEASE.md','./about/release-notes.md')
|
||||
# Keep capitalization due to similar linking on GitHub's markdown preview.
|
||||
shutil.copy2('../CHANGELOG.md','./about/CHANGELOG.md')
|
||||
|
||||
latex_engine = "xelatex"
|
||||
latex_elements = {
|
||||
@@ -80,204 +34,72 @@ latex_elements = {
|
||||
"""
|
||||
}
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "rocm.docs.amd.com")
|
||||
html_context = {"docs_header_version": "7.1.1"}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
# Check if the branch is a docs/ branch
|
||||
official_branch = run(["git", "rev-parse", "--abbrev-ref", "HEAD"], capture_output=True, text=True).stdout.find("docs/")
|
||||
|
||||
# configurations for PDF output by Read the Docs
|
||||
project = "ROCm Documentation"
|
||||
project_path = os.path.abspath(".").replace("\\", "/")
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "7.2.0"
|
||||
release = "7.2.0"
|
||||
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "6.0.1"
|
||||
release = "6.0.1"
|
||||
setting_all_article_info = True
|
||||
all_article_info_os = ["linux", "windows"]
|
||||
all_article_info_author = ""
|
||||
|
||||
# pages with specific settings
|
||||
article_pages = [
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2026-01-21"},
|
||||
{"file": "release/changelog", "os": ["linux"],},
|
||||
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/tensorflow-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/jax-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/verl-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/stanford-megatron-lm-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/flashinfer-compatibility", "os": ["linux"]},
|
||||
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
|
||||
{
|
||||
"file":"about/release-notes",
|
||||
"os":["linux", "windows"],
|
||||
"date":"2024-01-31"
|
||||
},
|
||||
{
|
||||
"file":"about/CHANGELOG",
|
||||
"os":["linux", "windows"],
|
||||
"date":"2024-01-31"
|
||||
},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-setup/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-setup/multi-node-setup", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-setup/prerequisite-system-validation", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-setup/system-health-check", "os": ["linux"]},
|
||||
{"file":"install/windows/install-quick", "os":["windows"]},
|
||||
{"file":"install/linux/install-quick", "os":["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
|
||||
{"file":"install/linux/install", "os":["linux"]},
|
||||
{"file":"install/linux/install-options", "os":["linux"]},
|
||||
{"file":"install/linux/prerequisites", "os":["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file":"install/docker", "os":["linux"]},
|
||||
{"file":"install/magma-install", "os":["linux"]},
|
||||
{"file":"install/pytorch-install", "os":["linux"]},
|
||||
{"file":"install/tensorflow-install", "os":["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.2-20251006", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/sglang", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm-mori-distributed", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/sglang-mori-distributed", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13", "os": ["linux"]},
|
||||
{"file":"install/windows/install", "os":["windows"]},
|
||||
{"file":"install/windows/prerequisites", "os":["windows"]},
|
||||
{"file":"install/windows/cli/index", "os":["windows"]},
|
||||
{"file":"install/windows/gui/index", "os":["windows"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
{"file":"about/compatibility/docker-image-support-matrix", "os":["linux"]},
|
||||
{"file":"about/compatibility/user-kernel-space-compat-matrix", "os":["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/profiling-and-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/workload", "os": ["linux"]},
|
||||
{"file":"reference/library-index", "os":["linux"]},
|
||||
|
||||
{"file": "how-to/system-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
|
||||
{"file": "how-to/system-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
|
||||
{"file":"how-to/deep-learning-rocm", "os":["linux"]},
|
||||
{"file":"how-to/gpu-enabled-mpi", "os":["linux"]},
|
||||
{"file":"how-to/system-debugging", "os":["linux"]},
|
||||
{"file":"how-to/tuning-guides", "os":["linux", "windows"]},
|
||||
|
||||
{"file":"rocm-a-z", "os":["linux", "windows"]},
|
||||
]
|
||||
|
||||
exclude_patterns = ['temp']
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
|
||||
# Add the _extensions directory to Python's search path
|
||||
sys.path.append(str(Path(__file__).parent / 'extension'))
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "remote-content", "version-ref", "csv-to-list-table"]
|
||||
|
||||
compatibility_matrix_file = str(Path(__file__).parent / 'compatibility/compatibility-matrix-historical-6.0.csv')
|
||||
extensions = ["rocm_docs"]
|
||||
|
||||
external_projects_current_project = "rocm"
|
||||
|
||||
# Uncomment if facing rate limit exceed issue with local build
|
||||
# external_projects_remote_repository = ""
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://rocm-stg.amd.com/")
|
||||
html_context = {"docs_header_version": "7.1.0"}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
html_context["official_branch"] = official_branch
|
||||
html_context["version"] = version
|
||||
html_context["release"] = release
|
||||
|
||||
html_theme = "rocm_docs_theme"
|
||||
html_theme_options = {"flavor": "rocm-docs-home"}
|
||||
|
||||
html_static_path = ["sphinx/static/css", "extension/how-to/rocm-for-ai/inference"]
|
||||
html_css_files = ["rocm_custom.css", "rocm_rn.css", "vllm-benchmark.css"]
|
||||
html_js_files = ["vllm-benchmark.js"]
|
||||
|
||||
html_title = "ROCm Documentation"
|
||||
|
||||
html_theme_options = {"link_main_doc": False}
|
||||
|
||||
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
|
||||
|
||||
numfig = False
|
||||
suppress_warnings = ["autosectionlabel.*"]
|
||||
|
||||
html_context = {
|
||||
"project_path" : {project_path},
|
||||
"gpu_type" : [('AMD Instinct GPUs', 'intrinsic'), ('AMD gfx families', 'gfx'), ('NVIDIA families', 'nvidia') ],
|
||||
"atomics_type" : [('HW atomics', 'hw-atomics'), ('CAS emulation', 'cas-atomics')],
|
||||
"pcie_type" : [('No PCIe atomics', 'nopcie'), ('PCIe atomics', 'pcie')],
|
||||
"memory_type" : [('Device DRAM', 'device-dram'), ('Migratable Host DRAM', 'migratable-host-dram'), ('Pinned Host DRAM', 'pinned-host-dram')],
|
||||
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
|
||||
"scope_type" : [('Device', 'device'), ('System', 'system')]
|
||||
html_theme_options = {
|
||||
"link_main_doc": False
|
||||
}
|
||||
|
||||
# Disable figure and table numbering
|
||||
numfig = False
|
||||
|
||||
@@ -7,52 +7,34 @@
|
||||
|
||||
# Building documentation
|
||||
|
||||
You can build our documentation via GitHub (in a pull request) or locally (using the command line or
|
||||
Visual Studio (VS) Code.
|
||||
|
||||
## GitHub
|
||||
|
||||
If you open a pull request and scroll down to the summary panel,
|
||||
there is a commit status section. Next to the line
|
||||
`docs/readthedocs.com:advanced-micro-devices-demo`, there is a `Details` link.
|
||||
If you click this, it takes you to the Read the Docs build for your pull request.
|
||||
If you open a pull request on the `develop` branch of a ROCm repository and scroll to the bottom of
|
||||
the page, there is a summary panel. Next to the line
|
||||
`docs/readthedocs.com:advanced-micro-devices-demo`, there is a `Details` link. If you click this, it takes
|
||||
you to the Read the Docs build for your pull request.
|
||||
|
||||

|
||||

|
||||
|
||||
If you don't see this line, click `Show all checks` to get an itemized view.
|
||||
|
||||
## Command line
|
||||
|
||||
You can build our documentation via the command line using Python.
|
||||
|
||||
See the `build.tools.python` setting in the [Read the Docs configuration file](https://github.com/ROCm/ROCm/blob/develop/.readthedocs.yaml) for the Python version used by Read the Docs to build documentation.
|
||||
|
||||
See the [Python requirements file](https://github.com/ROCm/ROCm/blob/develop/docs/sphinx/requirements.txt) for Python packages needed to build the documentation.
|
||||
You can build our documentation via the command line using Python. We use Python 3.8; other
|
||||
versions may not support the build.
|
||||
|
||||
Use the Python Virtual Environment (`venv`) and run the following commands from the project root:
|
||||
|
||||
::::{tab-set}
|
||||
:::{tab-item} Linux and WSL
|
||||
:sync: linux
|
||||
|
||||
```sh
|
||||
python3 -mvenv .venv
|
||||
|
||||
.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
|
||||
.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
|
||||
.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
|
||||
.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
|
||||
```
|
||||
|
||||
:::
|
||||
:::{tab-item} Windows
|
||||
:sync: windows
|
||||
|
||||
```powershell
|
||||
python -mvenv .venv
|
||||
|
||||
.venv\Scripts\python.exe -m pip install -r docs/sphinx/requirements.txt
|
||||
.venv\Scripts\python.exe -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
|
||||
```
|
||||
|
||||
:::
|
||||
::::
|
||||
|
||||
Navigate to `_build/html/index.html` and open this file in a web browser.
|
||||
|
||||
## Visual Studio Code
|
||||
|
||||
@@ -4,74 +4,119 @@
|
||||
<meta name="keywords" content="ROCm, contributing, contribute, maintainer, contributor">
|
||||
</head>
|
||||
|
||||
# Contributing to the ROCm documentation
|
||||
# Contribute to ROCm documentation
|
||||
|
||||
The ROCm documentation, like all of ROCm, is open source and available on GitHub. You can contribute to the ROCm documentation by forking the appropriate repository, making your changes, and opening a pull request.
|
||||
All ROCm projects are GitHub-based, so if you want to contribute, you can do so by:
|
||||
|
||||
To provide feedback on the ROCm documentation, including submitting an issue or suggesting a feature, see [Providing feedback about the ROCm documentation](./feedback.md).
|
||||
|
||||
## The ROCm repositories
|
||||
|
||||
The repositories for ROCm and all ROCm components are available on GitHub.
|
||||
|
||||
| Module | Documentation location |
|
||||
| --- | --- |
|
||||
| ROCm framework | [https://github.com/ROCm/ROCm/tree/develop/docs](https://github.com/ROCm/ROCm/tree/develop/docs) |
|
||||
| ROCm installation for Linux | [https://github.com/ROCm/rocm-install-on-linux/tree/develop/docs](https://github.com/ROCm/rocm-install-on-linux/tree/develop/docs) |
|
||||
| ROCm HIP SDK installation for Windows | [https://github.com/ROCm/rocm-install-on-windows/tree/develop/docs](https://github.com/ROCm/rocm-install-on-windows/tree/develop/docs) |
|
||||
|
||||
Individual components have their own repositories with their own documentation in their own `docs` folders.
|
||||
|
||||
The sub-folders within the `docs` folders across ROCm are typically structured as follows:
|
||||
|
||||
| Sub-folder name | Documentation type |
|
||||
|-------|----------|
|
||||
| `install` | Installation instructions, build instructions, and prerequisites |
|
||||
| `conceptual` | Important concepts |
|
||||
| `how-to` | How to implement specific use cases |
|
||||
| `tutorials` | Tutorials |
|
||||
| `reference` | API references and other reference resources |
|
||||
|
||||
## Editing and adding to the documentation
|
||||
|
||||
ROCm documentation follows the [Google developer documentation style guide](https://developers.google.com/style/highlights).
|
||||
|
||||
Most topics in the ROCm documentation are written in [reStructuredText (rst)](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), with some topics written in Markdown. Only use reStructuredText when adding new topics. Only use Markdown if the topic you are editing is already in Markdown.
|
||||
|
||||
To edit or add to the documentation:
|
||||
|
||||
1. Fork the repository you want to add to or edit.
|
||||
2. Clone your fork locally.
|
||||
3. Create a new local branch cut from the `develop` branch of the repository.
|
||||
4. Make your changes to the documentation.
|
||||
|
||||
5. Optionally, build the documentation locally before creating a pull request by running the following commands from within the `docs` folder:
|
||||
|
||||
```bash
|
||||
pip3 install -r sphinx/requirements.txt # You only need to run this command once
|
||||
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
|
||||
```
|
||||
|
||||
The output files will be located in the `docs/_build` folder. Open `docs/_build/html/index.html` to view the documentation.
|
||||
|
||||
For more information on ROCm build tools, see [Documentation toolchain](toolchain.md).
|
||||
6. Push your changes. A GitHub link will be returned in the output of the `git push` command. Open this link in a browser to create the pull request.
|
||||
|
||||
The documentation is built as part of the checks on pull request, along with spell checking and linting. Scroll to the bottom of your pull request to view all the checks.
|
||||
|
||||
Verify that the linting and spell checking have passed, and that the documentation was built successfully. New words or acronyms can be added to the [wordlist file](https://github.com/ROCm/rocm-docs-core/blob/develop/.wordlist.txt). The wordlist is subject to approval by the ROCm documentation team.
|
||||
|
||||
The Read The Docs build of your pull request can be accessed by clicking on the Details link next to the Read The Docs build check. Verify that your changes are in the build and look as expected.
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
Your pull request will be reviewed by a member of the ROCm documentation team.
|
||||
|
||||
See the [GitHub documentation](https://docs.github.com/en) for information on how to fork and clone a repository, and how to create and push a local branch.
|
||||
* [Submitting a pull request in the appropriate GitHub repository](#submit-a-pull-request)
|
||||
* [Creating an issue in the appropriate GitHub repository](#create-an-issue)
|
||||
* [Suggesting a new feature](#suggest-a-new-feature)
|
||||
|
||||
```{important}
|
||||
By creating a pull request (PR), you agree to allow your contribution to be licensed under the terms of the
|
||||
LICENSE.txt file in the corresponding repository. Different repositories can use different licenses.
|
||||
LICENSE.txt file in the corresponding repository. Different repositories may use different licenses.
|
||||
```
|
||||
|
||||
## Submit a pull request
|
||||
|
||||
To make edits to our documentation via PR, follow these steps:
|
||||
|
||||
1. Identify the repository and the file you want to update. For example, to update this page, you would
|
||||
need to modify content located in this file:
|
||||
`https://github.com/ROCm/ROCm/blob/develop/docs/contribute/contributing.md`
|
||||
|
||||
2. (optional, but recommended) Fork the repository.
|
||||
|
||||
3. Clone the repository locally and (optionally) add your fork. Select the green 'Code' button and copy
|
||||
the URL (e.g., `git@github.com:ROCm/ROCm.git`).
|
||||
|
||||
* From your terminal, run:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:ROCm/ROCm.git
|
||||
```
|
||||
|
||||
* Add your fork to this local copy of the repository. Run:
|
||||
|
||||
```bash
|
||||
git add remote <name-of-my-fork> <git@github.com:my-username/ROCm.git>
|
||||
```
|
||||
|
||||
To get the URL of your fork, go to your GitHub profile, select the fork and click the green 'Code'
|
||||
button (the same process you followed to get the main GitHub repository URL).
|
||||
|
||||
4. Check out the **develop** branch and run 'git pull' (and/or 'git pull origin develop' to ensure your
|
||||
local version has the most recent content.
|
||||
|
||||
5. Create a new branch.
|
||||
|
||||
```bash
|
||||
git checkout -b my-new-branch
|
||||
```
|
||||
|
||||
6. Make your changes locally using your preferred code editor. Follow the guidelines listed on the
|
||||
[documentation structure](./doc-structure.md) page.
|
||||
|
||||
7. (optional) We recommend running a local test build to ensure the content looks the way you expect.
|
||||
|
||||
In your terminal, run the following commands from within your cloned repository:
|
||||
|
||||
```bash
|
||||
cd docs/ # The other commands are run from within the ./docs folder
|
||||
|
||||
pip3 install -r sphinx/requirements.txt # You only need to run this command once
|
||||
|
||||
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
|
||||
```
|
||||
|
||||
The build files are located in the `docs/_build` folder. To preview your build, open the index file
|
||||
(`docs/_build/html/index.html`) file. For more information, see
|
||||
[Building documentation](building.md). To learn
|
||||
more about our build tools, see
|
||||
[Documentation toolchain](toolchain.md).
|
||||
|
||||
8. Commit your changes and push them to GitHub. Run:
|
||||
|
||||
```bash
|
||||
git add <path-to-my-modified-file> # To add all modified files, you can use: git add .
|
||||
|
||||
git commit -m "my-updates"
|
||||
|
||||
git push <name-of-my-fork>
|
||||
```
|
||||
|
||||
After pushing, you will get a GitHub link in the terminal output. Copy this link and paste it into a
|
||||
browser to create your PR.
|
||||
|
||||
## Create an issue
|
||||
|
||||
1. To create a new GitHub issue, select the 'Issues' tab in the appropriate repository
|
||||
(e.g., https://github.com/ROCm/ROCm/issues).
|
||||
2. Use the search bar to make sure the issue doesn't already exist.
|
||||
3. If your issue is not already listed, select the green 'New issue' button to the right of the page. Select
|
||||
the type of issue and fill in the resulting template.
|
||||
|
||||
### General issue guidelines
|
||||
|
||||
* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and
|
||||
comment or post to provide additional details, such as how you reproduced this issue.
|
||||
* If you're not sure if your issue is the same, err on the side of caution and file your issue.
|
||||
You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
|
||||
your issue as being the same as the existing issue, we'll close the duplicate.
|
||||
* If your issue doesn't exist, use the issue template to file a new issue.
|
||||
* When filing an issue, be sure to provide as much information as possible, including script output so
|
||||
we can collect information about your configuration. This helps reduce the time required to
|
||||
reproduce your issue.
|
||||
* Check your issue regularly, as we may require additional information to successfully reproduce the
|
||||
issue.
|
||||
|
||||
## Suggest a new feature
|
||||
|
||||
Use the [GitHub Discussion forum](https://github.com/ROCm/ROCm/discussions)
|
||||
(Ideas category) to propose new features. Our maintainers are happy to provide direction and
|
||||
feedback on feature development.
|
||||
|
||||
## Future development workflow
|
||||
|
||||
The current ROCm development workflow is GitHub-based. If, in the future, we change this platform,
|
||||
the tools and links may change. In this instance, we will update contribution guidelines accordingly.
|
||||
|
||||
219
docs/contribute/doc-structure.md
Normal file
@@ -0,0 +1,219 @@
|
||||
# Documentation structure
|
||||
|
||||
Our documentation follows the Pitchfork folder structure. Most documentation files are stored in the
|
||||
`/docs` folder. Some special files (such as release, contributing, and changelog) are stored in the root
|
||||
(`/`) folder.
|
||||
|
||||
All images are stored in the `/docs/data` folder. An image's file path mirrors that of the documentation
|
||||
file where it is used.
|
||||
|
||||
Our naming structure uses kebab case; for example, `my-file-name.rst`.
|
||||
|
||||
## Supported formats and syntax
|
||||
|
||||
Our documentation includes both Markdown and RST files. We are gradually transitioning existing
|
||||
Markdown to RST in order to more effectively meet our documentation needs. When contributing,
|
||||
RST is preferred; if you must use Markdown, use GitHub-flavored Markdown.
|
||||
|
||||
We use [Sphinx Design](https://sphinx-design.readthedocs.io/en/latest/index.html) syntax and compile
|
||||
our API references using [Doxygen](https://www.doxygen.nl/).
|
||||
|
||||
The following table shows some common documentation components and the syntax convention we
|
||||
use for each:
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Component</th>
|
||||
<th>RST syntax</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Code blocks</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
.. code-block:: language-name
|
||||
|
||||
My code block.
|
||||
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cross-referencing internal files</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
:doc:`Title <../path/to/file/filename>`
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>External links</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
`link name <URL>`_
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<tr>
|
||||
<td>Headings</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
******************
|
||||
Chapter title (H1)
|
||||
******************
|
||||
|
||||
Section title (H2)
|
||||
===============
|
||||
|
||||
Subsection title (H3)
|
||||
---------------------
|
||||
|
||||
Sub-subsection title (H4)
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Images</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
.. image:: image1.png
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Internal links</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
1. Add a tag to the section you want to reference:
|
||||
|
||||
.. _my-section-tag: section-1
|
||||
|
||||
Section 1
|
||||
==========
|
||||
|
||||
2. Link to your tag:
|
||||
|
||||
As shown in :ref:`section-1`.
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<tr>
|
||||
<td>Lists</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
# Ordered (numbered) list item
|
||||
|
||||
* Unordered (bulleted) list item
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<tr>
|
||||
<td>Math (block)</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
.. math::
|
||||
|
||||
A = \begin{pmatrix}
|
||||
0.0 & 1.0 & 1.0 & 3.0 \\
|
||||
4.0 & 5.0 & 6.0 & 7.0 \\
|
||||
\end{pmatrix}
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Math (inline)</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
:math:`2 \times 2 `
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Notes</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
.. note::
|
||||
|
||||
My note here.
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Tables</td>
|
||||
<td>
|
||||
|
||||
```rst
|
||||
|
||||
.. csv-table:: Optional title here
|
||||
:widths: 30, 70 #optional column widths
|
||||
:header: "entry1 header", "entry2 header"
|
||||
|
||||
"entry1", "entry2"
|
||||
|
||||
```
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Language and style
|
||||
|
||||
We use the
|
||||
[Google developer documentation style guide](https://developers.google.com/style/highlights) to
|
||||
guide our content.
|
||||
|
||||
Font size and type, page layout, white space control, and other formatting
|
||||
details are controlled via
|
||||
[rocm-docs-core](https://github.com/ROCm/rocm-docs-core). If you want to notify us
|
||||
of any formatting issues, create a pull request in our
|
||||
[rocm-docs-core](https://github.com/ROCm/rocm-docs-core) GitHub repository.
|
||||
|
||||
## Building our documentation
|
||||
|
||||
<!-- % TODO: Fix the link to be able to work at every files -->
|
||||
To learn how to build our documentation, refer to
|
||||
[Building documentation](./building.md).
|
||||
@@ -4,24 +4,33 @@
|
||||
<meta name="keywords" content="documentation, pull request, GitHub, AMD, ROCm">
|
||||
</head>
|
||||
|
||||
# Providing feedback about the ROCm documentation
|
||||
# Providing feedback
|
||||
|
||||
Feedback about the ROCm documentation is welcome. You can provide feedback about the ROCm documentation either through GitHub Discussions or GitHub Issues.
|
||||
There are four standard ways to provide feedback on this repository.
|
||||
|
||||
## Participating in discussions through GitHub Discussions
|
||||
## Pull request
|
||||
|
||||
You can ask questions, view announcements, suggest new features, and communicate with other members of the community through [GitHub Discussions](https://github.com/ROCm/ROCm/discussions).
|
||||
All contributions to ROCm documentation should arrive via the
|
||||
[GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)
|
||||
targeting the develop branch of the repository. If you are unable to contribute
|
||||
via the GitHub Flow, feel free to email us at [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
|
||||
|
||||
## Submitting issues through GitHub Issues
|
||||
For more in-depth information on creating a pull request (PR), see
|
||||
[Contributing](./contributing.md).
|
||||
|
||||
You can submit issues through [GitHub Issues](https://github.com/ROCm/ROCm/issues).
|
||||
## GitHub discussions
|
||||
|
||||
When creating a new issue, follow the following guidelines:
|
||||
To ask questions or view answers to frequently asked questions, refer to
|
||||
[GitHub Discussions](https://github.com/ROCm/ROCm/discussions).
|
||||
On GitHub Discussions, in addition to asking and answering questions,
|
||||
members can share updates, have open-ended conversations,
|
||||
and follow along on via public announcements.
|
||||
|
||||
1. Always do a search to see if the same issue already exists. If the issue already exists, upvote it, and comment or post to provide any additional details you might have.
|
||||
2. If you find an issue that is similar to your issue, log your issue, then add a comment that includes a link to the similar issue, as well as its issue number.
|
||||
3. Always provide as much information as possible. This helps reduce the time required to reproduce the issue.
|
||||
## GitHub issue
|
||||
|
||||
After creating your issue, make sure to check it regularly for any requests for additional information.
|
||||
Issues on existing or absent documentation can be filed in
|
||||
[GitHub Issues](https://github.com/ROCm/ROCm/issues).
|
||||
|
||||
For information about contributing content to the ROCm documentation, see [Contributing to the ROCm documentation](./contributing.md).
|
||||
## Email
|
||||
|
||||
Send other feedback or questions to [rocm-feedback@amd.com](mailto:rocm-feedback@amd.com?subject=Documentation%20Feedback).
|
||||
|
||||
@@ -6,41 +6,60 @@
|
||||
|
||||
# ROCm documentation toolchain
|
||||
|
||||
The ROCm documentation relies on several open source toolchains and sites.
|
||||
Our documentation relies on several open source toolchains and sites.
|
||||
|
||||
## rocm-docs-core
|
||||
## `rocm-docs-core`
|
||||
|
||||
[rocm-docs-core](https://github.com/ROCm/rocm-docs-core) is an AMD-maintained
|
||||
project that applies customizations for the ROCm documentation. This project is the tool most ROCm repositories use as part of their documentation build pipeline. It is available as a [pip package on PyPI](https://pypi.org/project/rocm-docs-core/).
|
||||
project that applies customization for our documentation. This project is the tool most ROCm
|
||||
repositories use as part of the documentation build. It is also available as a
|
||||
[pip package on PyPI](https://pypi.org/project/rocm-docs-core/).
|
||||
|
||||
See the user and developer guides for rocm-docs-core at
|
||||
{doc}`rocm-docs-core documentation<rocm-docs-core:index>`.
|
||||
|
||||
## Sphinx
|
||||
|
||||
[Sphinx](https://www.sphinx-doc.org/en/master/) is a documentation generator originally used for Python. It is now widely used in the open source community.
|
||||
[Sphinx](https://www.sphinx-doc.org/en/master/) is a documentation generator originally used for
|
||||
Python. It is now widely used in the open source community.
|
||||
|
||||
### Sphinx External ToC
|
||||
|
||||
[Sphinx External ToC](https://sphinx-external-toc.readthedocs.io/en/latest/intro.html) is a Sphinx extension used for ROCm documentation navigation. This tool generates a navigation menu on the left
|
||||
[Sphinx External ToC](https://sphinx-external-toc.readthedocs.io/en/latest/intro.html) is a Sphinx
|
||||
extension used for ROCm documentation navigation. This tool generates a navigation menu on the left
|
||||
based on a YAML file (`_toc.yml.in`) that contains the table of contents.
|
||||
|
||||
### Sphinx-book-theme
|
||||
|
||||
[Sphinx-book-theme](https://sphinx-book-theme.readthedocs.io/en/latest/) is a Sphinx theme that defines the base appearance for ROCm documentation. ROCm documentation applies some customization, such as a custom header and footer, on top of the Sphinx Book Theme.
|
||||
[Sphinx-book-theme](https://sphinx-book-theme.readthedocs.io/en/latest/) is a Sphinx theme that
|
||||
defines the base appearance for ROCm documentation. ROCm documentation applies some
|
||||
customization, such as a custom header and footer on top of the Sphinx Book Theme.
|
||||
|
||||
### Sphinx Design
|
||||
|
||||
[Sphinx design](https://sphinx-design.readthedocs.io/en/latest/index.html) is a Sphinx extension that adds design functionality. ROCm documentation uses Sphinx Design for grids, cards, and synchronized tabs.
|
||||
[Sphinx design](https://sphinx-design.readthedocs.io/en/latest/index.html) is a Sphinx extension that
|
||||
adds design functionality. ROCm documentation uses Sphinx Design for grids, cards, and synchronized
|
||||
tabs.
|
||||
|
||||
## Doxygen
|
||||
|
||||
[Doxygen](https://www.doxygen.nl/) is a documentation generator that extracts information from in-code comments. It is used for API documentation.
|
||||
[Doxygen](https://www.doxygen.nl/) is a documentation generator that extracts information from inline
|
||||
code. ROCm projects typically use Doxygen for public API documentation (unless the upstream project
|
||||
uses a different tool).
|
||||
|
||||
## Breathe
|
||||
|
||||
[Breathe](https://www.breathe-doc.org/) is a Sphinx plugin for integrating Doxygen content.
|
||||
[Breathe](https://www.breathe-doc.org/) is a Sphinx plugin to integrate Doxygen content.
|
||||
|
||||
## MyST
|
||||
|
||||
[Markedly Structured Text (MyST)](https://myst-tools.org/docs/spec) is an extended flavor of
|
||||
Markdown ([CommonMark](https://commonmark.org/)) influenced by reStructuredText (RST) and
|
||||
Sphinx. It's integrated into ROCm documentation by the Sphinx extension
|
||||
[`myst-parser`](https://myst-parser.readthedocs.io/en/latest/).
|
||||
A MyST syntax cheat sheet is available on the [Jupyter reference](https://jupyterbook.org/en/stable/reference/cheatsheet.html) site.
|
||||
|
||||
## Read the Docs
|
||||
|
||||
[Read the Docs](https://docs.readthedocs.io/en/stable/) is the service that builds and hosts the HTML version of the ROCm documentation.
|
||||
[Read the Docs](https://docs.readthedocs.io/en/stable/) is the service that builds and hosts the HTML
|
||||
documentation generated using Sphinx to our end users.
|
||||
|
||||
|
Before Width: | Height: | Size: 114 KiB After Width: | Height: | Size: 81 KiB |
|
Before Width: | Height: | Size: 83 KiB After Width: | Height: | Size: 83 KiB |
|
Before Width: | Height: | Size: 40 KiB |
|
Before Width: | Height: | Size: 39 KiB |
|
Before Width: | Height: | Size: 64 KiB |
|
Before Width: | Height: | Size: 98 KiB |
BIN
docs/data/how-to/gpu-enabled-mpi-1.png
Normal file
|
After Width: | Height: | Size: 13 KiB |
|
Before Width: | Height: | Size: 44 KiB |
|
Before Width: | Height: | Size: 112 KiB |
|
Before Width: | Height: | Size: 188 KiB |
|
Before Width: | Height: | Size: 138 KiB |
|
Before Width: | Height: | Size: 62 KiB |
|
Before Width: | Height: | Size: 27 KiB |
|
Before Width: | Height: | Size: 86 KiB |
|
Before Width: | Height: | Size: 49 KiB |
|
Before Width: | Height: | Size: 28 KiB |
|
Before Width: | Height: | Size: 43 KiB |
|
Before Width: | Height: | Size: 25 KiB |
BIN
docs/data/how-to/magma005.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
|
Before Width: | Height: | Size: 139 KiB |
|
Before Width: | Height: | Size: 30 KiB |
@@ -1,91 +0,0 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
|
||||
rocm_version: 6.4.1
|
||||
vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
|
||||
pytorch_version: 2.7.0+gitf717b2a
|
||||
hipblaslt_version: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
@@ -1,188 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
|
||||
PyTorch: 2.7.0+gitf717b2a
|
||||
hipBLASLt: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 4096
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 65536
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_seq_len_to_capture: 65536
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 131072
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 32768
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_seq_len_to_capture: 16384
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
@@ -1,316 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 790d22168820507f3105fef29596549378cfe399
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_seqs: 1024
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32b
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
@@ -1,316 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_seqs: 1024
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32b
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
@@ -1,159 +0,0 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640
|
||||
rocm_version: 6.3.1
|
||||
vllm_version: 0.7.3
|
||||
pytorch_version: 2.7.0 (dev nightly)
|
||||
hipblaslt_version: 0.13
|
||||
model_groups:
|
||||
- group: Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.2 11B Vision
|
||||
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
|
||||
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- group: JAIS
|
||||
tag: jais
|
||||
models:
|
||||
- model: JAIS 13B
|
||||
mad_tag: pyt_vllm_jais-13b
|
||||
model_repo: core42/jais-13b-chat
|
||||
url: https://huggingface.co/core42/jais-13b-chat
|
||||
precision: float16
|
||||
- model: JAIS 30B
|
||||
mad_tag: pyt_vllm_jais-30b
|
||||
model_repo: core42/jais-30b-chat-v3
|
||||
url: https://huggingface.co/core42/jais-30b-chat-v3
|
||||
precision: float16
|
||||
- group: DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
@@ -1,152 +0,0 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
|
||||
rocm_version: 6.3.1
|
||||
vllm_version: 0.8.3
|
||||
pytorch_version: 2.7.0 (dev nightly)
|
||||
hipblaslt_version: 0.13
|
||||
model_groups:
|
||||
- group: Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.2 11B Vision
|
||||
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
|
||||
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
tunableop: true
|
||||
- group: DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
@@ -1,152 +0,0 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250513
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4
|
||||
rocm_version: 6.3.1
|
||||
vllm_version: 0.8.5
|
||||
pytorch_version: 2.7.0+gitf717b2a
|
||||
hipblaslt_version: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.2 11B Vision
|
||||
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
|
||||
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
tunableop: true
|
||||
- group: Databricks DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Google Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
@@ -1,167 +0,0 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
|
||||
rocm_version: 6.3.1
|
||||
vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
|
||||
pytorch_version: 2.7.0+gitf717b2a
|
||||
hipblaslt_version: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.2 11B Vision
|
||||
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
|
||||
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
tunableop: true
|
||||
- group: Databricks DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Google Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
- group: TII Falcon
|
||||
tag: falcon
|
||||
models:
|
||||
- model: Falcon 180B
|
||||
mad_tag: pyt_vllm_falcon-180b
|
||||
model_repo: tiiuae/falcon-180B
|
||||
url: https://huggingface.co/tiiuae/falcon-180B
|
||||
precision: float16
|
||||
@@ -1,162 +0,0 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
|
||||
rocm_version: 6.4.1
|
||||
vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
|
||||
pytorch_version: 2.7.0+gitf717b2a
|
||||
hipblaslt_version: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
tunableop: true
|
||||
- group: Databricks DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Google Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
- group: TII Falcon
|
||||
tag: falcon
|
||||
models:
|
||||
- model: Falcon 180B
|
||||
mad_tag: pyt_vllm_falcon-180b
|
||||
model_repo: tiiuae/falcon-180B
|
||||
url: https://huggingface.co/tiiuae/falcon-180B
|
||||
precision: float16
|
||||
@@ -1,163 +0,0 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
# TODO: update me
|
||||
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
|
||||
rocm_version: 6.4.1
|
||||
vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
|
||||
pytorch_version: 2.7.0+gitf717b2a
|
||||
hipblaslt_version: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
tunableop: true
|
||||
- group: Databricks DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Google Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
- group: TII Falcon
|
||||
tag: falcon
|
||||
models:
|
||||
- model: Falcon 180B
|
||||
mad_tag: pyt_vllm_falcon-180b
|
||||
model_repo: tiiuae/falcon-180B
|
||||
url: https://huggingface.co/tiiuae/falcon-180B
|
||||
precision: float16
|
||||
@@ -1,163 +0,0 @@
|
||||
vllm_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
# TODO: update me
|
||||
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
|
||||
rocm_version: 6.4.1
|
||||
vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
|
||||
pytorch_version: 2.7.0+gitf717b2a
|
||||
hipblaslt_version: 0.15
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_vllm_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_vllm_llama-2-7b
|
||||
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
- model: Mistral 7B
|
||||
mad_tag: pyt_vllm_mistral-7b
|
||||
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||
precision: float16
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- model: Mistral 7B FP8
|
||||
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||
precision: float8
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen2 7B
|
||||
mad_tag: pyt_vllm_qwen2-7b
|
||||
model_repo: Qwen/Qwen2-7B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||
precision: float16
|
||||
- model: Qwen2 72B
|
||||
mad_tag: pyt_vllm_qwen2-72b
|
||||
model_repo: Qwen/Qwen2-72B-Instruct
|
||||
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||
precision: float16
|
||||
- model: QwQ-32B
|
||||
mad_tag: pyt_vllm_qwq-32b
|
||||
model_repo: Qwen/QwQ-32B
|
||||
url: https://huggingface.co/Qwen/QwQ-32B
|
||||
precision: float16
|
||||
tunableop: true
|
||||
- group: Databricks DBRX
|
||||
tag: dbrx
|
||||
models:
|
||||
- model: DBRX Instruct
|
||||
mad_tag: pyt_vllm_dbrx-instruct
|
||||
model_repo: databricks/dbrx-instruct
|
||||
url: https://huggingface.co/databricks/dbrx-instruct
|
||||
precision: float16
|
||||
- model: DBRX Instruct FP8
|
||||
mad_tag: pyt_vllm_dbrx_fp8
|
||||
model_repo: amd/dbrx-instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||
precision: float8
|
||||
- group: Google Gemma
|
||||
tag: gemma
|
||||
models:
|
||||
- model: Gemma 2 27B
|
||||
mad_tag: pyt_vllm_gemma-2-27b
|
||||
model_repo: google/gemma-2-27b
|
||||
url: https://huggingface.co/google/gemma-2-27b
|
||||
precision: float16
|
||||
- group: Cohere
|
||||
tag: cohere
|
||||
models:
|
||||
- model: C4AI Command R+ 08-2024
|
||||
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||
precision: float16
|
||||
- model: C4AI Command R+ 08-2024 FP8
|
||||
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||
precision: float8
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek MoE 16B
|
||||
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||
precision: float16
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
- group: TII Falcon
|
||||
tag: falcon
|
||||
models:
|
||||
- model: Falcon 180B
|
||||
mad_tag: pyt_vllm_falcon-180b
|
||||
model_repo: tiiuae/falcon-180B
|
||||
url: https://huggingface.co/tiiuae/falcon-180B
|
||||
precision: float16
|
||||
@@ -1,55 +0,0 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e
|
||||
ROCm: 7.9.0
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
@@ -1,109 +0,0 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
- version: v25-11
|
||||
pull_tag: rocm/pytorch-xdit:v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216
|
||||
ROCm: 7.10.0
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
models:
|
||||
- Hunyuan Video
|
||||
- group: Wan-AI
|
||||
models:
|
||||
- Wan2.1
|
||||
- Wan2.2
|
||||
- group: FLUX
|
||||
models:
|
||||
- FLUX.1
|
||||
whats_new:
|
||||
- "Minor bug fixes and clarifications to READMEs."
|
||||
- "Bumps TheRock, AITER, Diffusers, xDiT versions."
|
||||
- "Changes Aiter rounding mode for faster gfx942 FWD Attention."
|
||||
components:
|
||||
TheRock: 3e3f834
|
||||
rccl: d23d18f
|
||||
composable_kernel: 2570462
|
||||
rocm-libraries: 0588f07
|
||||
rocm-systems: 473025a
|
||||
torch: 73adac
|
||||
torchvision: f5c6c2e
|
||||
triton: 7416ffc
|
||||
accelerate: 34c1779
|
||||
aiter: de14bec
|
||||
diffusers: 40528e9
|
||||
xfuser: 83978b5
|
||||
yunchang: 2c9b712
|
||||
|
||||
- version: v25-10
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
|
||||
ROCm: 7.9.0
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
models:
|
||||
- Hunyuan Video
|
||||
- group: Wan-AI
|
||||
models:
|
||||
- Wan2.1
|
||||
- Wan2.2
|
||||
- group: FLUX
|
||||
models:
|
||||
- FLUX.1
|
||||
whats_new:
|
||||
- "First official xDiT Docker Release for Diffusion Inference."
|
||||
- "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
|
||||
- "Support Wan 2.1, Wan 2.2, HunyuanVideo and Flux workloads."
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
page_tag: hunyuan_tag
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
page_tag: wan_21_tag
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
page_tag: wan_22_tag
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
page_tag: flux_1_tag
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
@@ -1,91 +0,0 @@
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.12
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256
|
||||
ROCm: 7.10.0
|
||||
whats_new:
|
||||
- "Adds T2V and TI2V support for Wan models."
|
||||
- "Adds support for SD-3.5 T2I model."
|
||||
components:
|
||||
TheRock:
|
||||
version: 3e3f834
|
||||
url: https://github.com/ROCm/TheRock
|
||||
rccl:
|
||||
version: d23d18f
|
||||
url: https://github.com/ROCm/rccl
|
||||
composable_kernel:
|
||||
version: 2570462
|
||||
url: https://github.com/ROCm/composable_kernel
|
||||
rocm-libraries:
|
||||
version: 0588f07
|
||||
url: https://github.com/ROCm/rocm-libraries
|
||||
rocm-systems:
|
||||
version: 473025a
|
||||
url: https://github.com/ROCm/rocm-systems
|
||||
torch:
|
||||
version: 73adac
|
||||
url: https://github.com/pytorch/pytorch
|
||||
torchvision:
|
||||
version: f5c6c2e
|
||||
url: https://github.com/pytorch/vision
|
||||
triton:
|
||||
version: 7416ffc
|
||||
url: https://github.com/triton-lang/triton
|
||||
accelerate:
|
||||
version: 34c1779
|
||||
url: https://github.com/huggingface/accelerate
|
||||
aiter:
|
||||
version: de14bec
|
||||
url: https://github.com/ROCm/aiter
|
||||
diffusers:
|
||||
version: 40528e9
|
||||
url: https://github.com/huggingface/diffusers
|
||||
xfuser:
|
||||
version: ccba9d5
|
||||
url: https://github.com/xdit-project/xDiT
|
||||
yunchang:
|
||||
version: 2c9b712
|
||||
url: https://github.com/feifeibear/long-context-attention
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
js_tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
js_tag: hunyuan_tag
|
||||
- group: Wan-AI
|
||||
js_tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
js_tag: wan_21_tag
|
||||
- model: Wan2.2
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
js_tag: wan_22_tag
|
||||
- group: FLUX
|
||||
js_tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
js_tag: flux_1_tag
|
||||
- group: Stable Diffusion
|
||||
js_tag: stablediffusion
|
||||
models:
|
||||
- model: stable-diffusion-3.5-large
|
||||
model_repo: stabilityai/stable-diffusion-3.5-large
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
|
||||
github: https://github.com/Stability-AI/sd3.5
|
||||
mad_tag: pyt_xdit_sd_3_5
|
||||
js_tag: stable_diffusion_3_5_large_tag
|
||||
@@ -1,57 +0,0 @@
|
||||
pytorch_inference_benchmark:
|
||||
unified_docker:
|
||||
latest:
|
||||
pull_tag: rocm/pytorch:latest
|
||||
docker_hub_url:
|
||||
rocm_version:
|
||||
pytorch_version:
|
||||
hipblaslt_version:
|
||||
model_groups:
|
||||
- group: CLIP
|
||||
tag: clip
|
||||
models:
|
||||
- model: CLIP
|
||||
mad_tag: pyt_clip_inference
|
||||
model_repo: laion/CLIP-ViT-B-32-laion2B-s34B-b79K
|
||||
url: https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K
|
||||
precision: float16
|
||||
- group: Chai-1
|
||||
tag: chai
|
||||
models:
|
||||
- model: Chai-1
|
||||
mad_tag: pyt_chai1_inference
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/chaidiscovery/chai-1
|
||||
precision: float16
|
||||
- group: Mochi Video
|
||||
tag: mochi
|
||||
models:
|
||||
- model: Mochi 1
|
||||
mad_tag: pyt_mochi_video_inference
|
||||
model_repo: genmo/mochi-1-preview
|
||||
url: https://huggingface.co/genmo/mochi-1-preview
|
||||
precision: float16
|
||||
- group: Wan2.1
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
mad_tag: pyt_wan2.1_inference
|
||||
model_repo: Wan-AI/Wan2.1-T2V-14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
|
||||
precision: bfloat16
|
||||
- group: Janus Pro
|
||||
tag: janus-pro
|
||||
models:
|
||||
- model: Janus Pro 7B
|
||||
mad_tag: pyt_janus_pro_inference
|
||||
model_repo: deepseek-ai/Janus-Pro-7B
|
||||
url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
|
||||
precision: bfloat16
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
mad_tag: pyt_hy_video
|
||||
model_repo: tencent/HunyuanVideo
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
precision: float16
|
||||
@@ -1,16 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: lmsysorg/sglang:v0.4.5-rocm630
|
||||
docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
|
||||
components:
|
||||
ROCm: 6.3.0
|
||||
SGLang: 0.4.5 (0.4.5-rocm)
|
||||
PyTorch: 2.6.0a0+git8d4926e
|
||||
model_groups:
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-R1-Distill-Qwen-32B
|
||||
mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
|
||||
model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
|
||||
precision: bfloat16
|
||||
@@ -1,32 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
|
||||
docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
SGLang: v0.5.2rc1
|
||||
pytorch-triton-rocm: 3.4.0+rocm7.0.0.gitf9e5bf54
|
||||
model_groups:
|
||||
- group: Dense models
|
||||
tag: dense-models
|
||||
models:
|
||||
- model: Llama 3.1 8B Instruct
|
||||
model_repo: Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
|
||||
- model: Llama 3.1 405B FP8 KV
|
||||
model_repo: Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
- model: Llama 3.3 70B FP8 KV
|
||||
model_repo: amd-Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
- model: Qwen3 32B
|
||||
model_repo: Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
- group: Small experts models
|
||||
tag: small-experts-models
|
||||
models:
|
||||
- model: DeepSeek V3
|
||||
model_repo: DeepSeek-V3
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V3
|
||||
- model: Mixtral 8x7B v0.1
|
||||
model_repo: Mixtral-8x7B-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
|
||||
@@ -1,625 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.2_20251210/images/sha256-e7f02dd2ce3824959658bc0391296f6158638e3ebce164f6c019c4eca8150ec7
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.11.2 (0.11.2.dev673+g839868462.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 8398684622109c806a35d660647060b0b9910663
|
||||
|
||||
configs:
|
||||
default:
|
||||
## DeepSeek AITER MLA currently only supports --block-size 1
|
||||
- &deepseek-r1-serving
|
||||
benchmark: serving
|
||||
model: deepseek-ai/DeepSeek-R1-0528
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1 8 32 128
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
block-size: 1
|
||||
## gpt-oss requires AITER unified attention and performs best with block-size 64 and FULL_AND_PIECEWISE cudagraph mode
|
||||
- &gpt-oss-120b-serving
|
||||
benchmark: serving
|
||||
model: openai/gpt-oss-120b
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1 8 32 128
|
||||
env:
|
||||
VLLM_ROCM_USE_AITER_MHA: 0
|
||||
VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: 1
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
block-size: 64
|
||||
compilation-config: '{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\"}'
|
||||
- &llama-3-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
meta-llama/Llama-3.1-405B-Instruct
|
||||
amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
meta-llama/Llama-3.3-70B-Instruct
|
||||
amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1 8 32 128
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
## Llama 3.x MXFP4 (gfx950 only)
|
||||
- &llama-3-mxfp4-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1 8 32 128
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
## Llama 4 currently does not support full cudagraph or attn fusion
|
||||
- &llama-4-fp8-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1 8 32 128
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
compilation-config: '{\"cudagraph_mode\":\"PIECEWISE\",\"pass_config\":{\"enable_attn_fusion\":false}}'
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
- &mixtral-8x22b-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1 8 32 128
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
extended:
|
||||
## gpt-oss requires AITER unified attention and performs best with block-size 64 and FULL_AND_PIECEWISE cudagraph mode
|
||||
- &gpt-oss-20b-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
openai/gpt-oss-20b
|
||||
tp: 1
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1
|
||||
env:
|
||||
VLLM_ROCM_USE_AITER_MHA: 0
|
||||
VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: 1
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
block-size: 64
|
||||
compilation-config: '{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\"}'
|
||||
- &llama-3-8b-phi-4-qwen3-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
meta-llama/Llama-3.1-8B-Instruct
|
||||
amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
microsoft/phi-4
|
||||
Qwen/Qwen3-8B
|
||||
Qwen/Qwen3-32B
|
||||
Qwen/Qwen3-30B-A3B-Thinking-2507
|
||||
Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
|
||||
tp: 1
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
|
||||
- &llama-2-70b-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
meta-llama/Llama-2-70b-chat-hf
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
## Llama 4 currently does not support full cudagraph or attn fusion
|
||||
- &llama-4-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
compilation-config: '{\"cudagraph_mode\":\"PIECEWISE\",\"pass_config\":{\"enable_attn_fusion\":false}}'
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
|
||||
- &mixtral-8x7b-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
## Qwen 235B requires --enable-expert-parallel with tp 8
|
||||
- &qwen3-235b-a22b-serving
|
||||
benchmark: serving
|
||||
model:
|
||||
Qwen/Qwen3-235B-A22B-Thinking-2507
|
||||
Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
|
||||
tp: 8
|
||||
inp: 1024
|
||||
out: 1024
|
||||
dtype: auto
|
||||
max_concurrency: 1
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
enable-expert-parallel: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
accuracy:
|
||||
## DeepSeek AITER MLA currently only supports --block-size 1
|
||||
- &deepseek-r1-accuracy
|
||||
benchmark: accuracy
|
||||
model: deepseek-ai/DeepSeek-R1-0528
|
||||
tp: 8
|
||||
dtype: auto
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
block-size: 1
|
||||
bench_args:
|
||||
apply_chat_template: True
|
||||
## gpt-oss requires AITER unified attention and performs best with block-size 64 and FULL_AND_PIECEWISE cudagraph mode
|
||||
- &gpt-oss-120b-accuracy
|
||||
benchmark: accuracy
|
||||
model: openai/gpt-oss-120b
|
||||
tp: 8
|
||||
dtype: auto
|
||||
env:
|
||||
VLLM_ROCM_USE_AITER_MHA: 0
|
||||
VLLM_USE_AITER_UNIFIED_ATTENTION: 1
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
block-size: 64
|
||||
compilation-config: '{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\"}'
|
||||
bench_args:
|
||||
apply_chat_template: True
|
||||
## Llama 3.x bf16 and fp8 perform better with --dtype float16 on gfx942
|
||||
- &llama-3-accuracy
|
||||
benchmark: accuracy
|
||||
model:
|
||||
meta-llama/Llama-3.1-405B-Instruct
|
||||
amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
meta-llama/Llama-3.3-70B-Instruct
|
||||
amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
tp: 8
|
||||
dtype: auto
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
bench_args:
|
||||
apply_chat_template: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
## Llama 3.x MXFP4 (gfx950 only)
|
||||
- &llama-3-mxfp4-accuracy
|
||||
benchmark: accuracy
|
||||
model:
|
||||
amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
tp: 8
|
||||
dtype: auto
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
bench_args:
|
||||
apply_chat_template: True
|
||||
## Llama 4 currently does not support full cudagraph or attn fusion
|
||||
- &llama-4-fp8-accuracy
|
||||
benchmark: accuracy
|
||||
model:
|
||||
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
tp: 8
|
||||
dtype: auto
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
compilation-config: '{\"cudagraph_mode\":\"PIECEWISE\",\"pass_config\":{\"enable_attn_fusion\":false}}'
|
||||
bench_args:
|
||||
apply_chat_template: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
## Mistral models require --tokenizer-mode mistral for correct decoding
|
||||
- &mixtral-8x22b-accuracy
|
||||
benchmark: accuracy
|
||||
model:
|
||||
mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
tp: 8
|
||||
dtype: auto
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
bench_args:
|
||||
apply_chat_template: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
## Qwen 235B requires --enable-expert-parallel with tp 8
|
||||
- &qwen3-235b-a22b-accuracy
|
||||
benchmark: accuracy
|
||||
model:
|
||||
Qwen/Qwen3-235B-A22B-Thinking-2507
|
||||
Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
|
||||
dtype: auto
|
||||
extra_args:
|
||||
async-scheduling: True
|
||||
enable-expert-parallel: True
|
||||
bench_args:
|
||||
apply_chat_template: True
|
||||
arch_overrides:
|
||||
gfx942:
|
||||
dtype: float16
|
||||
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-2-70b-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-3-8b-phi-4-qwen3-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
serving: *llama-3-8b-phi-4-qwen3-serving
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-3-serving
|
||||
accuracy: *llama-3-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
serving: *llama-3-serving
|
||||
accuracy: *llama-3-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
serving: *llama-3-mxfp4-serving
|
||||
accuracy: *llama-3-mxfp4-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-3-serving
|
||||
accuracy: *llama-3-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
serving: *llama-3-serving
|
||||
accuracy: *llama-3-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
serving: *llama-3-mxfp4-serving
|
||||
accuracy: *llama-3-mxfp4-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-4-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-4-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
serving: *llama-4-fp8-serving
|
||||
accuracy: *llama-4-fp8-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
serving: *deepseek-r1-serving
|
||||
accuracy: *deepseek-r1-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
serving: *gpt-oss-20b-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
serving: *gpt-oss-120b-serving
|
||||
accuracy: *gpt-oss-120b-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
serving: *mixtral-8x7b-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
serving: *mixtral-8x7b-serving
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
serving: *mixtral-8x22b-serving
|
||||
accuracy: *mixtral-8x22b-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
serving: *mixtral-8x22b-serving
|
||||
accuracy: *mixtral-8x22b-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-3-8b-phi-4-qwen3-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-3-8b-phi-4-qwen3-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B Thinking
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B-Thinking-2507
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-3-8b-phi-4-qwen3-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B Thinking FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-3-8b-phi-4-qwen3-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B Thinking
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B-Thinking-2507
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507
|
||||
precision: float16
|
||||
config:
|
||||
serving: *qwen3-235b-a22b-serving
|
||||
accuracy: *qwen3-235b-a22b-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B Thinking FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
|
||||
precision: float8
|
||||
config:
|
||||
serving: *qwen3-235b-a22b-serving
|
||||
accuracy: *qwen3-235b-a22b-accuracy
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
serving: *llama-3-8b-phi-4-qwen3-serving
|
||||
ex:
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
@@ -1,105 +0,0 @@
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.13
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
|
||||
ROCm: 7.11.0
|
||||
whats_new:
|
||||
- "Flux.1 Kontext support"
|
||||
- "Flux.2 Dev support"
|
||||
- "Flux FP8 GEMM support"
|
||||
- "Hybrid FP8 attention support for Wan models"
|
||||
components:
|
||||
TheRock:
|
||||
version: 1728a81
|
||||
url: https://github.com/ROCm/TheRock
|
||||
rccl:
|
||||
version: d23d18f
|
||||
url: https://github.com/ROCm/rccl
|
||||
composable_kernel:
|
||||
version: ab0101c
|
||||
url: https://github.com/ROCm/composable_kernel
|
||||
rocm-libraries:
|
||||
version: a2f7c35
|
||||
url: https://github.com/ROCm/rocm-libraries
|
||||
rocm-systems:
|
||||
version: 659737c
|
||||
url: https://github.com/ROCm/rocm-systems
|
||||
torch:
|
||||
version: 91be249
|
||||
url: https://github.com/ROCm/pytorch
|
||||
torchvision:
|
||||
version: b919bd0
|
||||
url: https://github.com/pytorch/vision
|
||||
triton:
|
||||
version: a272dfa
|
||||
url: https://github.com/ROCm/triton
|
||||
accelerate:
|
||||
version: b521400f
|
||||
url: https://github.com/huggingface/accelerate
|
||||
aiter:
|
||||
version: de14bec0
|
||||
url: https://github.com/ROCm/aiter
|
||||
diffusers:
|
||||
version: a1f36ee3e
|
||||
url: https://github.com/huggingface/diffusers
|
||||
xfuser:
|
||||
version: adf2681
|
||||
url: https://github.com/xdit-project/xDiT
|
||||
yunchang:
|
||||
version: 2c9b712
|
||||
url: https://github.com/feifeibear/long-context-attention
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
js_tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
js_tag: hunyuan_tag
|
||||
- group: Wan-AI
|
||||
js_tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
js_tag: wan_21_tag
|
||||
- model: Wan2.2
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
js_tag: wan_22_tag
|
||||
- group: FLUX
|
||||
js_tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
js_tag: flux_1_tag
|
||||
- model: FLUX.1 Kontext
|
||||
model_repo: black-forest-labs/FLUX.1-Kontext-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux_kontext
|
||||
js_tag: flux_1_kontext_tag
|
||||
- model: FLUX.2
|
||||
model_repo: black-forest-labs/FLUX.2-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.2-dev
|
||||
github: https://github.com/black-forest-labs/flux2
|
||||
mad_tag: pyt_xdit_flux_2
|
||||
js_tag: flux_2_tag
|
||||
- group: StableDiffusion
|
||||
js_tag: stablediffusion
|
||||
models:
|
||||
- model: stable-diffusion-3.5-large
|
||||
model_repo: stabilityai/stable-diffusion-3.5-large
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
|
||||
github: https://github.com/Stability-AI/sd3.5
|
||||
mad_tag: pyt_xdit_sd_3_5
|
||||
js_tag: stable_diffusion_3_5_large_tag
|
||||
|
Before Width: | Height: | Size: 36 KiB |
|
Before Width: | Height: | Size: 35 KiB |
|
Before Width: | Height: | Size: 242 KiB |
|
Before Width: | Height: | Size: 155 KiB |
|
Before Width: | Height: | Size: 242 KiB |
@@ -1,88 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v26.1
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v26.1/images/sha256-901083bde353fe6362ada3036e452c792b2c96124e5900f4e9b5946c02ff9d6a
|
||||
components:
|
||||
ROCm: 7.1.1
|
||||
JAX: 0.8.2
|
||||
Python: 3.12
|
||||
Transformer Engine: 2.8.0.dev0+aec00a7f
|
||||
hipBLASLt: 1.2.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 7B
|
||||
mad_tag: jax_maxtext_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
precision: bf16
|
||||
multinode_config:
|
||||
gfx950: env_scripts/gfx950_llama2_7b.yml
|
||||
gfx942: env_scripts/llama2_7b.yml
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: jax_maxtext_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
precision: bf16
|
||||
multinode_config:
|
||||
gfx950: env_scripts/gfx950_llama2_70b.yml
|
||||
gfx942: env_scripts/llama2_70b.yml
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 3 8B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_config:
|
||||
gfx950: env_scripts/gfx950_llama3_8b.yml
|
||||
gfx942: env_scripts/llama3_8b.yml
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_config:
|
||||
gfx950: env_scripts/gfx950_llama3_70b.yml
|
||||
gfx942: env_scripts/llama3_70b.yml
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 405B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3.1-405b
|
||||
model_repo: Llama-3.1-405B
|
||||
precision: bf16
|
||||
multinode_config:
|
||||
gfx950: env_scripts/gfx950_llama3_405b.yml
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
multinode_config:
|
||||
gfx950: env_scripts/gfx950_llama3.3_70b.yml
|
||||
gfx942: env_scripts/llama3.3_70b.yml
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V2-Lite (16B)
|
||||
mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
|
||||
model_repo: DeepSeek-V2-lite
|
||||
precision: bf16
|
||||
multinode_config:
|
||||
gfx950: env_scripts/gfx950_deepseek2_16b.yml
|
||||
gfx942: env_scripts/deepseek2_16b.yml
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: jax_maxtext_train_mixtral-8x7b
|
||||
model_repo: Mixtral-8x7B
|
||||
precision: bf16
|
||||
multinode_config:
|
||||
gfx950: env_scripts/gfx950_mixtral_8x7b.yml
|
||||
gfx942: env_scripts/llama3_8x7b.yml
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
@@ -1,47 +0,0 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v26.1
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.6.0.dev0+f141f34b
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 34459f66ea
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||
@@ -1,64 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.11/images/sha256-18e4d8f0b8ce7a7422c58046940dd5f32249960449fca09a562b65fb8eb1562a
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
JAX: 0.7.1
|
||||
Python: 3.12
|
||||
Transformer Engine: 2.4.0.dev0+281042de
|
||||
hipBLASLt: 1.2.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 7B
|
||||
mad_tag: jax_maxtext_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_7b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: jax_maxtext_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_70b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 3 8B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_training_script: llama3_8b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_training_script: llama3_70b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V2-Lite (16B)
|
||||
mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
|
||||
model_repo: DeepSeek-V2-lite
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: jax_maxtext_train_mixtral-8x7b
|
||||
model_repo: Mixtral-8x7B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
@@ -1,72 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7-jax060
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
JAX: 0.6.0
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.1.0-499ece1c21
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
JAX: 0.5.0
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.x.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_training_script: llama3_8b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_training_script: llama3_70b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: jax_maxtext_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_7b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: jax_maxtext_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_70b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V2-Lite (16B)
|
||||
mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
|
||||
model_repo: DeepSeek-V2-lite
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: jax_maxtext_train_mixtral-8x7b
|
||||
model_repo: Mixtral-8x7B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
@@ -1,64 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.9.1
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9.1/images/sha256-60946cfbd470f6ee361fc9da740233a4fb2e892727f01719145b1f7627a1cff6
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
JAX: 0.6.2
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+c91bac54
|
||||
hipBLASLt: 1.x.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 7B
|
||||
mad_tag: jax_maxtext_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_7b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: jax_maxtext_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_70b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 3 8B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_training_script: llama3_8b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_training_script: llama3_70b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V2-Lite (16B)
|
||||
mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
|
||||
model_repo: DeepSeek-V2-lite
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: jax_maxtext_train_mixtral-8x7b
|
||||
model_repo: Mixtral-8x7B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
@@ -1,49 +0,0 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||