Merge pull request #612 from ROCm/sync-develop-from-external

Sync develop from external for 7.1.0 GA
This commit is contained in:
alexxu-amd
2025-10-29 17:13:01 -04:00
committed by GitHub
29 changed files with 1832 additions and 810 deletions

View File

@@ -130,7 +130,7 @@ jobs:
parameters:
componentName: hipTensor
testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
testParameters: '-E ".*-extended" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -1,10 +1,35 @@
parameters:
- name: componentName
type: string
default: rccl
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: systemsRepo
type: string
default: systems_repo
- name: systemsSparseCheckoutDir
type: string
default: 'projects/rocprofiler-sdk'
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -57,19 +82,28 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
- rocprofiler-sdk:
name: rocprofiler-sdk
sparseCheckoutDir: ''
skipUnifiedBuild: 'false'
buildDependsOn:
- rccl_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rccl_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
variables:
- group: common
@@ -77,17 +111,23 @@ jobs:
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
submoduleBehaviour: recursive
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
@@ -97,10 +137,14 @@ jobs:
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
@@ -112,58 +156,87 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
installLatestCMake: true
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
installLatestCMake: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rccl_test_${{ job.target }}
timeoutInMinutes: 120
dependsOn: rccl_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rccl
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rccl-UnitTests'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rccl-UnitTests'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.systemsRepo }}
sparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
${{ if parameters.unifiedBuild }}:
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
${{ else }}:
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}

View File

@@ -81,7 +81,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm-cmake
testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
testParameters: '-E "pass-version-parent" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:

View File

@@ -21,13 +21,19 @@ parameters:
- libtbb-dev
- libtiff-dev
- libva-amdgpu-dev
- libavcodec-dev
- libavformat-dev
- libavutil-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
- AMDMIGraphX
- aomp
- aomp-extras
- clr
- composable_kernel
- hipBLAS
- hipBLAS-common
- hipBLASLt
@@ -40,7 +46,10 @@ parameters:
- hipTensor
- llvm-project
- MIOpen
- MIVisionX
- rocALUTION
- rocBLAS
- rocDecode
- rocFFT
- rocJPEG
- rocPRIM
@@ -57,7 +66,10 @@ parameters:
type: object
default:
- AMDMIGraphX
- aomp
- aomp-extras
- clr
- composable_kernel
- hipBLAS
- hipBLAS-common
- hipBLASLt
@@ -70,7 +82,10 @@ parameters:
- hipTensor
- llvm-project
- MIOpen
- MIVisionX
- rocALUTION
- rocBLAS
- rocDecode
- rocFFT
- rocminfo
- rocPRIM

View File

@@ -79,27 +79,27 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocprofiler_sdk_build_${{ job.target }}
- job: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.target }}
- ${{ build }}_${{ job.os}}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -107,6 +107,7 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -118,6 +119,7 @@ jobs:
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
@@ -132,6 +134,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCPROFILER_BUILD_TESTS=ON
@@ -143,6 +146,7 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
@@ -158,8 +162,8 @@ jobs:
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocprofiler_sdk_test_${{ job.target }}
dependsOn: rocprofiler_sdk_build_${{ job.target }}
- job: rocprofiler_sdk_test_${{ job.os }}_${{ job.target }}
dependsOn: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -177,6 +181,7 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -188,6 +193,7 @@ jobs:
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -202,6 +208,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCPROFILER_BUILD_TESTS=ON
@@ -213,7 +220,8 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: $(Agent.BuildDirectory)/s/build
os: ${{ job.os }}
testDir: $(Agent.BuildDirectory)/build
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -13,7 +13,7 @@ parameters:
default: ctest
- name: testParameters
type: string
default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
default: --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml
- name: extraTestParameters
type: string
default: ''

View File

@@ -27,6 +27,7 @@ ASICs
ASan
ASAN
ASm
Async
ATI
atomicRMW
AddressSanitizer
@@ -34,6 +35,7 @@ AlexNet
Andrej
Arb
Autocast
autograd
BARs
BatchNorm
BLAS
@@ -86,9 +88,11 @@ Conda
ConnectX
CountOnes
CuPy
customizable
da
Dashboarding
Dataloading
dataflows
DBRX
DDR
DF
@@ -130,6 +134,7 @@ ELMo
ENDPGM
EPYC
ESXi
EP
EoS
etcd
fas
@@ -181,8 +186,9 @@ GPR
GPT
GPU
GPU's
GPUDirect
GPUs
Graphbolt
GraphBolt
GraphSage
GRBM
GRE
@@ -212,6 +218,7 @@ Haswell
Higgs
href
Hyperparameters
HybridEngine
Huggingface
IB
ICD
@@ -298,6 +305,7 @@ Makefiles
Matplotlib
Matrox
MaxText
MBT
Megablocks
Megatrends
Megatron
@@ -307,6 +315,7 @@ Meta's
Miniconda
MirroredStrategy
Mixtral
MLA
MosaicML
MoEs
Mooncake
@@ -349,6 +358,7 @@ OFED
OMM
OMP
OMPI
OOM
OMPT
OMPX
ONNX
@@ -394,6 +404,7 @@ Profiler's
PyPi
Pytest
PyTorch
QPS
Qcycles
Qwen
RAII
@@ -669,6 +680,7 @@ denoised
denoises
denormalize
dequantization
dequantized
dequantizes
deserializers
detections
@@ -784,6 +796,7 @@ linalg
linearized
linter
linux
llm
llvm
lm
localscratch
@@ -834,6 +847,7 @@ passthrough
pe
perfcounter
performant
piecewise
perl
pragma
pre
@@ -980,6 +994,7 @@ tokenizer
tokenizes
toolchain
toolchains
topk
toolset
toolsets
torchtitan
@@ -1007,6 +1022,7 @@ USM
UTCL
UTIL
utils
UX
vL
variational
vdi

View File

@@ -2,7 +2,7 @@
.. meta::
:description: Deep Graph Library (DGL) compatibility
:keywords: GPU, DGL compatibility
:keywords: GPU, CPU, deep graph library, DGL, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -10,24 +10,42 @@
DGL compatibility
********************************************************************************
Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance and scalable
Deep Graph Library (`DGL <https://www.dgl.ai/>`__) is an easy-to-use, high-performance, and scalable
Python package for deep learning on graphs. DGL is framework agnostic, meaning
if a deep graph model is a component in an end-to-end application, the rest of
that if a deep graph model is a component in an end-to-end application, the rest of
the logic is implemented using PyTorch.
* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository.
* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
to install and get started.
DGL provides a high-performance graph object that can reside on either CPUs or GPUs.
It bundles structural data features for better control and provides a variety of functions
for computing with graph objects, including efficient and customizable message passing
primitives for Graph Neural Networks.
Supported devices
Support overview
================================================================================
- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipblaslt)
- **Partially Supported**: TF32 with AMD Instinct MI250X
- The ROCm-supported version of DGL is maintained in the official `https://github.com/ROCm/dgl
<https://github.com/ROCm/dgl>`__ repository, which differs from the
`https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`__ upstream repository.
- To get started and install DGL on ROCm, use the prebuilt :ref:`Docker images <dgl-docker-compat>`,
which include ROCm, DGL, and all required dependencies.
- See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
for installation and setup instructions.
- You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__
for additional context.
Version support
--------------------------------------------------------------------------------
DGL is supported on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
Supported devices
--------------------------------------------------------------------------------
- **Officially Supported**: AMD Instinct™ MI300X (through `hipBLASlt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`__)
- **Partially Supported**: AMD Instinct™ MI250X
.. _dgl-recommendations:
@@ -35,7 +53,7 @@ Use cases and recommendations
================================================================================
DGL can be used for Graph Learning, and building popular graph models like
GAT, GCN and GraphSage. Using these we can support a variety of use-cases such as:
GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:
- Recommender systems
- Network Optimization and Analysis
@@ -62,16 +80,17 @@ Docker image compatibility
<i class="fab fa-docker"></i>
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl/tags>`__
with ROCm backends on Docker Hub. The following Docker image tags and associated
inventories represent the latest available DGL version from the official Docker Hub.
Click the |docker-icon| to view the image on Docker Hub.
.. list-table:: DGL Docker image components
:header-rows: 1
:class: docker-image-compatibility
* - Docker
* - Docker image
- ROCm
- DGL
- PyTorch
- Ubuntu
@@ -81,102 +100,106 @@ Click the |docker-icon| to view the image on Docker Hub.
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
- 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
- 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
Key ROCm libraries for DGL
================================================================================
DGL on ROCm depends on specific libraries that affect its features and performance.
Using the DGL Docker container or building it with the provided docker file or a ROCm base image is recommended.
Using the DGL Docker container or building it with the provided Docker file or a ROCm base image is recommended.
If you prefer to build it yourself, ensure the following dependencies are installed:
.. list-table::
:header-rows: 1
* - ROCm library
- Version
- ROCm 6.4.0 Version
- Purpose
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
- :version-ref:`"Composable Kernel" rocm_version`
- 1.1.0
- Enables faster execution of core operations like matrix multiplication
(GEMM), convolutions and transformations.
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
- :version-ref:`hipBLAS rocm_version`
- 2.4.0
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
- :version-ref:`hipBLASLt rocm_version`
- 0.12.0
- hipBLASLt is an extension of the hipBLAS library, providing additional
features like epilogues fused into the matrix multiplication kernel or
use of integer tensor cores.
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
- :version-ref:`hipCUB rocm_version`
- 3.4.0
- Provides a C++ template library for parallel algorithms for reduction,
scan, sort and select.
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
- :version-ref:`hipFFT rocm_version`
- 1.0.18
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
- :version-ref:`hipRAND rocm_version`
- 2.12.0
- Provides fast random number generation for GPUs.
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
- :version-ref:`hipSOLVER rocm_version`
- 2.4.0
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
singular value decompositions (SVD).
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
- :version-ref:`hipSPARSE rocm_version`
- 3.2.0
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
- :version-ref:`hipSPARSELt rocm_version`
- 0.2.3
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
- :version-ref:`hipTensor rocm_version`
- 1.5.0
- Optimizes for high-performance tensor operations, such as contractions.
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
- :version-ref:`MIOpen rocm_version`
- 3.4.0
- Optimizes deep learning primitives such as convolutions, pooling,
normalization, and activation functions.
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
- :version-ref:`MIGraphX rocm_version`
- 2.12.0
- Adds graph-level optimizations, ONNX models and mixed precision support
and enable Ahead-of-Time (AOT) Compilation.
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
- :version-ref:`MIVisionX rocm_version`
- 3.2.0
- Optimizes acceleration for computer vision and AI workloads like
preprocessing, augmentation, and inferencing.
* - `rocAL <https://github.com/ROCm/rocAL>`_
@@ -184,25 +207,25 @@ If you prefer to build it yourself, ensure the following dependencies are instal
- Accelerates the data pipeline by offloading intensive preprocessing and
augmentation tasks. rocAL is part of MIVisionX.
* - `RCCL <https://github.com/ROCm/rccl>`_
- :version-ref:`RCCL rocm_version`
- 2.2.0
- Optimizes for multi-GPU communication for operations like AllReduce and
Broadcast.
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
- :version-ref:`rocDecode rocm_version`
- 0.10.0
- Provides hardware-accelerated data decoding capabilities, particularly
for image, video, and other dataset formats.
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
- :version-ref:`rocJPEG rocm_version`
- 0.8.0
- Provides hardware-accelerated JPEG image decoding and encoding.
* - `RPP <https://github.com/ROCm/RPP>`_
- :version-ref:`RPP rocm_version`
- 1.9.10
- Speeds up data augmentation, transformation, and other preprocessing steps.
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
- :version-ref:`rocThrust rocm_version`
- 3.3.0
- Provides a C++ template library for parallel algorithms like sorting,
reduction, and scanning.
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
- :version-ref:`rocWMMA rocm_version`
- 1.7.0
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
multiplication (GEMM) and accumulation operations with mixed precision
support.
@@ -211,14 +234,14 @@ If you prefer to build it yourself, ensure the following dependencies are instal
Supported features
================================================================================
Many functions and methods available in DGL Upstream are also supported in DGL ROCm.
Many functions and methods available upstream are also supported in DGL on ROCm.
Instead of listing them all, support is grouped into the following categories to provide a general overview.
* DGL Base
* DGL Backend
* DGL Data
* DGL Dataloading
* DGL DGLGraph
* DGL Graph
* DGL Function
* DGL Ops
* DGL Sampling
@@ -235,9 +258,9 @@ Instead of listing them all, support is grouped into the following categories to
Unsupported features
================================================================================
* Graphbolt
* Partial TF32 Support (MI250x only)
* Kineto/ ROCTracer integration
* GraphBolt
* Partial TF32 Support (MI250X only)
* Kineto/ROCTracer integration
Unsupported functions

View File

@@ -1,8 +1,8 @@
:orphan:
.. meta::
:description: FlashInfer deep learning framework compatibility
:keywords: GPU, LLM, FlashInfer, compatibility
:description: FlashInfer compatibility
:keywords: GPU, LLM, FlashInfer, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -11,7 +11,7 @@ FlashInfer compatibility
********************************************************************************
`FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator
for Large Language Models (LLMs) that provides high-performance implementation of graphics
for Large Language Models (LLMs) that provides a high-performance implementation of graphics
processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well
as advanced performance across diverse scenarios.
@@ -25,28 +25,30 @@ offers high-performance LLM-specific operators, with easy integration through Py
For the latest feature compatibility matrix, refer to the ``README`` of the
`https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.
Support for the ROCm port of FlashInfer is available as follows:
Support overview
================================================================================
- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer
<https://github.com/ROCm/flashinfer>`__ repository. This location differs from the
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`_
- The ROCm-supported version of FlashInfer is maintained in the official `https://github.com/ROCm/flashinfer
<https://github.com/ROCm/flashinfer>`__ repository, which differs from the
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`__
upstream repository.
- To install FlashInfer, use the prebuilt :ref:`Docker image <flashinfer-docker-compat>`,
which includes ROCm, FlashInfer, and all required dependencies.
- To get started and install FlashInfer on ROCm, use the prebuilt :ref:`Docker images <flashinfer-docker-compat>`,
which include ROCm, FlashInfer, and all required dependencies.
- See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
to install and get started.
for installation and setup instructions.
- See the `Installation guide <https://docs.flashinfer.ai/installation.html>`__
in the upstream FlashInfer documentation.
- You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__
for additional context.
.. note::
Version support
--------------------------------------------------------------------------------
Flashinfer is supported on ROCm 6.4.1.
FlashInfer is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
Supported devices
================================================================================
--------------------------------------------------------------------------------
**Officially Supported**: AMD Instinct™ MI300X
@@ -78,10 +80,9 @@ Docker image compatibility
<i class="fab fa-docker"></i>
AMD validates and publishes `ROCm FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
inventories represent the FlashInfer version from the official Docker Hub.
The Docker images have been validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
AMD validates and publishes `FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
with ROCm backends on Docker Hub. The following Docker image tag and associated
inventories represent the latest available FlashInfer version from the official Docker Hub.
Click |docker-icon| to view the image on Docker Hub.
.. list-table::

View File

@@ -2,7 +2,7 @@
.. meta::
:description: JAX compatibility
:keywords: GPU, JAX compatibility
:keywords: GPU, JAX, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -10,42 +10,38 @@
JAX compatibility
*******************************************************************************
JAX provides a NumPy-like API, which combines automatic differentiation and the
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
learning at scale.
`JAX <https://docs.jax.dev/en/latest/notebooks/thinking_in_jax.html>`__ is a library
for array-oriented numerical computation (similar to NumPy), with automatic differentiation
and just-in-time (JIT) compilation to enable high-performance machine learning research.
JAX uses composable transformations of Python and NumPy through just-in-time
(JIT) compilation, automatic vectorization, and parallelization. To learn about
JAX, including profiling and optimizations, see the official `JAX documentation
<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.
JAX provides an API that combines automatic differentiation and the
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
learning at scale. JAX uses composable transformations of Python and NumPy through
JIT compilation, automatic vectorization, and parallelization.
ROCm support for JAX is upstreamed, and users can build the official source code
with ROCm support:
Support overview
================================================================================
- ROCm JAX release:
- The ROCm-supported version of JAX is maintained in the official `https://github.com/ROCm/rocm-jax
<https://github.com/ROCm/rocm-jax>`__ repository, which differs from the
`https://github.com/jax-ml/jax <https://github.com/jax-ml/jax>`__ upstream repository.
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
with ROCm and JAX preinstalled.
- To get started and install JAX on ROCm, use the prebuilt :ref:`Docker images <jax-docker-compat>`,
which include ROCm, JAX, and all required dependencies.
- ROCm JAX repository: `ROCm/rocm-jax <https://github.com/ROCm/rocm-jax>`_
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
for installation and setup instructions.
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
to get started.
- You can also consult the upstream `Installation guide <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`__
for additional context.
- Official JAX release:
Version support
--------------------------------------------------------------------------------
- Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_
- See the `AMD GPU (Linux) installation section
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
the JAX documentation.
.. note::
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
quarterly alongside new ROCm releases. These images undergo full AMD testing.
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
follow upstream JAX releases and use the latest available ROCm version.
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax/tags>`_
quarterly alongside new ROCm releases. These images undergo full AMD testing.
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community/tags>`_
follow upstream JAX releases and use the latest available ROCm version.
Use cases and recommendations
================================================================================
@@ -71,7 +67,7 @@ Use cases and recommendations
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
outlines the process of fine-tuning a Bidirectional Encoder Representations
from Transformers (BERT)-based large language model (LLM) using JAX for a text
classification task. The blog post discuss techniques for parallelizing the
classification task. The blog post discusses techniques for parallelizing the
fine-tuning across multiple AMD GPUs and assess the model's performance on a
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
and the General Language Understanding Evaluation (GLUE) benchmark dataset was
@@ -90,9 +86,9 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
Docker image compatibility
================================================================================
AMD provides preconfigured Docker images with JAX and the ROCm backend.
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/jax>`__ and are the
recommended way to get started with deep learning with JAX on ROCm.
AMD validates and publishes `JAX images <https://hub.docker.com/r/rocm/jax/tags>`__
with ROCm backends on Docker Hub.
For ``jax-community`` images, see `rocm/jax-community
<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
@@ -234,7 +230,7 @@ The ROCm supported data types in JAX are collected in the following table.
.. note::
JAX data type support is effected by the :ref:`key_rocm_libraries` and it's
JAX data type support is affected by the :ref:`key_rocm_libraries` and it's
collected on :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
page.

View File

@@ -1,8 +1,8 @@
:orphan:
.. meta::
:description: llama.cpp deep learning framework compatibility
:keywords: GPU, GGML, llama.cpp compatibility
:description: llama.cpp compatibility
:keywords: GPU, GGML, llama.cpp, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -20,33 +20,32 @@ to accelerate inference and reduce memory usage. Originally built as a CPU-first
llama.cpp is easy to integrate with other programming environments and is widely
adopted across diverse platforms, including consumer devices.
ROCm support for llama.cpp is upstreamed, and you can build the official source code
with ROCm support:
- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp
<https://github.com/ROCm/llama.cpp>`_ repository.
- Due to independent compatibility considerations, this location differs from the
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`,
which includes ROCm, llama.cpp, and all required dependencies.
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
to install and get started.
- See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__
in the upstream llama.cpp documentation.
.. note::
llama.cpp is supported on ROCm 7.0.0 and ROCm 6.4.x.
Supported devices
Support overview
================================================================================
**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
- The ROCm-supported version of llama.cpp is maintained in the official `https://github.com/ROCm/llama.cpp
<https://github.com/ROCm/llama.cpp>`__ repository, which differs from the
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`__ upstream repository.
- To get started and install llama.cpp on ROCm, use the prebuilt :ref:`Docker images <llama-cpp-docker-compat>`,
which include ROCm, llama.cpp, and all required dependencies.
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
for installation and setup instructions.
- You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__
for additional context.
Version support
--------------------------------------------------------------------------------
llama.cpp is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and
`ROCm 6.4.x <https://repo.radeon.com/rocm/apt/6.4/>`__.
Supported devices
--------------------------------------------------------------------------------
**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
Use cases and recommendations
================================================================================
@@ -84,9 +83,9 @@ Docker image compatibility
<i class="fab fa-docker"></i>
AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
AMD validates and publishes `llama.cpp images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
with ROCm backends on Docker Hub. The following Docker image tags and associated
inventories represent the available llama.cpp versions from the official Docker Hub.
inventories represent the latest available llama.cpp versions from the official Docker Hub.
Click |docker-icon| to view the image on Docker Hub.
.. important::

View File

@@ -2,7 +2,7 @@
.. meta::
:description: Megablocks compatibility
:keywords: GPU, megablocks, compatibility
:keywords: GPU, megablocks, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -10,28 +10,42 @@
Megablocks compatibility
********************************************************************************
Megablocks is a light-weight library for mixture-of-experts (MoE) training.
`Megablocks <https://github.com/databricks/megablocks>`__ is a lightweight library
for mixture-of-experts `(MoE) <https://huggingface.co/blog/moe>`__ training.
The core of the system is efficient "dropless-MoE" and standard MoE layers.
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_,
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM
<https://github.com/stanford-futuredata/Megatron-LM>`__,
where data and pipeline parallel training of MoEs is supported.
* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks <https://github.com/ROCm/megablocks>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
* Use the prebuilt :ref:`Docker image <megablocks-docker-compat>` with ROCm, PyTorch, and Megablocks preinstalled.
* See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` to install and get started.
Support overview
================================================================================
.. note::
- The ROCm-supported version of Megablocks is maintained in the official `https://github.com/ROCm/megablocks
<https://github.com/ROCm/megablocks>`__ repository, which differs from the
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
Megablocks is supported on ROCm 6.3.0.
- To get started and install Megablocks on ROCm, use the prebuilt :ref:`Docker image <megablocks-docker-compat>`,
which includes ROCm, Megablocks, and all required dependencies.
- See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>`
for installation and setup instructions.
- You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__
for additional context.
Version support
--------------------------------------------------------------------------------
Megablocks is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
Supported devices
================================================================================
--------------------------------------------------------------------------------
- **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
- **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
Supported models and features
================================================================================
--------------------------------------------------------------------------------
This section summarizes the Megablocks features supported by ROCm.
@@ -41,20 +55,28 @@ This section summarizes the Megablocks features supported by ROCm.
* Mixture-of-Experts
* dropless-Mixture-of-Experts
.. _megablocks-recommendations:
Use cases and recommendations
================================================================================
The `ROCm Megablocks blog posts <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_
guide how to leverage the ROCm platform for pre-training using the Megablocks framework.
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
blog post guides how to leverage the ROCm platform for pre-training using the
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
training. The guide provides step-by-step instructions for setting up the environment,
including cloning the repository, building the Docker image, and running the training container.
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
training workflows for large-scale transformer models.
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
* Single-GPU pre-training
* Multi-GPU pre-training
.. _megablocks-docker-compat:
Docker image compatibility
@@ -64,10 +86,9 @@ Docker image compatibility
<i class="fab fa-docker"></i>
AMD validates and publishes `ROCm Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`_
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
inventories represent the latest Megatron-LM version from the official Docker Hub.
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
with ROCm backends on Docker Hub. The following Docker image tag and associated
inventories represent the latest available Megablocks version from the official Docker Hub.
Click |docker-icon| to view the image on Docker Hub.
.. list-table::

View File

@@ -2,7 +2,7 @@
.. meta::
:description: PyTorch compatibility
:keywords: GPU, PyTorch compatibility
:keywords: GPU, PyTorch, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -15,40 +15,42 @@ deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
`RCCL <https://github.com/ROCm/rccl>`__ libraries.
ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due
to independent compatibility considerations, this results in two distinct
release cycles for PyTorch on ROCm:
PyTorch provides two high-level features:
- ROCm PyTorch release:
- Tensor computation (like NumPy) with strong GPU acceleration
- Provides the latest version of ROCm but might not necessarily support the
latest stable PyTorch version.
- Deep neural networks built on a tape-based autograd system (rapid computation
of multiple partial derivatives or gradients)
- Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
preinstalled.
Support overview
================================================================================
- ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`__
ROCm support for PyTorch is upstreamed into the official PyTorch repository.
ROCm development is aligned with the stable release of PyTorch, while upstream
PyTorch testing uses the stable release of ROCm to maintain consistency:
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
to get started.
- The ROCm-supported version of PyTorch is maintained in the official `https://github.com/ROCm/pytorch
<https://github.com/ROCm/pytorch>`__ repository, which differs from the
`https://github.com/pytorch/pytorch <https://github.com/pytorch/pytorch>`__ upstream repository.
- Official PyTorch release:
- To get started and install PyTorch on ROCm, use the prebuilt :ref:`Docker images <pytorch-docker-compat>`,
which include ROCm, PyTorch, and all required dependencies.
- Provides the latest stable version of PyTorch but might not necessarily
support the latest ROCm version.
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
for installation and setup instructions.
- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`__
or `Previous versions <https://pytorch.org/get-started/previous-versions/>`__
to get started.
- You can also consult the upstream `Installation guide <https://pytorch.org/get-started/locally/>`__ or
`Previous versions <https://pytorch.org/get-started/previous-versions/>`__ for additional context.
PyTorch includes tooling that generates HIP source code from the CUDA backend.
This approach allows PyTorch to support ROCm without requiring manual code
modifications. For more information, see :doc:`HIPIFY <hipify:index>`.
ROCm development is aligned with the stable release of PyTorch, while upstream
PyTorch testing uses the stable release of ROCm to maintain consistency.
Version support
--------------------------------------------------------------------------------
AMD releases official `ROCm PyTorch Docker images <https://hub.docker.com/r/rocm/pytorch/tags>`_
quarterly alongside new ROCm releases. These images undergo full AMD testing.
.. _pytorch-recommendations:
@@ -78,7 +80,7 @@ Use cases and recommendations
GPU.
* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
describes how PyTorch integrates with ROCm for AI workloads It outlines the
describes how PyTorch integrates with ROCm for AI workloads. It outlines the
use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
GPU hardware for training and inference tasks in AI applications.
@@ -89,9 +91,8 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
Docker image compatibility
================================================================================
AMD provides preconfigured Docker images with PyTorch and the ROCm backend.
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/pytorch>`__ and are the
recommended way to get started with deep learning with PyTorch on ROCm.
AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch/tags>`__
with ROCm backends on Docker Hub.
To find the right image tag, see the :ref:`PyTorch on ROCm installation
documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of

View File

@@ -1,8 +1,8 @@
:orphan:
.. meta::
:description: Ray deep learning framework compatibility
:keywords: GPU, Ray compatibility
:description: Ray compatibility
:keywords: GPU, Ray, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -19,36 +19,35 @@ simplifying machine learning computations.
Ray is a general-purpose framework that runs many types of workloads efficiently.
Any Python application can be scaled with Ray, without extra infrastructure.
ROCm support for Ray is upstreamed, and you can build the official source code
with ROCm support:
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray
<https://github.com/ROCm/ray>`_ repository.
- Due to independent compatibility considerations, this location differs from the
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`
which includes ROCm, Ray, and all required dependencies.
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
for instructions to get started.
- See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_
in the upstream Ray documentation.
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
.. note::
Ray is supported on ROCm 6.4.1.
Supported devices
Support overview
================================================================================
**Officially Supported**: AMD Instinct™ MI300X, MI210
- The ROCm-supported version of Ray is maintained in the official `https://github.com/ROCm/ray
<https://github.com/ROCm/ray>`__ repository, which differs from the
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`__ upstream repository.
- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`,
which includes ROCm, Ray, and all required dependencies.
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels
<https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
for installation and setup instructions.
- You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__
for additional context.
Version support
--------------------------------------------------------------------------------
Ray is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
Supported devices
--------------------------------------------------------------------------------
**Officially Supported**: AMD Instinct™ MI300X, MI210
Use cases and recommendations
================================================================================
@@ -88,15 +87,15 @@ Docker image compatibility
AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
with ROCm backends on Docker Hub. The following Docker image tags and
associated inventories represent the latest Ray version from the official Docker Hub and are validated for
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
icon to view the image on Docker Hub.
associated inventories represent the latest Ray version from the official Docker Hub.
Click the |docker-icon| icon to view the image on Docker Hub.
.. list-table::
:header-rows: 1
:class: docker-image-compatibility
* - Docker image
- ROCm
- Ray
- Pytorch
- Ubuntu
@@ -105,6 +104,7 @@ icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
- 2.6.0+git684f6f2
- 24.04

View File

@@ -2,7 +2,7 @@
.. meta::
:description: Stanford Megatron-LM compatibility
:keywords: Stanford, Megatron-LM, compatibility
:keywords: Stanford, Megatron-LM, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -10,34 +10,50 @@
Stanford Megatron-LM compatibility
********************************************************************************
Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
designed to train massive transformer-based language models efficiently by model and data parallelism.
Stanford Megatron-LM is a large-scale language model training framework developed
by NVIDIA at `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_.
It is designed to train massive transformer-based language models efficiently by model
and data parallelism.
* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled.
* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
It provides efficient tensor, pipeline, and sequence-based model parallelism for
pre-training transformer-based language models such as GPT (Decoder Only), BERT
(Encoder Only), and T5 (Encoder-Decoder).
.. note::
Stanford Megatron-LM is supported on ROCm 6.3.0.
Supported Devices
Support overview
================================================================================
- **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
- The ROCm-supported version of Stanford Megatron-LM is maintained in the official `https://github.com/ROCm/Stanford-Megatron-LM
<https://github.com/ROCm/Stanford-Megatron-LM>`__ repository, which differs from the
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
- To get started and install Stanford Megatron-LM on ROCm, use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>`,
which includes ROCm, Stanford Megatron-LM, and all required dependencies.
- See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
for installation and setup instructions.
- You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__
for additional context.
Version support
--------------------------------------------------------------------------------
Stanford Megatron-LM is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
Supported devices
--------------------------------------------------------------------------------
- **Officially Supported**: AMD Instinct™ MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
Supported models and features
================================================================================
--------------------------------------------------------------------------------
This section details models & features that are supported by the ROCm version on Stanford Megatron-LM.
Models:
* Bert
* BERT
* GPT
* T5
* ICT
@@ -54,13 +70,24 @@ Features:
Use cases and recommendations
================================================================================
See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ post
to leverage the ROCm platform for pre-training by using the Stanford Megatron-LM framework of pre-processing datasets on AMD GPUs.
Coverage includes:
The following blog post mentions Megablocks, but you can run Stanford Megatron-LM with the same steps to pre-process datasets on AMD GPUs:
* Single-GPU pre-training
* Multi-GPU pre-training
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
blog post guides how to leverage the ROCm platform for pre-training using the
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
training. The guide provides step-by-step instructions for setting up the environment,
including cloning the repository, building the Docker image, and running the training container.
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
training workflows for large-scale transformer models.
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
* Single-GPU pre-training
* Multi-GPU pre-training
.. _megatron-lm-docker-compat:
@@ -71,10 +98,9 @@ Docker image compatibility
<i class="fab fa-docker"></i>
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
inventories represent the latest Megatron-LM version from the official Docker Hub.
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
Click |docker-icon| to view the image on Docker Hub.
.. list-table::
@@ -82,6 +108,7 @@ Click |docker-icon| to view the image on Docker Hub.
:class: docker-image-compatibility
* - Docker image
- ROCm
- Stanford Megatron-LM
- PyTorch
- Ubuntu
@@ -91,6 +118,7 @@ Click |docker-icon| to view the image on Docker Hub.
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 24.04

View File

@@ -2,7 +2,7 @@
.. meta::
:description: Taichi compatibility
:keywords: GPU, Taichi compatibility
:keywords: GPU, Taichi, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -19,28 +19,52 @@ Taichi is widely used across various domains, including real-time physical simul
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
visual effects in film and gaming, and general-purpose computing.
* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.
Support overview
================================================================================
.. note::
- The ROCm-supported version of Taichi is maintained in the official `https://github.com/ROCm/taichi
<https://github.com/ROCm/taichi>`__ repository, which differs from the
`https://github.com/taichi-dev/taichi <https://github.com/taichi-dev/taichi>`__ upstream repository.
Taichi is supported on ROCm 6.3.2.
- To get started and install Taichi on ROCm, use the prebuilt :ref:`Docker image <taichi-docker-compat>`,
which includes ROCm, Taichi, and all required dependencies.
Supported devices and features
===============================================================================
There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X Series GPUs with the exception of Taichis GPU rendering system, CGUI.
AMD Instinct MI300X Series GPUs will be supported by November.
- See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>`
for installation and setup instructions.
- You can also consult the upstream `Installation guide <https://github.com/taichi-dev/taichi>`__
for additional context.
Version support
--------------------------------------------------------------------------------
Taichi is supported on `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`__.
Supported devices
--------------------------------------------------------------------------------
- **Officially Supported**: AMD Instinct™ MI250X, MI210X (with the exception of Taichis GPU rendering system, CGUI)
- **Upcoming Support**: AMD Instinct™ MI300X
.. _taichi-recommendations:
Use cases and recommendations
================================================================================
To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators.
A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository,
providing practical insights and foundational knowledge for working with the Taichi programming language.
You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
* The `Accelerating Parallel Programming in Python with Taichi Lang on AMD GPUs
<https://rocm.blogs.amd.com/artificial-intelligence/taichi/README.html>`__
blog highlights Taichi as an open-source programming language designed for high-performance
numerical computation, particularly in domains like real-time physical simulation,
artificial intelligence, computer vision, robotics, and visual effects. Taichi
is embedded in Python and uses just-in-time (JIT) compilation frameworks like
LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility
of Taichi in enabling complex simulations and numerical algorithms, making
it ideal for developers working on compute-intensive tasks. Developers are
encouraged to follow recommended coding patterns and utilize Taichi decorators
for performance optimization, with examples available in the `https://github.com/ROCm/taichi_examples
<https://github.com/ROCm/taichi_examples>`_ repository. Prebuilt Docker images
integrating ROCm, PyTorch, and Taichi are provided for simplified installation
and deployment, making it easier to leverage Taichi for advanced computational workloads.
.. _taichi-docker-compat:
@@ -52,9 +76,8 @@ Docker image compatibility
<i class="fab fa-docker"></i>
AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
represent the latest Taichi version from the official Docker Hub.
The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_.
Click |docker-icon| to view the image on Docker Hub.
.. list-table::

View File

@@ -2,7 +2,7 @@
.. meta::
:description: TensorFlow compatibility
:keywords: GPU, TensorFlow compatibility
:keywords: GPU, TensorFlow, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -12,37 +12,33 @@ TensorFlow compatibility
`TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
solving machine learning, deep learning, and AI problems. It can solve many
problems across different sectors and industries but primarily focuses on
neural network training and inference. It is one of the most popular and
in-demand frameworks and is very active in open-source contribution and
development.
problems across different sectors and industries, but primarily focuses on
neural network training and inference. It is one of the most popular deep
learning frameworks and is very active in open-source development.
Support overview
================================================================================
- The ROCm-supported version of TensorFlow is maintained in the official `https://github.com/ROCm/tensorflow-upstream
<https://github.com/ROCm/tensorflow-upstream>`__ repository, which differs from the
`https://github.com/tensorflow/tensorflow <https://github.com/tensorflow/tensorflow>`__ upstream repository.
- To get started and install TensorFlow on ROCm, use the prebuilt :ref:`Docker images <tensorflow-docker-compat>`,
which include ROCm, TensorFlow, and all required dependencies.
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
for installation and setup instructions.
- You can also consult the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list
for additional context.
Version support
--------------------------------------------------------------------------------
The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
<http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
fixes, updates, and support for the latest ROCM versions.
- ROCm TensorFlow release:
- Offers :ref:`Docker images <tensorflow-docker-compat>` with
ROCm and TensorFlow pre-installed.
- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
to get started.
- Official TensorFlow release:
- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__
- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.
.. note::
The official TensorFlow documentation does not cover ROCm support. Use the
ROCm documentation for installation instructions for Tensorflow on ROCm.
See :doc:`rocm-install-on-linux:install/3rd-party/tensorflow-install`.
fixes, updates, and support for the latest ROCm versions.
.. _tensorflow-docker-compat:

View File

@@ -2,7 +2,7 @@
.. meta::
:description: verl compatibility
:keywords: GPU, verl compatibility
:keywords: GPU, verl, deep learning, framework compatibility
.. version-set:: rocm_version latest
@@ -10,24 +10,58 @@
verl compatibility
*******************************************************************************
Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs).
verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
Volcano Engine Reinforcement Learning for LLMs (`verl <https://verl.readthedocs.io/en/latest/>`__)
is a reinforcement learning framework designed for large language models (LLMs).
verl offers a scalable, open-source fine-tuning solution by using a hybrid programming model
that makes it easy to define and run complex post-training dataflows efficiently.
* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl.
* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled.
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to install and get started.
Its modular APIs separate computation from data, allowing smooth integration with other frameworks.
It also supports flexible model placement across GPUs for efficient scaling on different cluster sizes.
verl achieves high training and generation throughput by building on existing LLM frameworks.
Its 3D-HybridEngine reduces memory use and communication overhead when switching between training
and inference, improving overall performance.
.. note::
Support overview
================================================================================
verl is supported on ROCm 6.2.0.
- The ROCm-supported version of verl is maintained in the official `https://github.com/ROCm/verl
<https://github.com/ROCm/verl>`__ repository, which differs from the
`https://github.com/volcengine/verl <https://github.com/volcengine/verl>`__ upstream repository.
- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`,
which includes ROCm, verl, and all required dependencies.
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
for installation and setup instructions.
- You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__
for additional context.
Version support
--------------------------------------------------------------------------------
verl is supported on `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
Supported devices
--------------------------------------------------------------------------------
**Officially Supported**: AMD Instinct™ MI300X
.. _verl-recommendations:
Use cases and recommendations
================================================================================
The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.
* The benefits of verl in large-scale reinforcement learning from human feedback
(RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD
GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
blog. The blog post outlines how the Volcano Engine Reinforcement Learning
(verl) framework integrates with the AMD ROCm platform to optimize training on
Instinct™ MI300X GPUs. The guide details the process of building a Docker image,
setting up single-node and multi-node training environments, and highlights
performance benchmarks demonstrating improved throughput and convergence accuracy.
This resource serves as a comprehensive starting point for deploying verl on AMD GPUs,
facilitating efficient RLHF training workflows.
.. _verl-supported_features:
@@ -61,8 +95,10 @@ Docker image compatibility
<i class="fab fa-docker"></i>
AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the available verl versions from the official Docker Hub.
AMD validates and publishes ready-made `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
represent the latest verl version from the official Docker Hub.
Click |docker-icon| to view the image on Docker Hub.
.. list-table::
:header-rows: 1

File diff suppressed because it is too large Load Diff

View File

@@ -15,10 +15,9 @@ using PyTorch. It delves into specific workloads such as
:ref:`model inference <mi300x-vllm-optimization>`, offering strategies to
enhance efficiency.
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>`
that streamline optimization as well as advanced techniques like
:ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>` for
meticulous tuning.
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>` as
well as :ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>`
for meticulous tuning.
Workload tuning strategy
========================
@@ -86,27 +85,28 @@ Optimize model inference with vLLM
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vLLM provides tools and techniques specifically designed for efficient model
inference on AMD Instinct MI300X GPUs. See :ref:`fine-tuning-llms-vllm`
for installation guidance. Optimizing performance with vLLM
involves configuring tensor parallelism, leveraging advanced features, and
ensuring efficient execution. Heres how to optimize vLLM performance:
inference on AMD Instinct GPUs. See the official `vLLM installation docs
<https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html>`__ for
installation guidance. Optimizing performance with vLLM involves configuring
tensor parallelism, leveraging advanced features, and ensuring efficient
execution.
* Tensor parallelism: Configure the
:ref:`tensor-parallel-size parameter <mi300x-vllm-multiple-gpus>` to distribute
tensor computations across multiple GPUs. Adjust parameters such as
``batch-size``, ``input-len``, and ``output-len`` based on your workload.
* Configuration for vLLM: Set :ref:`parameters <mi300x-vllm-optimization>`
according to workload requirements. Benchmark performance to understand
characteristics and identify bottlenecks.
* Configuration for vLLM: Set engine arguments according to workload
requirements.
* Benchmarking and performance metrics: Measure latency and throughput to
evaluate performance.
.. seealso::
See :doc:`vllm-optimization` to learn more about vLLM performance
optimization techniques.
.. _mi300x-auto-tune:
Auto-tunable configurations
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Auto-tunable configurations can significantly streamline performance
optimization by automatically adjusting parameters based on workload
characteristics. For example:
@@ -120,8 +120,7 @@ characteristics. For example:
your specific hardware.
* Triton: Use :ref:`Tritons auto-tuning features <mi300x-autotunable-kernel-config>`
to explore various kernel configurations and automatically select the
best-performing ones.
to explore various kernel configurations and select the best-performing ones.
Manual tuning
^^^^^^^^^^^^^
@@ -328,380 +327,21 @@ hardware counters are also included.
ROCm Systems Profiler timeline trace example.
.. _mi300x-vllm-optimization:
vLLM performance optimization
=============================
vLLM is a high-throughput and memory efficient inference and serving engine for large language models that has gained traction in the AI community for
its performance and ease of use. See :ref:`fine-tuning-llms-vllm` for a primer on vLLM with ROCm.
Performance environment variables
---------------------------------
The following performance tips are not *specific* to vLLM -- they are general
but relevant in this context. You can tune the following vLLM parameters to
achieve optimal request latency and throughput performance.
* As described in `Environment variables (MI300X)
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
to ``112`` to increase the number of channels on MI300X to potentially improve
performance.
* Set the environment variable ``TORCH_BLAS_PREFER_HIPBLASLT=1`` to use hipBLASLt to improve performance.
Auto-tuning using PyTorch TunableOp
------------------------------------
Since vLLM is based on the PyTorch framework, PyTorch TunableOp can be used for auto-tuning.
You can run auto-tuning with TunableOp in two simple steps without modifying your code:
* Enable TunableOp and tuning. Optionally, enable verbose mode:
.. code-block:: shell
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_VERBOSE=1 your_vllm_script.sh
* Enable TunableOp and disable tuning and measure.
.. code-block:: shell
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_vllm_script.sh
Learn more about TunableOp in the :ref:`PyTorch TunableOp <mi300x-tunableop>` section.
Performance tuning based on vLLM engine configurations
-------------------------------------------------------
The following subsections describe vLLM-specific configurations for performance tuning.
You can tune the following vLLM parameters to achieve optimal performance.
* ``tensor_parallel_size``
* ``gpu_memory_utilization``
* ``dtype``
* ``enforce_eager``
* ``kv_cache_dtype``
* ``input_len``
* ``output_len``
* ``max_num_seqs``
* ``num_scheduler_steps``
* ``max_model_len``
* ``enable_chunked_prefill``
* ``distributed_executor_backend``
* ``max_seq_len_to_capture``
Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.html>`_
for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
usage with ROCm.
ROCm provides a prebuilt optimized Docker image for validating the performance
of LLM inference with vLLM on MI300X Series GPUs. The Docker image includes
ROCm, vLLM, and PyTorch. For more information, see
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _mi300x-vllm-throughput-measurement:
Evaluating performance by throughput measurement
-------------------------------------------------
This tuning guide evaluates the performance of LLM inference workloads by measuring throughput in tokens per second (TPS). Throughput can be assessed using both real-world and synthetic data, depending on your evaluation goals.
Refer to the benchmarking script located at ``benchmarks/benchmark_throughput.py`` in the `vLLM repository <https://github.com/ROCm/vllm/blob/main/benchmarks/benchmark_throughput.py>`_.
Use this script to measure throughput effectively. You can assess throughput using real-world and synthetic data, depending on your evaluation goals.
* For realistic performance evaluation, you can use datasets like Hugging Face's
``ShareGPT_V3_unfiltered_cleaned_split.json``. This dataset includes real-world conversational
data, making it a good representation of typical use cases for language models. Download it using
the following command:
.. code-block:: shell
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
* For standardized benchmarking, you can set fixed input and output token
lengths. Synthetic prompts provide consistent benchmarking runs, making it
easier to compare performance across different models or configurations.
Additionally, a controlled environment simplifies analysis.
By balancing real-world data and synthetic data approaches, you can get a well-rounded understanding of model performance in varied scenarios.
.. _mi300x-vllm-single-node:
Maximizing vLLM instances on a single node
------------------------------------------
The general guideline is to maximize per-node throughput by running as many vLLM instances as possible.
However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance.
The Instinct MI300X GPU is equipped with 192 GB of HBM3 memory capacity and bandwidth.
For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances
simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment
variable ``CUDA_VISIBLE_DEVICES``.
For example, this script runs eight instances of vLLM for throughput benchmarking at the same time
with a model that can fit in one GPU:
.. code-block:: shell
for i in $(seq 0 7);
do
CUDA_VISIBLE_DEVICES="$i" python3 /app/vllm/benchmarks/benchmark_throughput.py -tp 1 --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
done
The total throughput achieved by running ``N`` instances of vLLM is generally much higher than running a
single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or
using the ``-tp`` N option, where ``1 < N ≤ 8``).
vLLM on MI300X GPUs can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
Notable configurations include Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model can fit on one single node with 8 MI300X GPUs.
.. _mi300x-vllm-gpu-memory-utilization:
Configure the gpu_memory_utilization parameter
----------------------------------------------
There are two ways to increase throughput by configuring ``gpu-memory-utilization`` parameter.
1. Increase ``gpu-memory-utilization`` to improve the throughput for a single instance as long as
it does not incur HIP or CUDA Out Of Memory. The default ``gpu-memory-utilization`` is 0.9.
You can set it to ``>0.9`` and ``<1``.
For example, below benchmarking command set the ``gpu-memory-utilization`` as 0.98, or 98%.
.. code-block:: shell
/vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.98 --input-len 1024 --output-len 128 --model /path/to/model
2. Decrease ``gpu-memory-utilization`` to maximize the number of vLLM instances on the same GPU.
Specify GPU memory utilization to run as many instances of vLLM as possible on a single
GPU. However, too many instances can result in no memory for KV-cache. For small models, run
multiple instances of vLLM on the same GPU by specifying a smaller ``gpu-memory-utilization`` -- as
long as it would not cause HIP Out Of Memory.
For example, run two instances of the Llama3-8b model at the same time on a single GPU by specifying
``--gpu-memory-utilization`` to 0.4 (40%) as follows (on GPU ``0``):
.. code-block:: shell
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4
--dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4
--dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
See :ref:`vllm-engine-args` for other performance suggestions.
.. _mi300x-vllm-multiple-gpus:
Run vLLM on multiple GPUs
-------------------------
The two main reasons to use multiple GPUs are:
* The model size is too big to run vLLM using one GPU as it results HIP Out of Memory.
* To achieve better latency when using a single GPU is not desirable.
To run one vLLM instance on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` option to
specify multiple GPUs. Optionally, use the ``CUDA_VISIBLE_DEVICES`` environment variable to specify
the GPUs.
For example, you can use two GPUs to start an API server on port 8000:
.. code-block:: shell
python -m vllm.entrypoints.api_server --model /path/to/model --dtype
float16 -tp 2 --port 8000 &
To achieve both latency and throughput performance for serving, you can run multiple API servers on
different GPUs by specifying different ports for each server and use ``CUDA_VISIBLE_DEVICES`` to
specify the GPUs for each server, for example:
.. code-block:: shell
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model
/path/to/model --dtype float16 -tp 2 --port 8000 &
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model
/path/to/model --dtype float16 -tp 2 --port 8001 &
Choose an attention backend
---------------------------
vLLM on ROCm supports two attention backends, each suitable for different use cases and performance
requirements:
- **Triton Flash Attention** - For benchmarking, run vLLM scripts at
least once as a warm-up step so Triton can perform auto-tuning before
collecting benchmarking numbers. This is the default setting.
- **Composable Kernel (CK) Flash Attention** - To use CK Flash Attention, specify
the environment variable as ``export VLLM_USE_TRITON_FLASH_ATTN=0``.
Refer to :ref:`Model acceleration libraries <acceleration-flash-attention>`
to learn more about Flash Attention with Triton or CK backends.
.. _vllm-engine-args:
vLLM engine arguments
---------------------
The following are configuration suggestions to potentially improve performance with vLLM. See
`vLLM's engine arguments documentation <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_
for a full list of configurable engine arguments.
Configure the max-num-seqs parameter
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Increase the ``max-num-seqs`` parameter from the default ``256`` to ``512`` (``--max-num-seqs
512``). This increases the maximum number of sequences per iteration and can improve throughput.
Use the float16 dtype
^^^^^^^^^^^^^^^^^^^^^
The default data type (``dtype``) is specified in the models configuration file. For instance, some models use ``torch.bfloat16`` as their default ``dtype``.
Use float16 (``--dtype float16``) for better performance.
Multi-step scheduling
^^^^^^^^^^^^^^^^^^^^^
Setting ``num-scheduler-steps`` for multi-step scheduling can increase performance. Set it between 10 to 15 (``--num-scheduler-steps 10``).
Distributed executor backend
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The vLLM supports two modes of distributed executor backend: ``ray`` and ``mp``. When using the `<https://github.com/ROCm/vllm>`__ fork, using the ``mp``
backend (``--distributed_executor_backend mp``) is recommended.
Graph mode max-seq-len-to-capture
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Maximum sequence length covered by CUDA graphs. In the default mode (where ``enforce_eager`` is ``False``), when a sequence has context length
larger than this, vLLM engine falls back to eager mode. The default is 8192.
When working with models that support long context lengths, set the parameter ``--max-seq-len-to-capture`` to 16384.
See this `vLLM blog <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`__ for details.
An example of long context length model is Qwen2-7b.
Whether to enable chunked prefill
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Another vLLM performance tip is to enable chunked prefill to improve
throughput. Chunked prefill allows large prefills to be chunked into
smaller chunks and batched together with decode requests.
You can enable the feature by specifying ``--enable-chunked-prefill`` in the
command line or setting ``enable_chunked_prefill=True`` in the LLM
constructor. 
As stated in `vLLM's documentation, <https://docs.vllm.ai/en/latest/models/performance.html#chunked-prefill>`__,
you can tune the performance by changing ``max_num_batched_tokens``. By
default, it is set to 512 and optimized for ITL (inter-token latency).
Smaller ``max_num_batched_tokens`` achieves better ITL because there are
fewer prefills interrupting decodes.
Higher ``max_num_batched_tokens`` achieves better TTFT (time to the first
token) as you can put more prefill to the batch.
You might experience noticeable throughput improvements when
benchmarking on a single GPU or 8 GPUs using the vLLM throughput
benchmarking script along with the ShareGPT dataset as input.
In the case of fixed ``input-len``/``output-len``, for some configurations,
enabling chunked prefill increases the throughput. For some other
configurations, the throughput may be worse and elicit a need to tune
parameter ``max_num_batched_tokens`` (for example, increasing ``max_num_batched_tokens`` value to 4096 or larger).
.. note::
Chunked prefill is no longer recommended. See the vLLM blog: `Serving LLMs on AMD MI300X: Best Practices <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`_ (October 2024).
Quantization support
---------------------
Quantization reduces the precision of the models weights and activations, which significantly decreases the memory footprint.
``fp8(w8a8)`` and ``AWQ`` quantization are supported for ROCm.
FP8 quantization
^^^^^^^^^^^^^^^^^
`<https://github.com/ROCm/vllm>`__ supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on the Instinct MI300X.
Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
AMD publishes Quark Quantized OCP FP8 models on Hugging Face. For example:
* `Llama-3.1-8B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV>`__
* `Llama-3.1-70B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`__
* `Llama-3.1-405B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`__
* `Mixtral-8x7B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`__
* `Mixtral-8x22B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`__
To enable vLLM benchmarking to run on fp8 quantized models, use the ``--quantization`` parameter with value ``fp8`` (``--quantization fp8``).
AWQ quantization
^^^^^^^^^^^^^^^^
You can quantize your own models by installing AutoAWQ or picking one of the 400+ models on Hugging Face. Be aware that
that AWQ support in vLLM is currently underoptimized.
To enable vLLM to run on ``awq`` quantized models, using ``--quantization`` parameter with ``awq`` (``--quantization awq``).
You can find more specifics in the `vLLM AutoAWQ documentation <https://docs.vllm.ai/en/stable/quantization/auto_awq.html>`_.
fp8 kv-cached-dtype
^^^^^^^^^^^^^^^^^^^^^^^
Using ``fp8 kv-cache dtype`` can improve performance as it reduces the size
of ``kv-cache``. As a result, it reduces the cost required for reading and
writing the ``kv-cache``.
To use this feature, specify ``--kv-cache-dtype`` as ``fp8``.
To specify the quantization scaling config, use the
``--quantization-param-path`` parameter. If the parameter is not specified,
the default scaling factor of ``1`` is used, which can lead to less accurate
results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
in the vLLM GitHub repository.
Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
``llama2-7b``.
If building the vLLM using
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm>`_
for ``llama2-70b`` scale config, find the file at
``/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json`` at
runtime.
Below is a sample command to run benchmarking with this feature enabled
for the ``llama2-70b`` model:
.. code-block:: shell
python3 /vllm-workspace/benchmarks/benchmark_throughput.py --model \
/path/to/llama2-70b-model --kv-cache-dtype "fp8" \
--quantization-param-path \
"/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json" \
--input-len 512 --output-len 256 --num-prompts 500
vLLM is a high-throughput and memory efficient inference and serving engine for
large language models that has gained traction in the AI community for its
performance and ease of use. See :doc:`vllm-optimization`, where you'll learn
how to:
* Enable AITER (AI Tensor Engine for ROCm) to speed up on LLM models.
* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance.
* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton).
* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments.
* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss.
* Tune engine arguments (batch size, memory utilization, graph modes) for your use case.
* Benchmark and scale across single-node and multi-node configurations.
.. _mi300x-tunableop:
@@ -946,33 +586,33 @@ for details.
.. code-block:: shell
HIP_FORCE_DEV_KERNARG=1  hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
--a_type f16_r --b_type f8_r --compute_type f32_f16_r \
--initialization trig_float  --cold_iters 100 --iters 1000 --rotating 256
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
* Example 2: Benchmark forward epilogues and backward epilogues
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
* ``HIPBLASLT_EPILOGUE_DGELU": --activation_type gelu --gradient";``
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
* ``HIPBLASLT_EPILOGUE_BGRADB:  "--bias_vector --gradient --bias_source b";``
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
hipBLASLt auto-tuning using hipblaslt-bench
@@ -1031,26 +671,26 @@ The tuning tool is a two-step tool. It first runs the benchmark, then it creates
.. code-block:: python
defaultBenchOptions = {"ProblemType": {
    "TransposeA": 0,
    "TransposeB": 0,
    "ComputeInputDataType": "s",
    "ComputeDataType": "s",
    "DataTypeC": "s",
    "DataTypeD": "s",
    "UseBias": False
}, "TestConfig": {
    "ColdIter": 20,
    "Iter": 100,
    "AlgoMethod": "all",
    "RequestedSolutions": 2, # Only works in AlgoMethod heuristic
    "SolutionIndex": None, # Only works in AlgoMethod index
    "ApiMethod": "cpp",
    "RotatingBuffer": 0,
}, "TuningParameters": {
    "SplitK": [0]
}, "ProblemSizes": []}
defaultCreateLogicOptions = {}  # Currently unused
defaultBenchOptions = {"ProblemType": {
"TransposeA": 0,
"TransposeB": 0,
"ComputeInputDataType": "s",
"ComputeDataType": "s",
"DataTypeC": "s",
"DataTypeD": "s",
"UseBias": False
}, "TestConfig": {
"ColdIter": 20,
"Iter": 100,
"AlgoMethod": "all",
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
"SolutionIndex": None, # Only works in AlgoMethod index
"ApiMethod": "cpp",
"RotatingBuffer": 0,
}, "TuningParameters": {
"SplitK": [0]
}, "ProblemSizes": []}
defaultCreateLogicOptions = {} # Currently unused
* ``TestConfig``
1. ``ColdIter``: This is number the warm-up iterations before starting the kernel benchmark.
@@ -1230,7 +870,7 @@ command:
.. code-block:: shell
merge.py original_dir new_tuned_yaml_dir output_dir 
merge.py original_dir new_tuned_yaml_dir output_dir
The following table describes the logic YAML files.
@@ -1833,7 +1473,7 @@ de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``.
From the IR snippet, you can see ``i32`` data is loaded from global memory to
registers (``%190``). With a few element-wise operations in registers, it is
stored in shared memory (``%269``) for the transpose operation (``%270``), which
stored in shared memory (``%269``) for the transpose operation (``%270``), which
needs data movement across different threads. With the transpose done, it is
loaded from LDS to register again (``%276``), and with a few more
element-wise operations, it is stored to LDS again (``%298``). The last step
@@ -1967,7 +1607,7 @@ something similar to the following:
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
The kernel name and the code object file should be listed. In the
example above, the kernel name is vector_add_assert_trap, but this might
example above, the kernel name is vector_add_assert_trap, but this might
also look like:
.. code-block:: text
@@ -2081,3 +1721,8 @@ Hardware efficiency is maximized with 4 or fewer HIP streams. These environment
configuration to two compute streams and two RCCL streams, aligning with this best practice.
Additionally, RCCL is often pre-optimized for MI300 systems in production by querying the node
topology during startup, reducing the need for extensive manual tuning.
Further reading
===============
* :doc:`vllm-optimization`

View File

@@ -92,7 +92,7 @@ GPUs, which can impact end-to-end latency.
.. _healthcheck-install-transferbench:
1. To get started, use the instructions in the `TransferBench documentation
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`__
or use the following commands:
.. code:: shell
@@ -102,5 +102,5 @@ GPUs, which can impact end-to-end latency.
CC=hipcc make
2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#transferbench>`__
in the Instinct performance benchmarking documentation for instructions.

View File

@@ -14,7 +14,7 @@ Training a model with Megatron-LM on ROCm
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
To learn how to migrate workloads from Megatron-LM to Primus with Megatron,

View File

@@ -18,7 +18,7 @@ model training. Performance acceleration is powered by `Primus Turbo
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM
training <megatron-lm>` workflow. To learn how to migrate workloads from
@@ -183,7 +183,7 @@ Configuration
=============
Primus defines a training configuration in YAML for each model in
`examples/megatron/configs <https://github.com/AMD-AGI/rss/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

View File

@@ -17,7 +17,7 @@ Primus now supports the PyTorch torchtitan backend.
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
Primus with the PyTorch torchtitan backend is designed to replace the
:doc:`ROCm PyTorch training <pytorch-training>` workflow. See

View File

@@ -14,7 +14,7 @@ Training a model with PyTorch on ROCm
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
See :doc:`primus-pytorch` for details.

View File

@@ -46,7 +46,7 @@ In DDP training, each process or worker owns a replica of the model and processe
See the following developer blogs for more in-depth explanations and examples.
* `Multi GPU training with DDP — PyTorch Tutorials <https://pytorch.org/tutorials/beginner/ddp_Series_multigpu.html>`_
* `Multi GPU training with DDP — PyTorch Tutorials <https://docs.pytorch.org/tutorials/beginner/ddp_series_multigpu.html>`__
* `Building a decoder transformer model on AMD GPUs — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/decoder-transformer/README.html#distributed-training-on-multiple-gpus>`_

View File

@@ -134,6 +134,8 @@ subtrees:
title: Profile and debug
- file: how-to/rocm-for-ai/inference-optimization/workload.rst
title: Workload optimization
- file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
title: vLLM V1 performance optimization
- url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
title: AI tutorials

View File

@@ -1,4 +1,4 @@
rocm-docs-core==1.26.0
rocm-docs-core==1.27.0
sphinx-reredirects
sphinx-sitemap
sphinxcontrib.datatemplates==0.11.0

View File

@@ -187,8 +187,8 @@ requests==2.32.5
# via
# pygithub
# sphinx
rocm-docs-core==1.26.0
# via -r docs/sphinx/requirements.in
rocm-docs-core==1.27.0
# via -r requirements.in
rpds-py==0.27.1
# via
# jsonschema
@@ -230,13 +230,13 @@ sphinx-last-updated-by-git==0.3.8
sphinx-notfound-page==1.1.0
# via rocm-docs-core
sphinx-reredirects==0.1.6
# via -r docs/sphinx/requirements.in
# via -r requirements.in
sphinx-sitemap==2.9.0
# via -r docs/sphinx/requirements.in
# via -r requirements.in
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-datatemplates==0.11.0
# via -r docs/sphinx/requirements.in
# via -r requirements.in
sphinxcontrib-devhelp==2.0.0
# via sphinx
sphinxcontrib-htmlhelp==2.1.0