mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-08 22:28:06 -05:00
Merge pull request #612 from ROCm/sync-develop-from-external
Sync develop from external for 7.1.0 GA
This commit is contained in:
@@ -130,7 +130,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: hipTensor
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
|
||||
testParameters: '-E ".*-extended" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
@@ -1,10 +1,35 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: rccl
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
- name: systemsRepo
|
||||
type: string
|
||||
default: systems_repo
|
||||
- name: systemsSparseCheckoutDir
|
||||
type: string
|
||||
default: 'projects/rocprofiler-sdk'
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -57,19 +82,28 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
buildJobs:
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
testJobs:
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
- name: downstreamComponentMatrix
|
||||
type: object
|
||||
default:
|
||||
- rocprofiler-sdk:
|
||||
name: rocprofiler-sdk
|
||||
sparseCheckoutDir: ''
|
||||
skipUnifiedBuild: 'false'
|
||||
buildDependsOn:
|
||||
- rccl_build
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: rccl_build_${{ job.target }}
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.os }}_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
variables:
|
||||
- group: common
|
||||
@@ -77,17 +111,23 @@ jobs:
|
||||
- name: HIP_ROCCLR_HOME
|
||||
value: $(Build.BinariesDirectory)/rocm
|
||||
pool: ${{ variables.MEDIUM_BUILD_POOL }}
|
||||
${{ if eq(job.os, 'almalinux8') }}:
|
||||
container:
|
||||
image: rocmexternalcicd.azurecr.io/manylinux228:latest
|
||||
endpoint: ContainerService3
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
submoduleBehaviour: recursive
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
@@ -97,10 +137,14 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
||||
@@ -112,58 +156,87 @@ jobs:
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
extraEnvVars:
|
||||
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
||||
installLatestCMake: true
|
||||
- ${{ if eq(job.os, 'ubuntu2204') }}:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
extraEnvVars:
|
||||
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
||||
installLatestCMake: true
|
||||
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rccl_test_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
dependsOn: rccl_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rccl
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './rccl-UnitTests'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './rccl-UnitTests'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ if parameters.triggerDownstreamJobs }}:
|
||||
- ${{ each component in parameters.downstreamComponentMatrix }}:
|
||||
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
|
||||
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.systemsRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
|
||||
triggerDownstreamJobs: true
|
||||
unifiedBuild: ${{ parameters.unifiedBuild }}
|
||||
${{ if parameters.unifiedBuild }}:
|
||||
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
|
||||
${{ else }}:
|
||||
buildDependsOn: ${{ component.buildDependsOn }}
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rocm-cmake
|
||||
testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
testParameters: '-E "pass-version-parent" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
|
||||
@@ -21,13 +21,19 @@ parameters:
|
||||
- libtbb-dev
|
||||
- libtiff-dev
|
||||
- libva-amdgpu-dev
|
||||
- libavcodec-dev
|
||||
- libavformat-dev
|
||||
- libavutil-dev
|
||||
- ninja-build
|
||||
- python3-pip
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- AMDMIGraphX
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
- hipBLASLt
|
||||
@@ -40,7 +46,10 @@ parameters:
|
||||
- hipTensor
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
- rocFFT
|
||||
- rocJPEG
|
||||
- rocPRIM
|
||||
@@ -57,7 +66,10 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- AMDMIGraphX
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
- hipBLASLt
|
||||
@@ -70,7 +82,10 @@ parameters:
|
||||
- hipTensor
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
- rocFFT
|
||||
- rocminfo
|
||||
- rocPRIM
|
||||
|
||||
@@ -79,27 +79,27 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
buildJobs:
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
testJobs:
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: rocprofiler_sdk_build_${{ job.target }}
|
||||
- job: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.target }}
|
||||
- ${{ build }}_${{ job.os}}_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ variables.MEDIUM_BUILD_POOL }}
|
||||
${{ if eq(job.os, 'almalinux8') }}:
|
||||
container:
|
||||
image: rocmexternalcicd.azurecr.io/manylinux228:latest
|
||||
endpoint: ContainerService3
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
@@ -107,6 +107,7 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
@@ -118,6 +119,7 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
@@ -132,6 +134,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DROCPROFILER_BUILD_TESTS=ON
|
||||
@@ -143,6 +146,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
@@ -158,8 +162,8 @@ jobs:
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rocprofiler_sdk_test_${{ job.target }}
|
||||
dependsOn: rocprofiler_sdk_build_${{ job.target }}
|
||||
- job: rocprofiler_sdk_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
@@ -177,6 +181,7 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
@@ -188,6 +193,7 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
@@ -202,6 +208,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DROCPROFILER_BUILD_TESTS=ON
|
||||
@@ -213,7 +220,8 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: $(Agent.BuildDirectory)/s/build
|
||||
os: ${{ job.os }}
|
||||
testDir: $(Agent.BuildDirectory)/build
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
@@ -13,7 +13,7 @@ parameters:
|
||||
default: ctest
|
||||
- name: testParameters
|
||||
type: string
|
||||
default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
|
||||
default: --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml
|
||||
- name: extraTestParameters
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
@@ -27,6 +27,7 @@ ASICs
|
||||
ASan
|
||||
ASAN
|
||||
ASm
|
||||
Async
|
||||
ATI
|
||||
atomicRMW
|
||||
AddressSanitizer
|
||||
@@ -34,6 +35,7 @@ AlexNet
|
||||
Andrej
|
||||
Arb
|
||||
Autocast
|
||||
autograd
|
||||
BARs
|
||||
BatchNorm
|
||||
BLAS
|
||||
@@ -86,9 +88,11 @@ Conda
|
||||
ConnectX
|
||||
CountOnes
|
||||
CuPy
|
||||
customizable
|
||||
da
|
||||
Dashboarding
|
||||
Dataloading
|
||||
dataflows
|
||||
DBRX
|
||||
DDR
|
||||
DF
|
||||
@@ -130,6 +134,7 @@ ELMo
|
||||
ENDPGM
|
||||
EPYC
|
||||
ESXi
|
||||
EP
|
||||
EoS
|
||||
etcd
|
||||
fas
|
||||
@@ -181,8 +186,9 @@ GPR
|
||||
GPT
|
||||
GPU
|
||||
GPU's
|
||||
GPUDirect
|
||||
GPUs
|
||||
Graphbolt
|
||||
GraphBolt
|
||||
GraphSage
|
||||
GRBM
|
||||
GRE
|
||||
@@ -212,6 +218,7 @@ Haswell
|
||||
Higgs
|
||||
href
|
||||
Hyperparameters
|
||||
HybridEngine
|
||||
Huggingface
|
||||
IB
|
||||
ICD
|
||||
@@ -298,6 +305,7 @@ Makefiles
|
||||
Matplotlib
|
||||
Matrox
|
||||
MaxText
|
||||
MBT
|
||||
Megablocks
|
||||
Megatrends
|
||||
Megatron
|
||||
@@ -307,6 +315,7 @@ Meta's
|
||||
Miniconda
|
||||
MirroredStrategy
|
||||
Mixtral
|
||||
MLA
|
||||
MosaicML
|
||||
MoEs
|
||||
Mooncake
|
||||
@@ -349,6 +358,7 @@ OFED
|
||||
OMM
|
||||
OMP
|
||||
OMPI
|
||||
OOM
|
||||
OMPT
|
||||
OMPX
|
||||
ONNX
|
||||
@@ -394,6 +404,7 @@ Profiler's
|
||||
PyPi
|
||||
Pytest
|
||||
PyTorch
|
||||
QPS
|
||||
Qcycles
|
||||
Qwen
|
||||
RAII
|
||||
@@ -669,6 +680,7 @@ denoised
|
||||
denoises
|
||||
denormalize
|
||||
dequantization
|
||||
dequantized
|
||||
dequantizes
|
||||
deserializers
|
||||
detections
|
||||
@@ -784,6 +796,7 @@ linalg
|
||||
linearized
|
||||
linter
|
||||
linux
|
||||
llm
|
||||
llvm
|
||||
lm
|
||||
localscratch
|
||||
@@ -834,6 +847,7 @@ passthrough
|
||||
pe
|
||||
perfcounter
|
||||
performant
|
||||
piecewise
|
||||
perl
|
||||
pragma
|
||||
pre
|
||||
@@ -980,6 +994,7 @@ tokenizer
|
||||
tokenizes
|
||||
toolchain
|
||||
toolchains
|
||||
topk
|
||||
toolset
|
||||
toolsets
|
||||
torchtitan
|
||||
@@ -1007,6 +1022,7 @@ USM
|
||||
UTCL
|
||||
UTIL
|
||||
utils
|
||||
UX
|
||||
vL
|
||||
variational
|
||||
vdi
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Deep Graph Library (DGL) compatibility
|
||||
:keywords: GPU, DGL compatibility
|
||||
:keywords: GPU, CPU, deep graph library, DGL, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,24 +10,42 @@
|
||||
DGL compatibility
|
||||
********************************************************************************
|
||||
|
||||
Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance and scalable
|
||||
Deep Graph Library (`DGL <https://www.dgl.ai/>`__) is an easy-to-use, high-performance, and scalable
|
||||
Python package for deep learning on graphs. DGL is framework agnostic, meaning
|
||||
if a deep graph model is a component in an end-to-end application, the rest of
|
||||
that if a deep graph model is a component in an end-to-end application, the rest of
|
||||
the logic is implemented using PyTorch.
|
||||
|
||||
* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
|
||||
* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
to install and get started.
|
||||
DGL provides a high-performance graph object that can reside on either CPUs or GPUs.
|
||||
It bundles structural data features for better control and provides a variety of functions
|
||||
for computing with graph objects, including efficient and customizable message passing
|
||||
primitives for Graph Neural Networks.
|
||||
|
||||
|
||||
Supported devices
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipblaslt)
|
||||
- **Partially Supported**: TF32 with AMD Instinct MI250X
|
||||
- The ROCm-supported version of DGL is maintained in the official `https://github.com/ROCm/dgl
|
||||
<https://github.com/ROCm/dgl>`__ repository, which differs from the
|
||||
`https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`__ upstream repository.
|
||||
|
||||
- To get started and install DGL on ROCm, use the prebuilt :ref:`Docker images <dgl-docker-compat>`,
|
||||
which include ROCm, DGL, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
DGL is supported on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI300X (through `hipBLASlt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`__)
|
||||
- **Partially Supported**: AMD Instinct™ MI250X
|
||||
|
||||
.. _dgl-recommendations:
|
||||
|
||||
@@ -35,7 +53,7 @@ Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
DGL can be used for Graph Learning, and building popular graph models like
|
||||
GAT, GCN and GraphSage. Using these we can support a variety of use-cases such as:
|
||||
GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:
|
||||
|
||||
- Recommender systems
|
||||
- Network Optimization and Analysis
|
||||
@@ -62,16 +80,17 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
|
||||
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest available DGL version from the official Docker Hub.
|
||||
Click the |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: DGL Docker image components
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- DGL
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
@@ -81,102 +100,106 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
|
||||
|
||||
Key ROCm libraries for DGL
|
||||
================================================================================
|
||||
|
||||
DGL on ROCm depends on specific libraries that affect its features and performance.
|
||||
Using the DGL Docker container or building it with the provided docker file or a ROCm base image is recommended.
|
||||
Using the DGL Docker container or building it with the provided Docker file or a ROCm base image is recommended.
|
||||
If you prefer to build it yourself, ensure the following dependencies are installed:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- ROCm 6.4.0 Version
|
||||
- Purpose
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- :version-ref:`"Composable Kernel" rocm_version`
|
||||
- 1.1.0
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- 2.4.0
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- 0.12.0
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- 3.4.0
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- 1.0.18
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- 2.12.0
|
||||
- Provides fast random number generation for GPUs.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- 2.4.0
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- 3.2.0
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- 0.2.3
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- :version-ref:`hipTensor rocm_version`
|
||||
- 1.5.0
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- 3.4.0
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- :version-ref:`MIGraphX rocm_version`
|
||||
- 2.12.0
|
||||
- Adds graph-level optimizations, ONNX models and mixed precision support
|
||||
and enable Ahead-of-Time (AOT) Compilation.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- :version-ref:`MIVisionX rocm_version`
|
||||
- 3.2.0
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
@@ -184,25 +207,25 @@ If you prefer to build it yourself, ensure the following dependencies are instal
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- 2.2.0
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- :version-ref:`rocDecode rocm_version`
|
||||
- 0.10.0
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- :version-ref:`rocJPEG rocm_version`
|
||||
- 0.8.0
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- :version-ref:`RPP rocm_version`
|
||||
- 1.9.10
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- 3.3.0
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- 1.7.0
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
@@ -211,14 +234,14 @@ If you prefer to build it yourself, ensure the following dependencies are instal
|
||||
Supported features
|
||||
================================================================================
|
||||
|
||||
Many functions and methods available in DGL Upstream are also supported in DGL ROCm.
|
||||
Many functions and methods available upstream are also supported in DGL on ROCm.
|
||||
Instead of listing them all, support is grouped into the following categories to provide a general overview.
|
||||
|
||||
* DGL Base
|
||||
* DGL Backend
|
||||
* DGL Data
|
||||
* DGL Dataloading
|
||||
* DGL DGLGraph
|
||||
* DGL Graph
|
||||
* DGL Function
|
||||
* DGL Ops
|
||||
* DGL Sampling
|
||||
@@ -235,9 +258,9 @@ Instead of listing them all, support is grouped into the following categories to
|
||||
Unsupported features
|
||||
================================================================================
|
||||
|
||||
* Graphbolt
|
||||
* Partial TF32 Support (MI250x only)
|
||||
* Kineto/ ROCTracer integration
|
||||
* GraphBolt
|
||||
* Partial TF32 Support (MI250X only)
|
||||
* Kineto/ROCTracer integration
|
||||
|
||||
|
||||
Unsupported functions
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: FlashInfer deep learning framework compatibility
|
||||
:keywords: GPU, LLM, FlashInfer, compatibility
|
||||
:description: FlashInfer compatibility
|
||||
:keywords: GPU, LLM, FlashInfer, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -11,7 +11,7 @@ FlashInfer compatibility
|
||||
********************************************************************************
|
||||
|
||||
`FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator
|
||||
for Large Language Models (LLMs) that provides high-performance implementation of graphics
|
||||
for Large Language Models (LLMs) that provides a high-performance implementation of graphics
|
||||
processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well
|
||||
as advanced performance across diverse scenarios.
|
||||
|
||||
@@ -25,28 +25,30 @@ offers high-performance LLM-specific operators, with easy integration through Py
|
||||
For the latest feature compatibility matrix, refer to the ``README`` of the
|
||||
`https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.
|
||||
|
||||
Support for the ROCm port of FlashInfer is available as follows:
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer
|
||||
<https://github.com/ROCm/flashinfer>`__ repository. This location differs from the
|
||||
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`_
|
||||
- The ROCm-supported version of FlashInfer is maintained in the official `https://github.com/ROCm/flashinfer
|
||||
<https://github.com/ROCm/flashinfer>`__ repository, which differs from the
|
||||
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`__
|
||||
upstream repository.
|
||||
|
||||
- To install FlashInfer, use the prebuilt :ref:`Docker image <flashinfer-docker-compat>`,
|
||||
which includes ROCm, FlashInfer, and all required dependencies.
|
||||
- To get started and install FlashInfer on ROCm, use the prebuilt :ref:`Docker images <flashinfer-docker-compat>`,
|
||||
which include ROCm, FlashInfer, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
|
||||
to install and get started.
|
||||
for installation and setup instructions.
|
||||
|
||||
- See the `Installation guide <https://docs.flashinfer.ai/installation.html>`__
|
||||
in the upstream FlashInfer documentation.
|
||||
- You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
.. note::
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Flashinfer is supported on ROCm 6.4.1.
|
||||
FlashInfer is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X
|
||||
|
||||
@@ -78,10 +80,9 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the FlashInfer version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
AMD validates and publishes `FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available FlashInfer version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: JAX compatibility
|
||||
:keywords: GPU, JAX compatibility
|
||||
:keywords: GPU, JAX, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,42 +10,38 @@
|
||||
JAX compatibility
|
||||
*******************************************************************************
|
||||
|
||||
JAX provides a NumPy-like API, which combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale.
|
||||
`JAX <https://docs.jax.dev/en/latest/notebooks/thinking_in_jax.html>`__ is a library
|
||||
for array-oriented numerical computation (similar to NumPy), with automatic differentiation
|
||||
and just-in-time (JIT) compilation to enable high-performance machine learning research.
|
||||
|
||||
JAX uses composable transformations of Python and NumPy through just-in-time
|
||||
(JIT) compilation, automatic vectorization, and parallelization. To learn about
|
||||
JAX, including profiling and optimizations, see the official `JAX documentation
|
||||
<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.
|
||||
JAX provides an API that combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale. JAX uses composable transformations of Python and NumPy through
|
||||
JIT compilation, automatic vectorization, and parallelization.
|
||||
|
||||
ROCm support for JAX is upstreamed, and users can build the official source code
|
||||
with ROCm support:
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- ROCm JAX release:
|
||||
- The ROCm-supported version of JAX is maintained in the official `https://github.com/ROCm/rocm-jax
|
||||
<https://github.com/ROCm/rocm-jax>`__ repository, which differs from the
|
||||
`https://github.com/jax-ml/jax <https://github.com/jax-ml/jax>`__ upstream repository.
|
||||
|
||||
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
|
||||
with ROCm and JAX preinstalled.
|
||||
- To get started and install JAX on ROCm, use the prebuilt :ref:`Docker images <jax-docker-compat>`,
|
||||
which include ROCm, JAX, and all required dependencies.
|
||||
|
||||
- ROCm JAX repository: `ROCm/rocm-jax <https://github.com/ROCm/rocm-jax>`_
|
||||
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
to get started.
|
||||
- You can also consult the upstream `Installation guide <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`__
|
||||
for additional context.
|
||||
|
||||
- Official JAX release:
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_
|
||||
|
||||
- See the `AMD GPU (Linux) installation section
|
||||
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
|
||||
the JAX documentation.
|
||||
|
||||
.. note::
|
||||
|
||||
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax/tags>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community/tags>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
@@ -71,7 +67,7 @@ Use cases and recommendations
|
||||
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
|
||||
outlines the process of fine-tuning a Bidirectional Encoder Representations
|
||||
from Transformers (BERT)-based large language model (LLM) using JAX for a text
|
||||
classification task. The blog post discuss techniques for parallelizing the
|
||||
classification task. The blog post discusses techniques for parallelizing the
|
||||
fine-tuning across multiple AMD GPUs and assess the model's performance on a
|
||||
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
|
||||
and the General Language Understanding Evaluation (GLUE) benchmark dataset was
|
||||
@@ -90,9 +86,9 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD provides preconfigured Docker images with JAX and the ROCm backend.
|
||||
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/jax>`__ and are the
|
||||
recommended way to get started with deep learning with JAX on ROCm.
|
||||
AMD validates and publishes `JAX images <https://hub.docker.com/r/rocm/jax/tags>`__
|
||||
with ROCm backends on Docker Hub.
|
||||
|
||||
For ``jax-community`` images, see `rocm/jax-community
|
||||
<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
|
||||
|
||||
@@ -234,7 +230,7 @@ The ROCm supported data types in JAX are collected in the following table.
|
||||
|
||||
.. note::
|
||||
|
||||
JAX data type support is effected by the :ref:`key_rocm_libraries` and it's
|
||||
JAX data type support is affected by the :ref:`key_rocm_libraries` and it's
|
||||
collected on :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
|
||||
page.
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: llama.cpp deep learning framework compatibility
|
||||
:keywords: GPU, GGML, llama.cpp compatibility
|
||||
:description: llama.cpp compatibility
|
||||
:keywords: GPU, GGML, llama.cpp, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -20,33 +20,32 @@ to accelerate inference and reduce memory usage. Originally built as a CPU-first
|
||||
llama.cpp is easy to integrate with other programming environments and is widely
|
||||
adopted across diverse platforms, including consumer devices.
|
||||
|
||||
ROCm support for llama.cpp is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp
|
||||
<https://github.com/ROCm/llama.cpp>`_ repository.
|
||||
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
|
||||
|
||||
- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`,
|
||||
which includes ROCm, llama.cpp, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
||||
to install and get started.
|
||||
|
||||
- See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__
|
||||
in the upstream llama.cpp documentation.
|
||||
|
||||
.. note::
|
||||
|
||||
llama.cpp is supported on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
|
||||
Supported devices
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
|
||||
- The ROCm-supported version of llama.cpp is maintained in the official `https://github.com/ROCm/llama.cpp
|
||||
<https://github.com/ROCm/llama.cpp>`__ repository, which differs from the
|
||||
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`__ upstream repository.
|
||||
|
||||
- To get started and install llama.cpp on ROCm, use the prebuilt :ref:`Docker images <llama-cpp-docker-compat>`,
|
||||
which include ROCm, llama.cpp, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
llama.cpp is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and
|
||||
`ROCm 6.4.x <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
@@ -84,9 +83,9 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
|
||||
AMD validates and publishes `llama.cpp images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the available llama.cpp versions from the official Docker Hub.
|
||||
inventories represent the latest available llama.cpp versions from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. important::
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Megablocks compatibility
|
||||
:keywords: GPU, megablocks, compatibility
|
||||
:keywords: GPU, megablocks, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,28 +10,42 @@
|
||||
Megablocks compatibility
|
||||
********************************************************************************
|
||||
|
||||
Megablocks is a light-weight library for mixture-of-experts (MoE) training.
|
||||
`Megablocks <https://github.com/databricks/megablocks>`__ is a lightweight library
|
||||
for mixture-of-experts `(MoE) <https://huggingface.co/blog/moe>`__ training.
|
||||
The core of the system is efficient "dropless-MoE" and standard MoE layers.
|
||||
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_,
|
||||
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM
|
||||
<https://github.com/stanford-futuredata/Megatron-LM>`__,
|
||||
where data and pipeline parallel training of MoEs is supported.
|
||||
|
||||
* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks <https://github.com/ROCm/megablocks>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <megablocks-docker-compat>` with ROCm, PyTorch, and Megablocks preinstalled.
|
||||
* See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` to install and get started.
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
.. note::
|
||||
- The ROCm-supported version of Megablocks is maintained in the official `https://github.com/ROCm/megablocks
|
||||
<https://github.com/ROCm/megablocks>`__ repository, which differs from the
|
||||
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
|
||||
|
||||
Megablocks is supported on ROCm 6.3.0.
|
||||
- To get started and install Megablocks on ROCm, use the prebuilt :ref:`Docker image <megablocks-docker-compat>`,
|
||||
which includes ROCm, Megablocks, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Megablocks is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- **Officially Supported**: AMD Instinct MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
|
||||
- **Officially Supported**: AMD Instinct™ MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
|
||||
|
||||
Supported models and features
|
||||
================================================================================
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
This section summarizes the Megablocks features supported by ROCm.
|
||||
|
||||
@@ -41,20 +55,28 @@ This section summarizes the Megablocks features supported by ROCm.
|
||||
* Mixture-of-Experts
|
||||
* dropless-Mixture-of-Experts
|
||||
|
||||
|
||||
.. _megablocks-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The `ROCm Megablocks blog posts <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_
|
||||
guide how to leverage the ROCm platform for pre-training using the Megablocks framework.
|
||||
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
|
||||
blog post guides how to leverage the ROCm platform for pre-training using the
|
||||
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
|
||||
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
|
||||
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
|
||||
training. The guide provides step-by-step instructions for setting up the environment,
|
||||
including cloning the repository, building the Docker image, and running the training container.
|
||||
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
|
||||
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
|
||||
training workflows for large-scale transformer models.
|
||||
|
||||
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
|
||||
.. _megablocks-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
@@ -64,10 +86,9 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Megatron-LM version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available Megablocks version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch compatibility
|
||||
:keywords: GPU, PyTorch, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -15,40 +15,42 @@ deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
|
||||
using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
|
||||
`RCCL <https://github.com/ROCm/rccl>`__ libraries.
|
||||
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due
|
||||
to independent compatibility considerations, this results in two distinct
|
||||
release cycles for PyTorch on ROCm:
|
||||
PyTorch provides two high-level features:
|
||||
|
||||
- ROCm PyTorch release:
|
||||
- Tensor computation (like NumPy) with strong GPU acceleration
|
||||
|
||||
- Provides the latest version of ROCm but might not necessarily support the
|
||||
latest stable PyTorch version.
|
||||
- Deep neural networks built on a tape-based autograd system (rapid computation
|
||||
of multiple partial derivatives or gradients)
|
||||
|
||||
- Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
|
||||
preinstalled.
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`__
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository.
|
||||
ROCm development is aligned with the stable release of PyTorch, while upstream
|
||||
PyTorch testing uses the stable release of ROCm to maintain consistency:
|
||||
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
to get started.
|
||||
- The ROCm-supported version of PyTorch is maintained in the official `https://github.com/ROCm/pytorch
|
||||
<https://github.com/ROCm/pytorch>`__ repository, which differs from the
|
||||
`https://github.com/pytorch/pytorch <https://github.com/pytorch/pytorch>`__ upstream repository.
|
||||
|
||||
- Official PyTorch release:
|
||||
- To get started and install PyTorch on ROCm, use the prebuilt :ref:`Docker images <pytorch-docker-compat>`,
|
||||
which include ROCm, PyTorch, and all required dependencies.
|
||||
|
||||
- Provides the latest stable version of PyTorch but might not necessarily
|
||||
support the latest ROCm version.
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
|
||||
|
||||
- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`__
|
||||
or `Previous versions <https://pytorch.org/get-started/previous-versions/>`__
|
||||
to get started.
|
||||
- You can also consult the upstream `Installation guide <https://pytorch.org/get-started/locally/>`__ or
|
||||
`Previous versions <https://pytorch.org/get-started/previous-versions/>`__ for additional context.
|
||||
|
||||
PyTorch includes tooling that generates HIP source code from the CUDA backend.
|
||||
This approach allows PyTorch to support ROCm without requiring manual code
|
||||
modifications. For more information, see :doc:`HIPIFY <hipify:index>`.
|
||||
|
||||
ROCm development is aligned with the stable release of PyTorch, while upstream
|
||||
PyTorch testing uses the stable release of ROCm to maintain consistency.
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
AMD releases official `ROCm PyTorch Docker images <https://hub.docker.com/r/rocm/pytorch/tags>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
|
||||
.. _pytorch-recommendations:
|
||||
|
||||
@@ -78,7 +80,7 @@ Use cases and recommendations
|
||||
GPU.
|
||||
|
||||
* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
|
||||
describes how PyTorch integrates with ROCm for AI workloads It outlines the
|
||||
describes how PyTorch integrates with ROCm for AI workloads. It outlines the
|
||||
use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
|
||||
GPU hardware for training and inference tasks in AI applications.
|
||||
|
||||
@@ -89,9 +91,8 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD provides preconfigured Docker images with PyTorch and the ROCm backend.
|
||||
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/pytorch>`__ and are the
|
||||
recommended way to get started with deep learning with PyTorch on ROCm.
|
||||
AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch/tags>`__
|
||||
with ROCm backends on Docker Hub.
|
||||
|
||||
To find the right image tag, see the :ref:`PyTorch on ROCm installation
|
||||
documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Ray deep learning framework compatibility
|
||||
:keywords: GPU, Ray compatibility
|
||||
:description: Ray compatibility
|
||||
:keywords: GPU, Ray, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -19,36 +19,35 @@ simplifying machine learning computations.
|
||||
Ray is a general-purpose framework that runs many types of workloads efficiently.
|
||||
Any Python application can be scaled with Ray, without extra infrastructure.
|
||||
|
||||
ROCm support for Ray is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`_ repository.
|
||||
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
|
||||
|
||||
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for instructions to get started.
|
||||
|
||||
- See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_
|
||||
in the upstream Ray documentation.
|
||||
|
||||
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
|
||||
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
Ray is supported on ROCm 6.4.1.
|
||||
|
||||
Supported devices
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI210
|
||||
- The ROCm-supported version of Ray is maintained in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`__ repository, which differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`__ upstream repository.
|
||||
|
||||
- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`,
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels
|
||||
<https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
|
||||
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Ray is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI210
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
@@ -88,15 +87,15 @@ Docker image compatibility
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub and are validated for
|
||||
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
|
||||
icon to view the image on Docker Hub.
|
||||
associated inventories represent the latest Ray version from the official Docker Hub.
|
||||
Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Ray
|
||||
- Pytorch
|
||||
- Ubuntu
|
||||
@@ -105,6 +104,7 @@ icon to view the image on Docker Hub.
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
|
||||
- 2.6.0+git684f6f2
|
||||
- 24.04
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Stanford Megatron-LM compatibility
|
||||
:keywords: Stanford, Megatron-LM, compatibility
|
||||
:keywords: Stanford, Megatron-LM, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,34 +10,50 @@
|
||||
Stanford Megatron-LM compatibility
|
||||
********************************************************************************
|
||||
|
||||
Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
|
||||
designed to train massive transformer-based language models efficiently by model and data parallelism.
|
||||
Stanford Megatron-LM is a large-scale language model training framework developed
|
||||
by NVIDIA at `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_.
|
||||
It is designed to train massive transformer-based language models efficiently by model
|
||||
and data parallelism.
|
||||
|
||||
* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled.
|
||||
* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
|
||||
It provides efficient tensor, pipeline, and sequence-based model parallelism for
|
||||
pre-training transformer-based language models such as GPT (Decoder Only), BERT
|
||||
(Encoder Only), and T5 (Encoder-Decoder).
|
||||
|
||||
.. note::
|
||||
|
||||
Stanford Megatron-LM is supported on ROCm 6.3.0.
|
||||
|
||||
|
||||
Supported Devices
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: AMD Instinct MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
|
||||
- The ROCm-supported version of Stanford Megatron-LM is maintained in the official `https://github.com/ROCm/Stanford-Megatron-LM
|
||||
<https://github.com/ROCm/Stanford-Megatron-LM>`__ repository, which differs from the
|
||||
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
|
||||
|
||||
- To get started and install Stanford Megatron-LM on ROCm, use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>`,
|
||||
which includes ROCm, Stanford Megatron-LM, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Stanford Megatron-LM is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
|
||||
|
||||
Supported models and features
|
||||
================================================================================
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
This section details models & features that are supported by the ROCm version on Stanford Megatron-LM.
|
||||
|
||||
Models:
|
||||
|
||||
* Bert
|
||||
* BERT
|
||||
* GPT
|
||||
* T5
|
||||
* ICT
|
||||
@@ -54,13 +70,24 @@ Features:
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ post
|
||||
to leverage the ROCm platform for pre-training by using the Stanford Megatron-LM framework of pre-processing datasets on AMD GPUs.
|
||||
Coverage includes:
|
||||
The following blog post mentions Megablocks, but you can run Stanford Megatron-LM with the same steps to pre-process datasets on AMD GPUs:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
|
||||
blog post guides how to leverage the ROCm platform for pre-training using the
|
||||
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
|
||||
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
|
||||
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
|
||||
training. The guide provides step-by-step instructions for setting up the environment,
|
||||
including cloning the repository, building the Docker image, and running the training container.
|
||||
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
|
||||
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
|
||||
training workflows for large-scale transformer models.
|
||||
|
||||
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
.. _megatron-lm-docker-compat:
|
||||
|
||||
@@ -71,10 +98,9 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Megatron-LM version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
@@ -82,6 +108,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Stanford Megatron-LM
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
@@ -91,6 +118,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Taichi compatibility
|
||||
:keywords: GPU, Taichi compatibility
|
||||
:keywords: GPU, Taichi, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -19,28 +19,52 @@ Taichi is widely used across various domains, including real-time physical simul
|
||||
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
|
||||
visual effects in film and gaming, and general-purpose computing.
|
||||
|
||||
* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
|
||||
* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
.. note::
|
||||
- The ROCm-supported version of Taichi is maintained in the official `https://github.com/ROCm/taichi
|
||||
<https://github.com/ROCm/taichi>`__ repository, which differs from the
|
||||
`https://github.com/taichi-dev/taichi <https://github.com/taichi-dev/taichi>`__ upstream repository.
|
||||
|
||||
Taichi is supported on ROCm 6.3.2.
|
||||
- To get started and install Taichi on ROCm, use the prebuilt :ref:`Docker image <taichi-docker-compat>`,
|
||||
which includes ROCm, Taichi, and all required dependencies.
|
||||
|
||||
Supported devices and features
|
||||
===============================================================================
|
||||
There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X Series GPUs with the exception of Taichi’s GPU rendering system, CGUI.
|
||||
AMD Instinct MI300X Series GPUs will be supported by November.
|
||||
- See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/taichi-dev/taichi>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Taichi is supported on `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI250X, MI210X (with the exception of Taichi’s GPU rendering system, CGUI)
|
||||
- **Upcoming Support**: AMD Instinct™ MI300X
|
||||
|
||||
.. _taichi-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators.
|
||||
A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository,
|
||||
providing practical insights and foundational knowledge for working with the Taichi programming language.
|
||||
You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
|
||||
|
||||
* The `Accelerating Parallel Programming in Python with Taichi Lang on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/taichi/README.html>`__
|
||||
blog highlights Taichi as an open-source programming language designed for high-performance
|
||||
numerical computation, particularly in domains like real-time physical simulation,
|
||||
artificial intelligence, computer vision, robotics, and visual effects. Taichi
|
||||
is embedded in Python and uses just-in-time (JIT) compilation frameworks like
|
||||
LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility
|
||||
of Taichi in enabling complex simulations and numerical algorithms, making
|
||||
it ideal for developers working on compute-intensive tasks. Developers are
|
||||
encouraged to follow recommended coding patterns and utilize Taichi decorators
|
||||
for performance optimization, with examples available in the `https://github.com/ROCm/taichi_examples
|
||||
<https://github.com/ROCm/taichi_examples>`_ repository. Prebuilt Docker images
|
||||
integrating ROCm, PyTorch, and Taichi are provided for simplified installation
|
||||
and deployment, making it easier to leverage Taichi for advanced computational workloads.
|
||||
|
||||
.. _taichi-docker-compat:
|
||||
|
||||
@@ -52,9 +76,8 @@ Docker image compatibility
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest Taichi version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -12,37 +12,33 @@ TensorFlow compatibility
|
||||
|
||||
`TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
|
||||
solving machine learning, deep learning, and AI problems. It can solve many
|
||||
problems across different sectors and industries but primarily focuses on
|
||||
neural network training and inference. It is one of the most popular and
|
||||
in-demand frameworks and is very active in open-source contribution and
|
||||
development.
|
||||
problems across different sectors and industries, but primarily focuses on
|
||||
neural network training and inference. It is one of the most popular deep
|
||||
learning frameworks and is very active in open-source development.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of TensorFlow is maintained in the official `https://github.com/ROCm/tensorflow-upstream
|
||||
<https://github.com/ROCm/tensorflow-upstream>`__ repository, which differs from the
|
||||
`https://github.com/tensorflow/tensorflow <https://github.com/tensorflow/tensorflow>`__ upstream repository.
|
||||
|
||||
- To get started and install TensorFlow on ROCm, use the prebuilt :ref:`Docker images <tensorflow-docker-compat>`,
|
||||
which include ROCm, TensorFlow, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
|
||||
includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
|
||||
<http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
|
||||
fixes, updates, and support for the latest ROCM versions.
|
||||
|
||||
- ROCm TensorFlow release:
|
||||
|
||||
- Offers :ref:`Docker images <tensorflow-docker-compat>` with
|
||||
ROCm and TensorFlow pre-installed.
|
||||
|
||||
- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__
|
||||
|
||||
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
to get started.
|
||||
|
||||
- Official TensorFlow release:
|
||||
|
||||
- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__
|
||||
|
||||
- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.
|
||||
|
||||
.. note::
|
||||
|
||||
The official TensorFlow documentation does not cover ROCm support. Use the
|
||||
ROCm documentation for installation instructions for Tensorflow on ROCm.
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/tensorflow-install`.
|
||||
fixes, updates, and support for the latest ROCm versions.
|
||||
|
||||
.. _tensorflow-docker-compat:
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: verl compatibility
|
||||
:keywords: GPU, verl compatibility
|
||||
:keywords: GPU, verl, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,24 +10,58 @@
|
||||
verl compatibility
|
||||
*******************************************************************************
|
||||
|
||||
Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs).
|
||||
verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
|
||||
Volcano Engine Reinforcement Learning for LLMs (`verl <https://verl.readthedocs.io/en/latest/>`__)
|
||||
is a reinforcement learning framework designed for large language models (LLMs).
|
||||
verl offers a scalable, open-source fine-tuning solution by using a hybrid programming model
|
||||
that makes it easy to define and run complex post-training dataflows efficiently.
|
||||
|
||||
* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl.
|
||||
* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
|
||||
* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled.
|
||||
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to install and get started.
|
||||
Its modular APIs separate computation from data, allowing smooth integration with other frameworks.
|
||||
It also supports flexible model placement across GPUs for efficient scaling on different cluster sizes.
|
||||
verl achieves high training and generation throughput by building on existing LLM frameworks.
|
||||
Its 3D-HybridEngine reduces memory use and communication overhead when switching between training
|
||||
and inference, improving overall performance.
|
||||
|
||||
.. note::
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
verl is supported on ROCm 6.2.0.
|
||||
- The ROCm-supported version of verl is maintained in the official `https://github.com/ROCm/verl
|
||||
<https://github.com/ROCm/verl>`__ repository, which differs from the
|
||||
`https://github.com/volcengine/verl <https://github.com/volcengine/verl>`__ upstream repository.
|
||||
|
||||
- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`,
|
||||
which includes ROCm, verl, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
verl is supported on `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X
|
||||
|
||||
.. _verl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.
|
||||
* The benefits of verl in large-scale reinforcement learning from human feedback
|
||||
(RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD
|
||||
GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog. The blog post outlines how the Volcano Engine Reinforcement Learning
|
||||
(verl) framework integrates with the AMD ROCm platform to optimize training on
|
||||
Instinct™ MI300X GPUs. The guide details the process of building a Docker image,
|
||||
setting up single-node and multi-node training environments, and highlights
|
||||
performance benchmarks demonstrating improved throughput and convergence accuracy.
|
||||
This resource serves as a comprehensive starting point for deploying verl on AMD GPUs,
|
||||
facilitating efficient RLHF training workflows.
|
||||
|
||||
.. _verl-supported_features:
|
||||
|
||||
@@ -61,8 +95,10 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the available verl versions from the official Docker Hub.
|
||||
AMD validates and publishes ready-made `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest verl version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
1139
docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
Normal file
1139
docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
Normal file
File diff suppressed because it is too large
Load Diff
@@ -15,10 +15,9 @@ using PyTorch. It delves into specific workloads such as
|
||||
:ref:`model inference <mi300x-vllm-optimization>`, offering strategies to
|
||||
enhance efficiency.
|
||||
|
||||
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>`
|
||||
that streamline optimization as well as advanced techniques like
|
||||
:ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>` for
|
||||
meticulous tuning.
|
||||
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>` as
|
||||
well as :ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>`
|
||||
for meticulous tuning.
|
||||
|
||||
Workload tuning strategy
|
||||
========================
|
||||
@@ -86,27 +85,28 @@ Optimize model inference with vLLM
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
vLLM provides tools and techniques specifically designed for efficient model
|
||||
inference on AMD Instinct MI300X GPUs. See :ref:`fine-tuning-llms-vllm`
|
||||
for installation guidance. Optimizing performance with vLLM
|
||||
involves configuring tensor parallelism, leveraging advanced features, and
|
||||
ensuring efficient execution. Here’s how to optimize vLLM performance:
|
||||
inference on AMD Instinct GPUs. See the official `vLLM installation docs
|
||||
<https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html>`__ for
|
||||
installation guidance. Optimizing performance with vLLM involves configuring
|
||||
tensor parallelism, leveraging advanced features, and ensuring efficient
|
||||
execution.
|
||||
|
||||
* Tensor parallelism: Configure the
|
||||
:ref:`tensor-parallel-size parameter <mi300x-vllm-multiple-gpus>` to distribute
|
||||
tensor computations across multiple GPUs. Adjust parameters such as
|
||||
``batch-size``, ``input-len``, and ``output-len`` based on your workload.
|
||||
|
||||
* Configuration for vLLM: Set :ref:`parameters <mi300x-vllm-optimization>`
|
||||
according to workload requirements. Benchmark performance to understand
|
||||
characteristics and identify bottlenecks.
|
||||
* Configuration for vLLM: Set engine arguments according to workload
|
||||
requirements.
|
||||
|
||||
* Benchmarking and performance metrics: Measure latency and throughput to
|
||||
evaluate performance.
|
||||
|
||||
.. seealso::
|
||||
|
||||
See :doc:`vllm-optimization` to learn more about vLLM performance
|
||||
optimization techniques.
|
||||
|
||||
.. _mi300x-auto-tune:
|
||||
|
||||
Auto-tunable configurations
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Auto-tunable configurations can significantly streamline performance
|
||||
optimization by automatically adjusting parameters based on workload
|
||||
characteristics. For example:
|
||||
@@ -120,8 +120,7 @@ characteristics. For example:
|
||||
your specific hardware.
|
||||
|
||||
* Triton: Use :ref:`Triton’s auto-tuning features <mi300x-autotunable-kernel-config>`
|
||||
to explore various kernel configurations and automatically select the
|
||||
best-performing ones.
|
||||
to explore various kernel configurations and select the best-performing ones.
|
||||
|
||||
Manual tuning
|
||||
^^^^^^^^^^^^^
|
||||
@@ -328,380 +327,21 @@ hardware counters are also included.
|
||||
|
||||
ROCm Systems Profiler timeline trace example.
|
||||
|
||||
.. _mi300x-vllm-optimization:
|
||||
|
||||
vLLM performance optimization
|
||||
=============================
|
||||
|
||||
vLLM is a high-throughput and memory efficient inference and serving engine for large language models that has gained traction in the AI community for
|
||||
its performance and ease of use. See :ref:`fine-tuning-llms-vllm` for a primer on vLLM with ROCm.
|
||||
|
||||
Performance environment variables
|
||||
---------------------------------
|
||||
|
||||
The following performance tips are not *specific* to vLLM -- they are general
|
||||
but relevant in this context. You can tune the following vLLM parameters to
|
||||
achieve optimal request latency and throughput performance.
|
||||
|
||||
* As described in `Environment variables (MI300X)
|
||||
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
|
||||
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
|
||||
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
|
||||
|
||||
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
|
||||
to ``112`` to increase the number of channels on MI300X to potentially improve
|
||||
performance.
|
||||
|
||||
* Set the environment variable ``TORCH_BLAS_PREFER_HIPBLASLT=1`` to use hipBLASLt to improve performance.
|
||||
|
||||
Auto-tuning using PyTorch TunableOp
|
||||
------------------------------------
|
||||
|
||||
Since vLLM is based on the PyTorch framework, PyTorch TunableOp can be used for auto-tuning.
|
||||
You can run auto-tuning with TunableOp in two simple steps without modifying your code:
|
||||
|
||||
* Enable TunableOp and tuning. Optionally, enable verbose mode:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_VERBOSE=1 your_vllm_script.sh
|
||||
|
||||
* Enable TunableOp and disable tuning and measure.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_vllm_script.sh
|
||||
|
||||
Learn more about TunableOp in the :ref:`PyTorch TunableOp <mi300x-tunableop>` section.
|
||||
|
||||
Performance tuning based on vLLM engine configurations
|
||||
-------------------------------------------------------
|
||||
|
||||
The following subsections describe vLLM-specific configurations for performance tuning.
|
||||
You can tune the following vLLM parameters to achieve optimal performance.
|
||||
|
||||
* ``tensor_parallel_size``
|
||||
|
||||
* ``gpu_memory_utilization``
|
||||
|
||||
* ``dtype``
|
||||
|
||||
* ``enforce_eager``
|
||||
|
||||
* ``kv_cache_dtype``
|
||||
|
||||
* ``input_len``
|
||||
|
||||
* ``output_len``
|
||||
|
||||
* ``max_num_seqs``
|
||||
|
||||
* ``num_scheduler_steps``
|
||||
|
||||
* ``max_model_len``
|
||||
|
||||
* ``enable_chunked_prefill``
|
||||
|
||||
* ``distributed_executor_backend``
|
||||
|
||||
* ``max_seq_len_to_capture``
|
||||
|
||||
Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.html>`_
|
||||
for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
|
||||
usage with ROCm.
|
||||
|
||||
ROCm provides a prebuilt optimized Docker image for validating the performance
|
||||
of LLM inference with vLLM on MI300X Series GPUs. The Docker image includes
|
||||
ROCm, vLLM, and PyTorch. For more information, see
|
||||
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
|
||||
|
||||
.. _mi300x-vllm-throughput-measurement:
|
||||
|
||||
Evaluating performance by throughput measurement
|
||||
-------------------------------------------------
|
||||
|
||||
This tuning guide evaluates the performance of LLM inference workloads by measuring throughput in tokens per second (TPS). Throughput can be assessed using both real-world and synthetic data, depending on your evaluation goals.
|
||||
|
||||
Refer to the benchmarking script located at ``benchmarks/benchmark_throughput.py`` in the `vLLM repository <https://github.com/ROCm/vllm/blob/main/benchmarks/benchmark_throughput.py>`_.
|
||||
Use this script to measure throughput effectively. You can assess throughput using real-world and synthetic data, depending on your evaluation goals.
|
||||
|
||||
* For realistic performance evaluation, you can use datasets like Hugging Face's
|
||||
``ShareGPT_V3_unfiltered_cleaned_split.json``. This dataset includes real-world conversational
|
||||
data, making it a good representation of typical use cases for language models. Download it using
|
||||
the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
* For standardized benchmarking, you can set fixed input and output token
|
||||
lengths. Synthetic prompts provide consistent benchmarking runs, making it
|
||||
easier to compare performance across different models or configurations.
|
||||
Additionally, a controlled environment simplifies analysis.
|
||||
|
||||
By balancing real-world data and synthetic data approaches, you can get a well-rounded understanding of model performance in varied scenarios.
|
||||
|
||||
.. _mi300x-vllm-single-node:
|
||||
|
||||
Maximizing vLLM instances on a single node
|
||||
------------------------------------------
|
||||
|
||||
The general guideline is to maximize per-node throughput by running as many vLLM instances as possible.
|
||||
However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance.
|
||||
|
||||
The Instinct MI300X GPU is equipped with 192 GB of HBM3 memory capacity and bandwidth.
|
||||
For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances
|
||||
simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment
|
||||
variable ``CUDA_VISIBLE_DEVICES``.
|
||||
|
||||
For example, this script runs eight instances of vLLM for throughput benchmarking at the same time
|
||||
with a model that can fit in one GPU:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
for i in $(seq 0 7);
|
||||
do
|
||||
CUDA_VISIBLE_DEVICES="$i" python3 /app/vllm/benchmarks/benchmark_throughput.py -tp 1 --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
done
|
||||
|
||||
The total throughput achieved by running ``N`` instances of vLLM is generally much higher than running a
|
||||
single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or
|
||||
using the ``-tp`` N option, where ``1 < N ≤ 8``).
|
||||
|
||||
vLLM on MI300X GPUs can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
|
||||
Notable configurations include Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model can fit on one single node with 8 MI300X GPUs.
|
||||
|
||||
.. _mi300x-vllm-gpu-memory-utilization:
|
||||
|
||||
Configure the gpu_memory_utilization parameter
|
||||
----------------------------------------------
|
||||
|
||||
There are two ways to increase throughput by configuring ``gpu-memory-utilization`` parameter.
|
||||
|
||||
1. Increase ``gpu-memory-utilization`` to improve the throughput for a single instance as long as
|
||||
it does not incur HIP or CUDA Out Of Memory. The default ``gpu-memory-utilization`` is 0.9.
|
||||
You can set it to ``>0.9`` and ``<1``.
|
||||
|
||||
For example, below benchmarking command set the ``gpu-memory-utilization`` as 0.98, or 98%.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
/vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.98 --input-len 1024 --output-len 128 --model /path/to/model
|
||||
|
||||
2. Decrease ``gpu-memory-utilization`` to maximize the number of vLLM instances on the same GPU.
|
||||
|
||||
Specify GPU memory utilization to run as many instances of vLLM as possible on a single
|
||||
GPU. However, too many instances can result in no memory for KV-cache. For small models, run
|
||||
multiple instances of vLLM on the same GPU by specifying a smaller ``gpu-memory-utilization`` -- as
|
||||
long as it would not cause HIP Out Of Memory.
|
||||
|
||||
For example, run two instances of the Llama3-8b model at the same time on a single GPU by specifying
|
||||
``--gpu-memory-utilization`` to 0.4 (40%) as follows (on GPU ``0``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4
|
||||
--dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4
|
||||
--dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
|
||||
See :ref:`vllm-engine-args` for other performance suggestions.
|
||||
|
||||
.. _mi300x-vllm-multiple-gpus:
|
||||
|
||||
Run vLLM on multiple GPUs
|
||||
-------------------------
|
||||
|
||||
The two main reasons to use multiple GPUs are:
|
||||
|
||||
* The model size is too big to run vLLM using one GPU as it results HIP Out of Memory.
|
||||
|
||||
* To achieve better latency when using a single GPU is not desirable.
|
||||
|
||||
To run one vLLM instance on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` option to
|
||||
specify multiple GPUs. Optionally, use the ``CUDA_VISIBLE_DEVICES`` environment variable to specify
|
||||
the GPUs.
|
||||
|
||||
For example, you can use two GPUs to start an API server on port 8000:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python -m vllm.entrypoints.api_server --model /path/to/model --dtype
|
||||
float16 -tp 2 --port 8000 &
|
||||
|
||||
To achieve both latency and throughput performance for serving, you can run multiple API servers on
|
||||
different GPUs by specifying different ports for each server and use ``CUDA_VISIBLE_DEVICES`` to
|
||||
specify the GPUs for each server, for example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model
|
||||
/path/to/model --dtype float16 -tp 2 --port 8000 &
|
||||
|
||||
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model
|
||||
/path/to/model --dtype float16 -tp 2 --port 8001 &
|
||||
|
||||
Choose an attention backend
|
||||
---------------------------
|
||||
|
||||
vLLM on ROCm supports two attention backends, each suitable for different use cases and performance
|
||||
requirements:
|
||||
|
||||
- **Triton Flash Attention** - For benchmarking, run vLLM scripts at
|
||||
least once as a warm-up step so Triton can perform auto-tuning before
|
||||
collecting benchmarking numbers. This is the default setting.
|
||||
|
||||
- **Composable Kernel (CK) Flash Attention** - To use CK Flash Attention, specify
|
||||
the environment variable as ``export VLLM_USE_TRITON_FLASH_ATTN=0``.
|
||||
|
||||
|
||||
Refer to :ref:`Model acceleration libraries <acceleration-flash-attention>`
|
||||
to learn more about Flash Attention with Triton or CK backends.
|
||||
|
||||
.. _vllm-engine-args:
|
||||
|
||||
vLLM engine arguments
|
||||
---------------------
|
||||
|
||||
The following are configuration suggestions to potentially improve performance with vLLM. See
|
||||
`vLLM's engine arguments documentation <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_
|
||||
for a full list of configurable engine arguments.
|
||||
|
||||
Configure the max-num-seqs parameter
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Increase the ``max-num-seqs`` parameter from the default ``256`` to ``512`` (``--max-num-seqs
|
||||
512``). This increases the maximum number of sequences per iteration and can improve throughput.
|
||||
|
||||
Use the float16 dtype
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The default data type (``dtype``) is specified in the model’s configuration file. For instance, some models use ``torch.bfloat16`` as their default ``dtype``.
|
||||
Use float16 (``--dtype float16``) for better performance.
|
||||
|
||||
Multi-step scheduling
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Setting ``num-scheduler-steps`` for multi-step scheduling can increase performance. Set it between 10 to 15 (``--num-scheduler-steps 10``).
|
||||
|
||||
Distributed executor backend
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The vLLM supports two modes of distributed executor backend: ``ray`` and ``mp``. When using the `<https://github.com/ROCm/vllm>`__ fork, using the ``mp``
|
||||
backend (``--distributed_executor_backend mp``) is recommended.
|
||||
|
||||
Graph mode max-seq-len-to-capture
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Maximum sequence length covered by CUDA graphs. In the default mode (where ``enforce_eager`` is ``False``), when a sequence has context length
|
||||
larger than this, vLLM engine falls back to eager mode. The default is 8192.
|
||||
|
||||
When working with models that support long context lengths, set the parameter ``--max-seq-len-to-capture`` to 16384.
|
||||
See this `vLLM blog <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`__ for details.
|
||||
|
||||
An example of long context length model is Qwen2-7b.
|
||||
|
||||
Whether to enable chunked prefill
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Another vLLM performance tip is to enable chunked prefill to improve
|
||||
throughput. Chunked prefill allows large prefills to be chunked into
|
||||
smaller chunks and batched together with decode requests.
|
||||
|
||||
You can enable the feature by specifying ``--enable-chunked-prefill`` in the
|
||||
command line or setting ``enable_chunked_prefill=True`` in the LLM
|
||||
constructor.
|
||||
|
||||
As stated in `vLLM's documentation, <https://docs.vllm.ai/en/latest/models/performance.html#chunked-prefill>`__,
|
||||
you can tune the performance by changing ``max_num_batched_tokens``. By
|
||||
default, it is set to 512 and optimized for ITL (inter-token latency).
|
||||
Smaller ``max_num_batched_tokens`` achieves better ITL because there are
|
||||
fewer prefills interrupting decodes.
|
||||
Higher ``max_num_batched_tokens`` achieves better TTFT (time to the first
|
||||
token) as you can put more prefill to the batch.
|
||||
|
||||
You might experience noticeable throughput improvements when
|
||||
benchmarking on a single GPU or 8 GPUs using the vLLM throughput
|
||||
benchmarking script along with the ShareGPT dataset as input.
|
||||
|
||||
In the case of fixed ``input-len``/``output-len``, for some configurations,
|
||||
enabling chunked prefill increases the throughput. For some other
|
||||
configurations, the throughput may be worse and elicit a need to tune
|
||||
parameter ``max_num_batched_tokens`` (for example, increasing ``max_num_batched_tokens`` value to 4096 or larger).
|
||||
|
||||
.. note::
|
||||
|
||||
Chunked prefill is no longer recommended. See the vLLM blog: `Serving LLMs on AMD MI300X: Best Practices <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`_ (October 2024).
|
||||
|
||||
Quantization support
|
||||
---------------------
|
||||
|
||||
Quantization reduces the precision of the model’s weights and activations, which significantly decreases the memory footprint.
|
||||
``fp8(w8a8)`` and ``AWQ`` quantization are supported for ROCm.
|
||||
|
||||
FP8 quantization
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
`<https://github.com/ROCm/vllm>`__ supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on the Instinct MI300X.
|
||||
Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
|
||||
|
||||
AMD publishes Quark Quantized OCP FP8 models on Hugging Face. For example:
|
||||
|
||||
* `Llama-3.1-8B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV>`__
|
||||
* `Llama-3.1-70B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`__
|
||||
* `Llama-3.1-405B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`__
|
||||
* `Mixtral-8x7B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`__
|
||||
* `Mixtral-8x22B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`__
|
||||
|
||||
To enable vLLM benchmarking to run on fp8 quantized models, use the ``--quantization`` parameter with value ``fp8`` (``--quantization fp8``).
|
||||
|
||||
AWQ quantization
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
You can quantize your own models by installing AutoAWQ or picking one of the 400+ models on Hugging Face. Be aware that
|
||||
that AWQ support in vLLM is currently underoptimized.
|
||||
|
||||
To enable vLLM to run on ``awq`` quantized models, using ``--quantization`` parameter with ``awq`` (``--quantization awq``).
|
||||
|
||||
You can find more specifics in the `vLLM AutoAWQ documentation <https://docs.vllm.ai/en/stable/quantization/auto_awq.html>`_.
|
||||
|
||||
fp8 kv-cached-dtype
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Using ``fp8 kv-cache dtype`` can improve performance as it reduces the size
|
||||
of ``kv-cache``. As a result, it reduces the cost required for reading and
|
||||
writing the ``kv-cache``.
|
||||
|
||||
To use this feature, specify ``--kv-cache-dtype`` as ``fp8``.
|
||||
|
||||
To specify the quantization scaling config, use the
|
||||
``--quantization-param-path`` parameter. If the parameter is not specified,
|
||||
the default scaling factor of ``1`` is used, which can lead to less accurate
|
||||
results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
|
||||
Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
|
||||
in the vLLM GitHub repository.
|
||||
|
||||
Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
|
||||
``llama2-7b``.
|
||||
|
||||
If building the vLLM using
|
||||
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm>`_
|
||||
for ``llama2-70b`` scale config, find the file at
|
||||
``/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json`` at
|
||||
runtime.
|
||||
|
||||
Below is a sample command to run benchmarking with this feature enabled
|
||||
for the ``llama2-70b`` model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 /vllm-workspace/benchmarks/benchmark_throughput.py --model \
|
||||
/path/to/llama2-70b-model --kv-cache-dtype "fp8" \
|
||||
--quantization-param-path \
|
||||
"/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json" \
|
||||
--input-len 512 --output-len 256 --num-prompts 500
|
||||
|
||||
vLLM is a high-throughput and memory efficient inference and serving engine for
|
||||
large language models that has gained traction in the AI community for its
|
||||
performance and ease of use. See :doc:`vllm-optimization`, where you'll learn
|
||||
how to:
|
||||
|
||||
* Enable AITER (AI Tensor Engine for ROCm) to speed up on LLM models.
|
||||
* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance.
|
||||
* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton).
|
||||
* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments.
|
||||
* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss.
|
||||
* Tune engine arguments (batch size, memory utilization, graph modes) for your use case.
|
||||
* Benchmark and scale across single-node and multi-node configurations.
|
||||
|
||||
.. _mi300x-tunableop:
|
||||
|
||||
@@ -946,33 +586,33 @@ for details.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
|
||||
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
|
||||
--a_type f16_r --b_type f8_r --compute_type f32_f16_r \
|
||||
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
|
||||
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
|
||||
|
||||
* Example 2: Benchmark forward epilogues and backward epilogues
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
|
||||
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU": --activation_type gelu --gradient";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
|
||||
|
||||
|
||||
hipBLASLt auto-tuning using hipblaslt-bench
|
||||
@@ -1031,26 +671,26 @@ The tuning tool is a two-step tool. It first runs the benchmark, then it creates
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
defaultBenchOptions = {"ProblemType": {
|
||||
"TransposeA": 0,
|
||||
"TransposeB": 0,
|
||||
"ComputeInputDataType": "s",
|
||||
"ComputeDataType": "s",
|
||||
"DataTypeC": "s",
|
||||
"DataTypeD": "s",
|
||||
"UseBias": False
|
||||
}, "TestConfig": {
|
||||
"ColdIter": 20,
|
||||
"Iter": 100,
|
||||
"AlgoMethod": "all",
|
||||
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
|
||||
"SolutionIndex": None, # Only works in AlgoMethod index
|
||||
"ApiMethod": "cpp",
|
||||
"RotatingBuffer": 0,
|
||||
}, "TuningParameters": {
|
||||
"SplitK": [0]
|
||||
}, "ProblemSizes": []}
|
||||
defaultCreateLogicOptions = {} # Currently unused
|
||||
defaultBenchOptions = {"ProblemType": {
|
||||
"TransposeA": 0,
|
||||
"TransposeB": 0,
|
||||
"ComputeInputDataType": "s",
|
||||
"ComputeDataType": "s",
|
||||
"DataTypeC": "s",
|
||||
"DataTypeD": "s",
|
||||
"UseBias": False
|
||||
}, "TestConfig": {
|
||||
"ColdIter": 20,
|
||||
"Iter": 100,
|
||||
"AlgoMethod": "all",
|
||||
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
|
||||
"SolutionIndex": None, # Only works in AlgoMethod index
|
||||
"ApiMethod": "cpp",
|
||||
"RotatingBuffer": 0,
|
||||
}, "TuningParameters": {
|
||||
"SplitK": [0]
|
||||
}, "ProblemSizes": []}
|
||||
defaultCreateLogicOptions = {} # Currently unused
|
||||
|
||||
* ``TestConfig``
|
||||
1. ``ColdIter``: This is number the warm-up iterations before starting the kernel benchmark.
|
||||
@@ -1230,7 +870,7 @@ command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
merge.py original_dir new_tuned_yaml_dir output_dir
|
||||
merge.py original_dir new_tuned_yaml_dir output_dir
|
||||
|
||||
The following table describes the logic YAML files.
|
||||
|
||||
@@ -1833,7 +1473,7 @@ de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``.
|
||||
|
||||
From the IR snippet, you can see ``i32`` data is loaded from global memory to
|
||||
registers (``%190``). With a few element-wise operations in registers, it is
|
||||
stored in shared memory (``%269``) for the transpose operation (``%270``), which
|
||||
stored in shared memory (``%269``) for the transpose operation (``%270``), which
|
||||
needs data movement across different threads. With the transpose done, it is
|
||||
loaded from LDS to register again (``%276``), and with a few more
|
||||
element-wise operations, it is stored to LDS again (``%298``). The last step
|
||||
@@ -1967,7 +1607,7 @@ something similar to the following:
|
||||
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
|
||||
|
||||
The kernel name and the code object file should be listed. In the
|
||||
example above, the kernel name is vector_add_assert_trap, but this might
|
||||
example above, the kernel name is vector_add_assert_trap, but this might
|
||||
also look like:
|
||||
|
||||
.. code-block:: text
|
||||
@@ -2081,3 +1721,8 @@ Hardware efficiency is maximized with 4 or fewer HIP streams. These environment
|
||||
configuration to two compute streams and two RCCL streams, aligning with this best practice.
|
||||
Additionally, RCCL is often pre-optimized for MI300 systems in production by querying the node
|
||||
topology during startup, reducing the need for extensive manual tuning.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
* :doc:`vllm-optimization`
|
||||
|
||||
@@ -92,7 +92,7 @@ GPUs, which can impact end-to-end latency.
|
||||
.. _healthcheck-install-transferbench:
|
||||
|
||||
1. To get started, use the instructions in the `TransferBench documentation
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`__
|
||||
or use the following commands:
|
||||
|
||||
.. code:: shell
|
||||
@@ -102,5 +102,5 @@ GPUs, which can impact end-to-end latency.
|
||||
CC=hipcc make
|
||||
|
||||
2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#transferbench>`__
|
||||
in the Instinct performance benchmarking documentation for instructions.
|
||||
|
||||
@@ -14,7 +14,7 @@ Training a model with Megatron-LM on ROCm
|
||||
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
|
||||
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
|
||||
|
||||
Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
|
||||
To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
|
||||
|
||||
@@ -18,7 +18,7 @@ model training. Performance acceleration is powered by `Primus Turbo
|
||||
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
|
||||
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
|
||||
|
||||
Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM
|
||||
training <megatron-lm>` workflow. To learn how to migrate workloads from
|
||||
@@ -183,7 +183,7 @@ Configuration
|
||||
=============
|
||||
|
||||
Primus defines a training configuration in YAML for each model in
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/rss/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ Primus now supports the PyTorch torchtitan backend.
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <pytorch-training>` workflow. See
|
||||
|
||||
@@ -14,7 +14,7 @@ Training a model with PyTorch on ROCm
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
|
||||
See :doc:`primus-pytorch` for details.
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ In DDP training, each process or worker owns a replica of the model and processe
|
||||
|
||||
See the following developer blogs for more in-depth explanations and examples.
|
||||
|
||||
* `Multi GPU training with DDP — PyTorch Tutorials <https://pytorch.org/tutorials/beginner/ddp_Series_multigpu.html>`_
|
||||
* `Multi GPU training with DDP — PyTorch Tutorials <https://docs.pytorch.org/tutorials/beginner/ddp_series_multigpu.html>`__
|
||||
|
||||
* `Building a decoder transformer model on AMD GPUs — ROCm Blogs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/decoder-transformer/README.html#distributed-training-on-multiple-gpus>`_
|
||||
|
||||
@@ -134,6 +134,8 @@ subtrees:
|
||||
title: Profile and debug
|
||||
- file: how-to/rocm-for-ai/inference-optimization/workload.rst
|
||||
title: Workload optimization
|
||||
- file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
|
||||
title: vLLM V1 performance optimization
|
||||
|
||||
- url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
|
||||
title: AI tutorials
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
rocm-docs-core==1.26.0
|
||||
rocm-docs-core==1.27.0
|
||||
sphinx-reredirects
|
||||
sphinx-sitemap
|
||||
sphinxcontrib.datatemplates==0.11.0
|
||||
|
||||
@@ -187,8 +187,8 @@ requests==2.32.5
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.26.0
|
||||
# via -r docs/sphinx/requirements.in
|
||||
rocm-docs-core==1.27.0
|
||||
# via -r requirements.in
|
||||
rpds-py==0.27.1
|
||||
# via
|
||||
# jsonschema
|
||||
@@ -230,13 +230,13 @@ sphinx-last-updated-by-git==0.3.8
|
||||
sphinx-notfound-page==1.1.0
|
||||
# via rocm-docs-core
|
||||
sphinx-reredirects==0.1.6
|
||||
# via -r docs/sphinx/requirements.in
|
||||
# via -r requirements.in
|
||||
sphinx-sitemap==2.9.0
|
||||
# via -r docs/sphinx/requirements.in
|
||||
# via -r requirements.in
|
||||
sphinxcontrib-applehelp==2.0.0
|
||||
# via sphinx
|
||||
sphinxcontrib-datatemplates==0.11.0
|
||||
# via -r docs/sphinx/requirements.in
|
||||
# via -r requirements.in
|
||||
sphinxcontrib-devhelp==2.0.0
|
||||
# via sphinx
|
||||
sphinxcontrib-htmlhelp==2.1.0
|
||||
|
||||
Reference in New Issue
Block a user