mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-10 15:18:11 -05:00
Compare commits
4 Commits
update_jax
...
cvs-docs
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
21b1cfd967 | ||
|
|
58d932f0fd | ||
|
|
d9c2cd6047 | ||
|
|
b0960ee73e |
@@ -128,9 +128,6 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
cmakeVersion: '3.28.6'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
@@ -155,7 +152,6 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DGPU_TARGETS=${{ job.target }}
|
||||
-DAMDGPU_TARGETS=${{ job.target }}
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
|
||||
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
|
||||
@@ -196,9 +192,6 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
cmakeVersion: '3.28.6'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
@@ -224,7 +217,6 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DGPU_TARGETS=${{ job.target }}
|
||||
-DAMDGPU_TARGETS=${{ job.target }}
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
|
||||
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
|
||||
|
||||
@@ -1,29 +1,10 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: amdsmi
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -50,7 +31,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.os }}
|
||||
- job: amdsmi_build_${{ job.os }}
|
||||
pool:
|
||||
${{ if eq(job.os, 'ubuntu2404') }}:
|
||||
vmImage: 'ubuntu-24.04'
|
||||
@@ -74,7 +55,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
@@ -85,54 +65,50 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
componentName: ${{ parameters.componentName }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
# parameters:
|
||||
# aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
parameters:
|
||||
runRocminfo: false
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)'
|
||||
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: amdsmi_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: amdsmi_build_${{ job.os }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
parameters:
|
||||
runRocminfo: false
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: amdsmi
|
||||
testDir: '$(Agent.BuildDirectory)'
|
||||
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -1,29 +1,10 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: hipTensor
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -70,7 +51,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
- job: hipTensor_build_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -85,15 +66,12 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
extraBuildFlags: >-
|
||||
@@ -107,12 +85,9 @@ jobs:
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
@@ -120,47 +95,44 @@ jobs:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.target }}
|
||||
timeoutInMinutes: 90
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
|
||||
testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: hipTensor_test_${{ job.target }}
|
||||
timeoutInMinutes: 90
|
||||
dependsOn: hipTensor_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: hipTensor
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
|
||||
testParameters: '-E ".*-extended" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -1,35 +1,10 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: rccl
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
- name: systemsRepo
|
||||
type: string
|
||||
default: systems_repo
|
||||
- name: systemsSparseCheckoutDir
|
||||
type: string
|
||||
default: 'projects/rocprofiler-sdk'
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -82,28 +57,19 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
buildJobs:
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
testJobs:
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
- name: downstreamComponentMatrix
|
||||
type: object
|
||||
default:
|
||||
- rocprofiler-sdk:
|
||||
name: rocprofiler-sdk
|
||||
sparseCheckoutDir: ''
|
||||
skipUnifiedBuild: 'false'
|
||||
buildDependsOn:
|
||||
- rccl_build
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.os }}_${{ job.target }}
|
||||
- job: rccl_build_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
variables:
|
||||
- group: common
|
||||
@@ -111,23 +77,17 @@ jobs:
|
||||
- name: HIP_ROCCLR_HOME
|
||||
value: $(Build.BinariesDirectory)/rocm
|
||||
pool: ${{ variables.MEDIUM_BUILD_POOL }}
|
||||
${{ if eq(job.os, 'almalinux8') }}:
|
||||
container:
|
||||
image: rocmexternalcicd.azurecr.io/manylinux228:latest
|
||||
endpoint: ContainerService3
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
submoduleBehaviour: recursive
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
@@ -137,14 +97,10 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
||||
@@ -156,87 +112,58 @@ jobs:
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- ${{ if eq(job.os, 'ubuntu2204') }}:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
extraEnvVars:
|
||||
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
||||
installLatestCMake: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
extraEnvVars:
|
||||
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
||||
installLatestCMake: true
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './rccl-UnitTests'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ if parameters.triggerDownstreamJobs }}:
|
||||
- ${{ each component in parameters.downstreamComponentMatrix }}:
|
||||
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
|
||||
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.systemsRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
|
||||
triggerDownstreamJobs: true
|
||||
unifiedBuild: ${{ parameters.unifiedBuild }}
|
||||
${{ if parameters.unifiedBuild }}:
|
||||
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
|
||||
${{ else }}:
|
||||
buildDependsOn: ${{ component.buildDependsOn }}
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rccl_test_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
dependsOn: rccl_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rccl
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './rccl-UnitTests'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -1,29 +1,10 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: rocWMMA
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -85,11 +66,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.target }}
|
||||
- job: rocWMMA_build_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -104,7 +81,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
@@ -126,12 +102,9 @@ jobs:
|
||||
# gfx1030 not supported in documentation
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
@@ -139,45 +112,43 @@ jobs:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.target }}
|
||||
timeoutInMinutes: 350
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rocWMMA_test_${{ job.target }}
|
||||
timeoutInMinutes: 270
|
||||
dependsOn: rocWMMA_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rocWMMA
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rocm-cmake
|
||||
testParameters: '-E "pass-version-parent" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
|
||||
@@ -17,38 +17,21 @@ parameters:
|
||||
- libdw-dev
|
||||
- libglfw3-dev
|
||||
- libmsgpack-dev
|
||||
- libomp-dev
|
||||
- libopencv-dev
|
||||
- libtbb-dev
|
||||
- libtiff-dev
|
||||
- libva-amdgpu-dev
|
||||
- libva2-amdgpu
|
||||
- mesa-amdgpu-va-drivers
|
||||
- libavcodec-dev
|
||||
- libavformat-dev
|
||||
- libavutil-dev
|
||||
- ninja-build
|
||||
- python3-pip
|
||||
- protobuf-compiler
|
||||
- libprotoc-dev
|
||||
- name: pipModules
|
||||
type: object
|
||||
default:
|
||||
- future==1.0.0
|
||||
- pytz==2022.1
|
||||
- numpy==1.23
|
||||
- google==3.0.0
|
||||
- protobuf==3.12.4
|
||||
- onnx==1.12.0
|
||||
- nnef==1.0.7
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- AMDMIGraphX
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- half
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
- hipBLASLt
|
||||
@@ -62,9 +45,6 @@ parameters:
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rocm_smi_lib
|
||||
- rccl
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
- rocFFT
|
||||
@@ -83,11 +63,7 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- AMDMIGraphX
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- half
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
- hipBLASLt
|
||||
@@ -101,9 +77,6 @@ parameters:
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rocm_smi_lib
|
||||
- rccl
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
- rocFFT
|
||||
@@ -148,7 +121,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
@@ -248,6 +220,5 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -65,13 +65,6 @@ parameters:
|
||||
- pytest
|
||||
- pytest-cov
|
||||
- pytest-xdist
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- clr
|
||||
- llvm-project
|
||||
- ROCR-Runtime
|
||||
- rocprofiler-sdk
|
||||
- name: rocmTestDependencies
|
||||
type: object
|
||||
default:
|
||||
@@ -108,12 +101,10 @@ jobs:
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.target }}
|
||||
- ${{ build }}_${{ job.os }}_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
- name: ROCM_PATH
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
vmImage: ${{ variables.BASE_BUILD_POOL }}
|
||||
workspace:
|
||||
@@ -128,14 +119,6 @@ jobs:
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
extraBuildFlags: >-
|
||||
|
||||
@@ -79,27 +79,27 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
buildJobs:
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
testJobs:
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
|
||||
- job: rocprofiler_sdk_build_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.os}}_${{ job.target }}
|
||||
- ${{ build }}_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ variables.MEDIUM_BUILD_POOL }}
|
||||
${{ if eq(job.os, 'almalinux8') }}:
|
||||
container:
|
||||
image: rocmexternalcicd.azurecr.io/manylinux228:latest
|
||||
endpoint: ContainerService3
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
@@ -107,7 +107,6 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
@@ -119,7 +118,6 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
@@ -134,7 +132,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DROCPROFILER_BUILD_TESTS=ON
|
||||
@@ -146,7 +143,6 @@ jobs:
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
@@ -162,8 +158,8 @@ jobs:
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rocprofiler_sdk_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
|
||||
- job: rocprofiler_sdk_test_${{ job.target }}
|
||||
dependsOn: rocprofiler_sdk_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
@@ -181,7 +177,6 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
@@ -193,7 +188,6 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
@@ -208,7 +202,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DROCPROFILER_BUILD_TESTS=ON
|
||||
@@ -220,8 +213,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
testDir: $(Agent.BuildDirectory)/build
|
||||
testDir: $(Agent.BuildDirectory)/s/build
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
@@ -63,7 +63,6 @@ parameters:
|
||||
libopenblas-dev: openblas-devel
|
||||
libopenmpi-dev: openmpi-devel
|
||||
libpci-dev: libpciaccess-devel
|
||||
libsimde-dev: simde-devel
|
||||
libssl-dev: openssl-devel
|
||||
# note: libstdc++-devel is in the base packages list
|
||||
libsystemd-dev: systemd-devel
|
||||
|
||||
@@ -35,8 +35,8 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
amdsmi:
|
||||
pipelineId: 376
|
||||
developBranch: develop
|
||||
pipelineId: 99
|
||||
developBranch: amd-staging
|
||||
hasGpuTarget: false
|
||||
aomp-extras:
|
||||
pipelineId: 111
|
||||
@@ -115,7 +115,7 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
hipTensor:
|
||||
pipelineId: 374
|
||||
pipelineId: 105
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
llvm-project:
|
||||
@@ -263,7 +263,7 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
rocWMMA:
|
||||
pipelineId: 370
|
||||
pipelineId: 109
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
rpp:
|
||||
|
||||
@@ -13,7 +13,7 @@ parameters:
|
||||
default: ctest
|
||||
- name: testParameters
|
||||
type: string
|
||||
default: --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml
|
||||
default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
|
||||
- name: extraTestParameters
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,7 +1,6 @@
|
||||
.venv
|
||||
.vscode
|
||||
build
|
||||
__pycache__
|
||||
|
||||
# documentation artifacts
|
||||
_build/
|
||||
|
||||
@@ -27,7 +27,6 @@ ASICs
|
||||
ASan
|
||||
ASAN
|
||||
ASm
|
||||
Async
|
||||
ATI
|
||||
atomicRMW
|
||||
AddressSanitizer
|
||||
@@ -35,8 +34,6 @@ AlexNet
|
||||
Andrej
|
||||
Arb
|
||||
Autocast
|
||||
autograd
|
||||
Backported
|
||||
BARs
|
||||
BatchNorm
|
||||
BLAS
|
||||
@@ -80,7 +77,6 @@ CX
|
||||
Cavium
|
||||
CentOS
|
||||
ChatGPT
|
||||
Cholesky
|
||||
CoRR
|
||||
Codespaces
|
||||
Commitizen
|
||||
@@ -90,11 +86,9 @@ Conda
|
||||
ConnectX
|
||||
CountOnes
|
||||
CuPy
|
||||
customizable
|
||||
da
|
||||
Dashboarding
|
||||
Dataloading
|
||||
dataflows
|
||||
DBRX
|
||||
DDR
|
||||
DF
|
||||
@@ -136,12 +130,10 @@ ELMo
|
||||
ENDPGM
|
||||
EPYC
|
||||
ESXi
|
||||
EP
|
||||
EoS
|
||||
etcd
|
||||
fas
|
||||
FBGEMM
|
||||
FiLM
|
||||
FIFOs
|
||||
FFT
|
||||
FFTs
|
||||
@@ -162,12 +154,10 @@ Fortran
|
||||
Fuyu
|
||||
GALB
|
||||
GAT
|
||||
GATNE
|
||||
GCC
|
||||
GCD
|
||||
GCDs
|
||||
GCN
|
||||
GCNN
|
||||
GDB
|
||||
GDDR
|
||||
GDR
|
||||
@@ -186,16 +176,13 @@ Glibc
|
||||
GLXT
|
||||
Gloo
|
||||
GMI
|
||||
GNN
|
||||
GNNs
|
||||
GPG
|
||||
GPR
|
||||
GPT
|
||||
GPU
|
||||
GPU's
|
||||
GPUDirect
|
||||
GPUs
|
||||
GraphBolt
|
||||
Graphbolt
|
||||
GraphSage
|
||||
GRBM
|
||||
GRE
|
||||
@@ -203,11 +190,9 @@ GenAI
|
||||
GenZ
|
||||
GitHub
|
||||
Gitpod
|
||||
hardcoded
|
||||
HBM
|
||||
HCA
|
||||
HGX
|
||||
HLO
|
||||
HIPCC
|
||||
hipDataType
|
||||
HIPExtension
|
||||
@@ -227,7 +212,6 @@ Haswell
|
||||
Higgs
|
||||
href
|
||||
Hyperparameters
|
||||
HybridEngine
|
||||
Huggingface
|
||||
IB
|
||||
ICD
|
||||
@@ -259,7 +243,6 @@ Intersphinx
|
||||
Intra
|
||||
Ioffe
|
||||
JAX's
|
||||
JAXLIB
|
||||
Jinja
|
||||
JSON
|
||||
Jupyter
|
||||
@@ -280,7 +263,6 @@ LLM
|
||||
LLMs
|
||||
LLVM
|
||||
LM
|
||||
logsumexp
|
||||
LRU
|
||||
LSAN
|
||||
LSan
|
||||
@@ -316,7 +298,6 @@ Makefiles
|
||||
Matplotlib
|
||||
Matrox
|
||||
MaxText
|
||||
MBT
|
||||
Megablocks
|
||||
Megatrends
|
||||
Megatron
|
||||
@@ -326,15 +307,12 @@ Meta's
|
||||
Miniconda
|
||||
MirroredStrategy
|
||||
Mixtral
|
||||
MLA
|
||||
MosaicML
|
||||
MoEs
|
||||
Mooncake
|
||||
Mpops
|
||||
Multicore
|
||||
multihost
|
||||
Multithreaded
|
||||
mx
|
||||
MXFP
|
||||
MyEnvironment
|
||||
MyST
|
||||
@@ -371,7 +349,6 @@ OFED
|
||||
OMM
|
||||
OMP
|
||||
OMPI
|
||||
OOM
|
||||
OMPT
|
||||
OMPX
|
||||
ONNX
|
||||
@@ -398,7 +375,6 @@ perf
|
||||
PEQT
|
||||
PIL
|
||||
PILImage
|
||||
PJRT
|
||||
POR
|
||||
PRNG
|
||||
PRs
|
||||
@@ -418,7 +394,6 @@ Profiler's
|
||||
PyPi
|
||||
Pytest
|
||||
PyTorch
|
||||
QPS
|
||||
Qcycles
|
||||
Qwen
|
||||
RAII
|
||||
@@ -694,7 +669,6 @@ denoised
|
||||
denoises
|
||||
denormalize
|
||||
dequantization
|
||||
dequantized
|
||||
dequantizes
|
||||
deserializers
|
||||
detections
|
||||
@@ -810,7 +784,6 @@ linalg
|
||||
linearized
|
||||
linter
|
||||
linux
|
||||
llm
|
||||
llvm
|
||||
lm
|
||||
localscratch
|
||||
@@ -861,7 +834,6 @@ passthrough
|
||||
pe
|
||||
perfcounter
|
||||
performant
|
||||
piecewise
|
||||
perl
|
||||
pragma
|
||||
pre
|
||||
@@ -1008,7 +980,6 @@ tokenizer
|
||||
tokenizes
|
||||
toolchain
|
||||
toolchains
|
||||
topk
|
||||
toolset
|
||||
toolsets
|
||||
torchtitan
|
||||
@@ -1024,7 +995,6 @@ uncacheable
|
||||
uncorrectable
|
||||
underoptimized
|
||||
unhandled
|
||||
unfused
|
||||
uninstallation
|
||||
unmapped
|
||||
unsqueeze
|
||||
@@ -1037,7 +1007,6 @@ USM
|
||||
UTCL
|
||||
UTIL
|
||||
utils
|
||||
UX
|
||||
vL
|
||||
variational
|
||||
vdi
|
||||
|
||||
977
CHANGELOG.md
977
CHANGELOG.md
@@ -4,977 +4,6 @@ This page is a historical overview of changes made to ROCm components. This
|
||||
consolidated changelog documents key modifications and improvements across
|
||||
different versions of the ROCm software stack and its components.
|
||||
|
||||
## ROCm 7.1.1
|
||||
|
||||
See the [ROCm 7.1.1 release notes](https://rocm.docs.amd.com/en/docs-7.1.1/about/release-notes.html#rocm-7-1-1-release-notes)
|
||||
for a complete overview of this release.
|
||||
|
||||
### **AMD SMI** (26.2.0)
|
||||
|
||||
#### Added
|
||||
|
||||
- Caching for repeated ASIC information calls.
|
||||
- The cache added to `amdsmi_get_gpu_asic_info` improves performance by avoiding redundant hardware queries.
|
||||
- The cache stores ASIC info for each GPU device with a configurable duration, defaulting to 10 seconds. Use the `AMDSMI_ASIC_INFO_CACHE_MS` environment variable for cache duration configuration for `amdsmi_get_gpu_asic_info` API calls.
|
||||
|
||||
- Support for GPU partition metrics.
|
||||
- Provides support for `xcp_metrics` v1.0 and extends support for v1.1 (dynamic metrics).
|
||||
- Added `amdsmi_get_gpu_partition_metrics_info`, which provides per XCP (partition) metrics.
|
||||
|
||||
- Support for displaying newer VRAM memory types in `amd-smi static --vram`.
|
||||
- The `amdsmi_get_gpu_vram_info()` API now supports detecting DDR5, LPDDR4, LPDDR5, and HBM3E memory types.
|
||||
|
||||
#### Changed
|
||||
|
||||
- Updated `amd-smi static --numa` socket affinity data structure. It now displays CPU affinity information in both hexadecimal bitmask format and expanded CPU core ranges, replacing the previous simplified socket enumeration approach.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
- Fixed incorrect topology weight calculations.
|
||||
- Out-of-bound writes caused corruption in the weights field.
|
||||
|
||||
- Fixed `amd-smi event` not respecting the Linux timeout command.
|
||||
|
||||
- Fixed an issue where `amdsmi_get_power_info` returned `AMDSMI_STATUS_API_FAILED`.
|
||||
- VMs were incorrectly reporting `AMDSMI_STATUS_API_FAILED` when unable to get the power cap within the `amdsmi_get_power_info`.
|
||||
- The API now returns `N/A` or `UINT_MAX` for values that can't be retrieved, instead of failing.
|
||||
|
||||
- Fixed output for `amd-smi xgmi -l --json`.
|
||||
|
||||
```{note}
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md#amd_smi_lib-for-rocm-711) for details, examples, and in-depth descriptions.
|
||||
```
|
||||
|
||||
### **Composable Kernel** (1.1.0)
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* Composable Kernel will adopt C++20 features in an upcoming ROCm release, updating the minimum compiler requirement to C++20. Ensure that your development environment meets this requirement to facilitate a seamless transition.
|
||||
|
||||
### **HIP** (7.1.1)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for the flag `hipHostRegisterIoMemory` in `hipHostRegister`, used to register I/O memory with HIP runtime so the GPU can access it.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Incorrect Compute Unit (CU) mask in logging. HIP runtime now correctly sets the field width for the output print operation. When logging is enabled via the environment variable `AMD_LOG_LEVEL`, the runtime logs the accurate CU mask.
|
||||
* A segmentation fault occurred when the dynamic queue management mechanism was enabled. HIP runtime now ensures GPU queues aren't `NULL` during marker submission, preventing crashes and improving robustness.
|
||||
* An error encountered on HIP tear-down after device reset in certain applications due to accessing stale memory objects. HIP runtime now properly releases memory associated with host calls, ensuring reliable device resets.
|
||||
* A race condition occurred in certain graph-related applications when pending asynchronous signal handlers referenced device memory that had already been released, leading to memory corruption. HIP runtime now uses a reference counting strategy to manage access to device objects in asynchronous event handlers, ensuring safe and reliable memory usage.
|
||||
|
||||
### **MIGraphX** (2.14.0)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed an error that resulted when running `make check` on systems running on a gfx1201 GPU.
|
||||
|
||||
### **RCCL** (2.27.7)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed a single-node data corruption issue in MSCCL on the AMD Instinct MI350X and MI355X GPUs for the LL protocol. This previously affected about two percent of the runs for single-node `AllReduce` with inputs smaller than 512 KiB.
|
||||
|
||||
### **rocBLAS** (5.1.1)
|
||||
|
||||
#### Changed
|
||||
* By default, rocBLAS will not use stream order allocation for its internal workspace. To enable this behavior, set the `ROCBLAS_STREAM_ORDER_ALLOC` environment variable.
|
||||
|
||||
### **ROCm Bandwidth Test** (2.6.0)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
- Test failure with error message `Cannot make canonical path`.
|
||||
- Healthcheck test failure with seg fault on gfx942.
|
||||
- Segmentation fault observed in `schmoo` and `one2all` when executed on `sgpu` setup.
|
||||
|
||||
#### Known issues
|
||||
|
||||
- `rocm-bandwidth-test` folder fails to be removed after driver uninstallation:
|
||||
* After running `amdgpu-uninstall`, the `rocm-bandwidth-test` folder and package are still present.
|
||||
* Workaround: Remove the package manually using:
|
||||
```
|
||||
sudo apt-get remove -y rocm-bandwidth-test
|
||||
```
|
||||
|
||||
### **ROCm Compute Profiler** (3.3.1)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for PC sampling of multi-kernel applications.
|
||||
* PC Sampling output instructions are displayed with the name of the kernel to which the individual instruction belongs.
|
||||
* Single kernel selection is supported so that the PC samples of the selected kernel can be displayed.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Roofline analysis now runs on GPU 0 by default instead of all GPUs.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Improved roofline benchmarking by updating the `flops_benchmark` calculation.
|
||||
|
||||
* Improved standalone roofline plots in profile mode (PDF output) and analyze mode (CLI and GUI visual plots):
|
||||
* Fixed the peak MFMA/VALU lines being cut off.
|
||||
* Cleaned up the overlapping roofline numeric values by moving them into the side legend.
|
||||
* Added AI points chart with respective values, cache level, and compute/memory bound status.
|
||||
* Added full kernel names to the symbol chart.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Resolved existing issues to improve stability.
|
||||
|
||||
### **ROCm Systems Profiler** (1.2.1)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
- Fixed an issue of OpenMP Tools (OMPT) events, GPU performance counters, VA-API, MPI, and host events failing to be collected in the `rocpd` output.
|
||||
|
||||
### **ROCm Validation Suite** (1.3.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for different test levels with `-r` option for AMD Instinct MI3XXx GPUs.
|
||||
* Set compute type for DGEMM operations on AMD Instinct MI350X and MI355X GPUs.
|
||||
|
||||
### **rocSHMEM** (3.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Allowed IPC, RO, and GDA backends to be selected at runtime.
|
||||
* GDA conduit for different NIC vendors:
|
||||
* Broadcom BNXT\_RE (Thor 2)
|
||||
* Mellanox MLX5 (IB and RoCE ConnectX-7)
|
||||
* New APIs:
|
||||
* `rocshmem_get_device_ctx`
|
||||
|
||||
#### Changed
|
||||
|
||||
* The following APIs have been deprecated:
|
||||
* `rocshmem_wg_init`
|
||||
* `rocshmem_wg_finalize`
|
||||
* `rocshmem_wg_init_thread`
|
||||
|
||||
* `rocshmem_ptr` can now return non-null pointer to a shared memory region when the IPC transport is available to reach that region. Previously, it would return a null pointer.
|
||||
* `ROCSHMEM_RO_DISABLE_IPC` is renamed to `ROCSHMEM_DISABLE_MIXED_IPC`.
|
||||
- This environment variable wasn't documented in earlier releases. It's now documented.
|
||||
|
||||
#### Removed
|
||||
|
||||
* rocSHMEM no longer requires rocPRIM and rocThrust as dependencies.
|
||||
* Removed MPI compile-time dependency.
|
||||
|
||||
#### Known issues
|
||||
|
||||
* Only a subset of rocSHMEM APIs are implemented for the GDA conduit.
|
||||
|
||||
### **rocWMMA** (2.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* More unit tests to increase the code coverage.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Increased compile timeout and improved visualization in `math-ci`.
|
||||
|
||||
#### Removed
|
||||
|
||||
* Absolute paths from the `RPATH` of sample and test binary files.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed issues caused by HIP changes:
|
||||
* Removed the `.data` member from `HIP_vector_type`.
|
||||
* Broadcast constructor now only writes to the first vector element.
|
||||
* Fixed a bug related to `int32_t` usage in `hipRTC_gemm` for gfx942, caused by breaking changes in HIP.
|
||||
* Replaced `#pragma unroll` with `static for` to fix a bug caused by the upgraded compiler which no longer supports using `#pragma unroll` with template parameter indices.
|
||||
* Corrected test predicates for `BLK` and `VW` cooperative kernels.
|
||||
* Modified `compute_utils.sh` in `build-infra` to ensure rocWMMA is built with gfx1151 target for ROCm 7.0 and beyond.
|
||||
|
||||
## ROCm 7.1.0
|
||||
|
||||
See the [ROCm 7.1.0 release notes](https://rocm.docs.amd.com/en/docs-7.1.0/about/release-notes.html#rocm-7-1-0-release-notes)
|
||||
for a complete overview of this release.
|
||||
|
||||
### **AMD SMI** (26.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* `GPU LINK PORT STATUS` table to `amd-smi xgmi` command. The `amd-smi xgmi -s` or `amd-smi xgmi --source-status` will now show the `GPU LINK PORT STATUS` table.
|
||||
|
||||
* `amdsmi_get_gpu_revision()` to Python API. This function retrieves the GPU revision ID. Available in `amdsmi_interface.py` as `amdsmi_get_gpu_revision()`.
|
||||
|
||||
* Gpuboard and baseboard temperatures to `amd-smi metric` command.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Struct `amdsmi_topology_nearest_t` member `processor_list`. Member size changed, processor_list[AMDSMI_MAX_DEVICES * AMDSMI_MAX_NUM_XCP].
|
||||
|
||||
* `amd-smi reset --profile` behavior so that it won't also reset the performance level.
|
||||
* The performance level can still be reset using `amd-smi reset --perf-determinism`.
|
||||
|
||||
* Setting power cap is now available in Linux Guest. You can now use `amd-smi set --power-cap` as usual in Linux Guest systems too.
|
||||
|
||||
* Changed `amd-smi static --vbios` to `amd-smi static --ifwi`.
|
||||
* VBIOS naming is replaced with IFWI (Integrated Firmware Image) for improved clarity and consistency.
|
||||
* AMD Instinct MI300 Series GPUs (and later) now use a new version format with enhanced build information.
|
||||
* Legacy command `amd-smi static --vbios` remains functional for backward compatibility, but displays updated IFWI heading.
|
||||
* The Python, C, and Rust API for `amdsmi_get_gpu_vbios_version()` will now have a new field called `boot_firmware`, which will return the legacy vbios version number that is also known as the Unified BootLoader (UBL) version.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Optimized the way `amd-smi process` validates, which processes are running on a GPU.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed a CPER record count mismatch issue when using the `amd-smi ras --cper --file-limit`. Updated the deletion calculation to use `files_to_delete = len(folder_files) - file_limit` for exact file count management.
|
||||
|
||||
* Fixed the event monitoring segfaults causing RDC to crash. Added the mutex locking around access to device event notification file pointer.
|
||||
|
||||
* Fixed an issue where using `amd-smi ras --folder <folder_name>` was forcing the created folder's name to be lowercase. This fix also makes all string input options case-insensitive.
|
||||
|
||||
* Fixed certain output in `amd-smi monitor` when GPUs are partitioned. It fixes the issue with amd-smi monitor such as: `amd-smi monitor -Vqt`, `amd-smi monitor -g 0 -Vqt -w 1`, and `amd-smi monitor -Vqt --file /tmp/test1`. These commands will now be able to display as normal in partitioned GPU scenarios.
|
||||
|
||||
```{note}
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md#amd_smi_lib-for-rocm-710) for details, examples, and in-depth descriptions.
|
||||
```
|
||||
|
||||
### **Composable Kernel** (1.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv/bwd).
|
||||
* Support for elementwise kernel.
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* Non-grouped convolutions are deprecated. Their functionality is supported by grouped convolution.
|
||||
|
||||
### **HIP** (7.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* New HIP APIs
|
||||
- `hipModuleGetFunctionCount` returns the number of functions within a module
|
||||
- `hipMemsetD2D8` sets 2D memory range with specified 8-bit values
|
||||
- `hipMemsetD2D8Async` asynchronously sets 2D memory range with specified 8-bit values
|
||||
- `hipMemsetD2D16` sets 2D memory range with specified 16-bit values
|
||||
- `hipMemsetD2D16Async` asynchronously sets 2D memory range with specified 16-bit values
|
||||
- `hipMemsetD2D32` sets 2D memory range with specified 32-bit values
|
||||
- `hipMemsetD2D32Async` asynchronously sets 2D memory range with specified 32-bit values
|
||||
- `hipStreamSetAttribute` sets attributes such as synchronization policy for a given stream
|
||||
- `hipStreamGetAttribute` returns attributes such as priority for a given stream
|
||||
- `hipModuleLoadFatBinary` loads fatbin binary to a module
|
||||
- `hipMemcpyBatchAsync` asynchronously performs a batch copy of 1D or 2D memory
|
||||
- `hipMemcpy3DBatchAsync` asynchronously performs a batch copy of 3D memory
|
||||
- `hipMemcpy3DPeer` copies memory between devices
|
||||
- `hipMemcpy3DPeerAsync` asynchronously copies memory between devices
|
||||
- `hipMemsetD2D32Async` asynchronously sets 2D memory range with specified 32-bit values
|
||||
- `hipMemPrefetchAsync_v2` prefetches memory to the specified location
|
||||
- `hipMemAdvise_v2` advises about the usage of a given memory range
|
||||
- `hipGetDriverEntryPoint ` gets function pointer of a HIP API.
|
||||
- `hipSetValidDevices` sets a default list of devices that can be used by HIP
|
||||
- `hipStreamGetId` queries the id of a stream
|
||||
* Support for nested tile partitioning within cooperative groups, matching CUDA functionality.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Improved HIP module loading latency.
|
||||
* Optimized kernel metadata retrieval during module post-load.
|
||||
* Optimized doorbell ring in HIP runtime for the following performance improvements:
|
||||
- Makes efficient packet batching for HIP graph launch
|
||||
- Dynamic packet copying based on a defined maximum threshold or power-of-2 staggered copy pattern
|
||||
- If timestamps are not collected for a signal for reuse, it creates a new signal. This can potentially increase the signal footprint if the handler doesn't run fast enough
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* A segmentation fault occurred in the application when capturing the same HIP graph from multiple streams with cross-stream dependencies. The HIP runtime has fixed an issue where a forked stream joined to a parent stream that was not originally created with the API `hipStreamBeginCapture`.
|
||||
* Different behavior of en-queuing command on a legacy stream during stream capture on AMD ROCM platform, compared with CUDA. HIP runtime now returns an error in this specific situation to match CUDA behavior.
|
||||
* Failure of memory access fault occurred in rocm-examples test suite. When Heterogeneous Memory Management (HMM) is not supported in the driver, `hipMallocManaged` will only allocate system memory in HIP runtime.
|
||||
|
||||
#### Known issues
|
||||
|
||||
* SPIR-V-enabled applications might encounter a segmentation fault. The problem doesn't exist when SPIR-V is disabled. The issue will be fixed in the next ROCm release.
|
||||
|
||||
### **hipBLAS** (3.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* `--clients-only` build option to only build clients against a prebuilt library.
|
||||
* gfx1150, gfx1151, gfx1200, and gfx1201 support enabled.
|
||||
* FORTRAN enabled for the Microsoft Windows build and tests.
|
||||
* Additional reference library fallback options added.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Improved the build time for clients by removing `clients_common.cpp` from the hipblas-test build.
|
||||
|
||||
### **hipBLASLt** (1.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Fused Clamp GEMM for ``HIPBLASLT_EPILOGUE_CLAMP_EXT`` and ``HIPBLASLT_EPILOGUE_CLAMP_BIAS_EXT``. This feature requires the minimum (``HIPBLASLT_MATMUL_DESC_EPILOGUE_ACT_ARG0_EXT``) and maximum (``HIPBLASLT_MATMUL_DESC_EPILOGUE_ACT_ARG1_EXT``) to be set.
|
||||
* Support for ReLU/Clamp activation functions with auxiliary output for the `FP16` and `BF16` data types for gfx942 to capture intermediate results. This feature is enabled for ``HIPBLASLT_EPILOGUE_RELU_AUX``, ``HIPBLASLT_EPILOGUE_RELU_AUX_BIAS``, ``HIPBLASLT_EPILOGUE_CLAMP_AUX_EXT``, and ``HIPBLASLT_EPILOGUE_CLAMP_AUX_BIAS_EXT``.
|
||||
* Support for `HIPBLAS_COMPUTE_32F_FAST_16BF` for FP32 data type for gfx950 only.
|
||||
* CPP extension APIs ``setMaxWorkspaceBytes`` and ``getMaxWorkspaceBytes``.
|
||||
* Feature to print logs (using ``HIPBLASLT_LOG_MASK=32``) for Grouped GEMM.
|
||||
* Support for swizzleA by using the hipblaslt-ext cpp API.
|
||||
* Support for hipBLASLt extop for gfx11XX and gfx12XX.
|
||||
|
||||
#### Changed
|
||||
|
||||
* ``hipblasLtMatmul()`` now returns an error when the workspace size is insufficient, rather than causing a segmentation fault.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* `TF32` kernel optimization for the AMD Instinct MI355X GPU to enhance training and inference efficiency.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed incorrect results when using ldd and ldc dimension parameters with some solutions.
|
||||
|
||||
### **hipCUB** (4.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Exposed Thread-level reduction API `hipcub::ThreadReduce`.
|
||||
* `::hipcub::extents`, with limited parity to C++23's `std::extents`. Only `static extents` is supported; `dynamic extents` is not. Helper structs have been created to perform computations on `::hipcub::extents` only when the backend is rocPRIM. For the CUDA backend, similar functionality exists.
|
||||
* `projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_mdspan.hpp` to support `::hipcub::extents`.
|
||||
* `::hipcub::ForEachInExtents` API.
|
||||
* `hipcub::DeviceTransform::Transform` and `hipcub::DeviceTransform::TransformStableArgumentAddresses`.
|
||||
* hipCUB and its dependency rocPRIM have been moved into the new `rocm-libraries` [monorepo repository](https://github.com/ROCm/rocm-libraries). This repository contains a number of ROCm libraries that are frequently used together.
|
||||
* The repository migration requires a few changes to the way that hipCUB fetches library dependencies.
|
||||
* CMake build option `ROCPRIM_FETCH_METHOD` may be set to one of the following:
|
||||
* `PACKAGE` - (default) searches for a preinstalled packaged version of the dependency. If it is not found, the build will fall back using option `DOWNLOAD`, below.
|
||||
* `DOWNLOAD` - downloads the dependency from the rocm-libraries repository. If git >= 2.25 is present, this option uses a sparse checkout that avoids downloading more than it needs to. If not, the whole monorepo is downloaded (this may take some time).
|
||||
* `MONOREPO` - this option is intended to be used if you are building hipCUB from within a copy of the rocm-libraries repository that you have cloned (and therefore already contains rocPRIM). When selected, the build will try find the dependency in the local repository tree. If it cannot be found, the build will attempt to use git to perform a sparse-checkout of rocPRIM. If that also fails, it will fall back to using the `DOWNLOAD` option described above.
|
||||
|
||||
* A new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from installed `hipCUB` provided by the system.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Changed include headers to avoid relative includes that have slipped in.
|
||||
* Changed `CUDA_STANDARD` for tests in `test/hipcub`, due to C++17 APIs such as `std::exclusive_scan` is used in some tests. Still use `CUDA_STANDARD 14` for `test/extra`.
|
||||
* Changed `CCCL_MINIMUM_VERSION` to `2.8.2` to align with CUB.
|
||||
* Changed `cmake_minimum_required` from `3.16` to `3.18`, in order to support `CUDA_STANDARD 17` as a valid value.
|
||||
* Add support for large num_items `DeviceScan`, `DevicePartition` and `Reduce::{ArgMin, ArgMax}`.
|
||||
* Added tests for large num_items.
|
||||
* The previous dependency-related build option `DEPENDENCIES_FORCE_DOWNLOAD` has been renamed `EXTERNAL_DEPS_FORCE_DOWNLOAD` to differentiate it from the new rocPRIM dependency option described above. Its behavior remains the same - it forces non-ROCm dependencies (Google Benchmark and Google Test) to be downloaded rather than searching for installed packages. This option defaults to `OFF`.
|
||||
|
||||
#### Removed
|
||||
|
||||
* Removed `TexRefInputIterator`, which was removed from CUB after CCCL's 2.6.0 release. This API should have already been removed, but somehow it remained and was not tested.
|
||||
* Deprecated `hipcub::ConstantInputIterator`, use `rocprim::constant_iterator` or `rocthrust::constant_iterator` instead.
|
||||
* Deprecated `hipcub::CountingInputIterator`, use `rocprim::counting_iterator` or `rocthrust::counting_iterator` instead.
|
||||
* Deprecated `hipcub::DiscardOutputIterator`, use `rocprim::discard_iterator` or `rocthrust::discard_iterator` instead.
|
||||
* Deprecated `hipcub::TransformInputIterator`, use `rocprim::transform_iterator` or `rocthrust::transform_iterator` instead.
|
||||
* Deprecated `hipcub::AliasTemporaries`, which is considered to be an internal API. Moved to the detail namespace.
|
||||
* Deprecated almost all functions in `projects/hipcub/hipcub/include/hipcub/backend/rocprim/util_ptx.hpp`.
|
||||
* Deprecated hipCUB macros: `HIPCUB_MAX`, `HIPCUB_MIN`, `HIPCUB_QUOTIENT_FLOOR`, `HIPCUB_QUOTIENT_CEILING`, `HIPCUB_ROUND_UP_NEAREST` and `HIPCUB_ROUND_DOWN_NEAREST`.
|
||||
|
||||
#### Known issues
|
||||
|
||||
* The `__half` template specializations of Simd operators are currently disabled due to possible build issues with PyTorch.
|
||||
|
||||
### **hipFFT** (1.0.21)
|
||||
|
||||
#### Added
|
||||
|
||||
* Improved test coverage of multi-stream plans, user-specified work areas, and default stride calculation.
|
||||
* Experimental introduction of hipFFTW library, interfacing rocFFT on AMD platforms using the same symbols as FFTW3 (with partial support).
|
||||
|
||||
### **hipfort** (0.7.1)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for building with CMake 4.0.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed a potential integer overflow issue in `hipMalloc` interfaces.
|
||||
|
||||
### **hipRAND** (3.1.0)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Updated error handling for several hipRAND unit tests to accommodate the new `hipGetLastError` behavior that was introduced in ROCm 7.0.0. As of ROCm 7.0.0, the internal error state is cleared on each call to `hipGetLastError` rather than on every HIP API call.
|
||||
|
||||
### **hipSOLVER** (3.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Extended test suites for `hipsolverDn` compatibility functions.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Changed code coverage to use `llvm-cov` instead of `gcov`.
|
||||
|
||||
### **hipSPARSE** (4.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Brain half float mixed precision for the following routines:
|
||||
* `hipsparseAxpby` where X and Y use bfloat16 and result and the compute type use float.
|
||||
* `hipsparseSpVV` where X and Y use bfloat16 and result and the compute type use float.
|
||||
* `hipsparseSpMV` where A and X use bfloat16 and Y and the compute type use float.
|
||||
* `hipsparseSpMM` where A and B use bfloat16 and C and the compute type use float.
|
||||
* `hipsparseSDDMM` where A and B use bfloat16 and C and the compute type use float.
|
||||
* `hipsparseSDDMM` where A and B and C use bfloat16 and the compute type use float.
|
||||
* Half float mixed precision to `hipsparseSDDMM` where A and B and C use float16 and the compute type use float.
|
||||
* Brain half float uniform precision to `hipsparseScatter` and `hipsparseGather` routines.
|
||||
* Documentation for installing and building hipSPARSE on Microsoft Windows.
|
||||
|
||||
### **hipSPARSELt** (0.2.5)
|
||||
|
||||
#### Changed
|
||||
|
||||
* Changed the behavior of the Relu activation.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Provided more kernels for the `FP16` and `BF16` data types.
|
||||
|
||||
### **MIGraphX** (2.14.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Python 3.13 support.
|
||||
* PyTorch wheels to the Dockerfile.
|
||||
* Python API for returning serialized bytes.
|
||||
* `fixed_pad` operator for padding dynamic shapes to the maximum static shape.
|
||||
* Matcher to upcast base `Softmax` operations.
|
||||
* Support for the `convolution_backwards` operator through rocMLIR.
|
||||
* `LSE` output to attention fusion.
|
||||
* Flags to `EnableControlFlowGuard` due to BinSkim errors.
|
||||
* New environment variable documentation and reorganized structure.
|
||||
* `stash_type` attribute for `LayerNorm` and expanded test coverage.
|
||||
* Operator builders (phase 2).
|
||||
* `MIGRAPHX_GPU_HIP_FLAGS` to allow extra HIP compile flags.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Updated C API to include `current()` caller information in error reporting.
|
||||
* Updated documentation dependencies:
|
||||
* **rocm-docs-core** bumped from 1.21.1 → 1.25.0 across releases.
|
||||
* **Doxygen** updated to 1.14.0.
|
||||
* **urllib3** updated from 2.2.2 → 2.5.0.
|
||||
* Updated `src/CMakeLists.txt` to support `msgpack` 6.x (`msgpack-cxx`).
|
||||
* Updated model zoo test generator to fix test issues and add summary logging.
|
||||
* Updated `rocMLIR` and `ONNXRuntime` mainline references across commits.
|
||||
* Updated module sorting algorithm for improved reliability.
|
||||
* Restricted FP8 quantization to `dot` and `convolution` operators.
|
||||
* Moved ONNX Runtime launcher script into MIGraphX and updated build scripts.
|
||||
* Simplified ONNX `Resize` operator parser for correctness and maintainability.
|
||||
* Updated `any_ptr` assertion to avoid failure on default HIP stream.
|
||||
* Print kernel and module information on compile failure.
|
||||
|
||||
#### Removed
|
||||
|
||||
* Removed Perl dependency from SLES builds.
|
||||
* Removed redundant includes and unused internal dependencies.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Reduced nested visits in reference operators to improve compile time.
|
||||
* Avoided dynamic memory allocation during kernel launches.
|
||||
* Removed redundant NOP instructions for GFX11/12 platforms.
|
||||
* Improved `Graphviz` output (node color and layout updates).
|
||||
* Optimized interdependency checking during compilation.
|
||||
* Skip hipBLASLt solutions that require a workspace size larger than 128 MB for efficient memory utilization.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Error in `MIGRAPHX_GPU_COMPILE_PARALLEL` documentation (#4337).
|
||||
* rocMLIR `rewrite_reduce` issue (#4218).
|
||||
* Bug with `invert_permutation` on GPU (#4194).
|
||||
* Compile error when `MIOPEN` is disabled (missing `std` includes) (#4281).
|
||||
* ONNX `Resize` parsing when input and output shapes are identical (#4133, #4161).
|
||||
* Issue with MHA in attention refactor (#4152).
|
||||
* Synchronization issue from upstream ONNX Runtime (#4189).
|
||||
* Spelling error in “Contiguous” (#4287).
|
||||
* Tidy complaint about duplicate header (#4245).
|
||||
* `reshape`, `transpose`, and `broadcast` rewrites between pointwise and reduce operators (#3978).
|
||||
* Extraneous include file in HIPRTC-based compilation (#4130).
|
||||
* CI Perl dependency issue for SLES builds (#4254).
|
||||
* Compiler warnings for ROCm 7.0 of ``error: unknown warning option '-Wnrvo'``(#4192).
|
||||
|
||||
### **MIOpen** (3.5.1)
|
||||
|
||||
#### Added
|
||||
|
||||
* Added a new trust verify find mode.
|
||||
* Ported Op4dTensorLite kernel from OpenCL to HIP.
|
||||
* Implemented a generic HIP kernel for backward layer normalization.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Kernel DBs moved from Git LFS to DVC (Data Version Control).
|
||||
|
||||
#### Optimized
|
||||
|
||||
* [Conv] Enabled Composable Kernel (CK) implicit gemms on gfx950.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* [BatchNorm] Fixed a bug for the NHWC layout when a variant was not applicable.
|
||||
* Fixed a bug that caused a zero-size LDS array to be defined on Navi.
|
||||
|
||||
### **MIVisionX** (3.4.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* VX_RPP - Update blur
|
||||
* HIP - HIP_CHECK for hipLaunchKernelGGL for gated launch
|
||||
|
||||
#### Changed
|
||||
|
||||
* AMD Custom V1.1.0 - OpenMP updates
|
||||
* HALF - Fix half.hpp path updates
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* AMD Custom - dependency linking errors resolved
|
||||
* VX_RPP - Fix memory leak
|
||||
* Packaging - Remove Meta Package dependency for HIP
|
||||
|
||||
#### Known issues
|
||||
|
||||
* Installation on RedHat/SLES requires the manual installation of the `FFMPEG` & `OpenCV` dev packages.
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* VX_AMD_MEDIA - rocDecode support for hardware decode
|
||||
|
||||
### **RCCL** (2.27.7)
|
||||
|
||||
#### Added
|
||||
|
||||
* `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
|
||||
* `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
|
||||
|
||||
#### Changed
|
||||
|
||||
* The MSCCL++ feature is now disabled by default. The `--disable-mscclpp` build flag is replaced with `--enable-mscclpp` in the `rccl/install.sh` script.
|
||||
* Compatibility with NCCL 2.27.7.
|
||||
|
||||
#### Optimized
|
||||
* Enabled and optimized batched P2P operations to improve small message performance for `AllToAll` and `AllGather`.
|
||||
* Optimized channel count selection to improve efficiency for small-to-medium message sizes in `ReduceScatter`.
|
||||
* Changed code inlining to improve latency for small message sizes for `AllReduce`, `AllGather`, and `ReduceScatter`.
|
||||
|
||||
#### Known issues
|
||||
|
||||
* Symmetric memory kernels are currently disabled due to ongoing CUMEM enablement work.
|
||||
* When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`.
|
||||
|
||||
### **rocAL** (2.4.0)
|
||||
|
||||
#### Added
|
||||
* JAX iterator support in rocAL
|
||||
* rocJPEG - Fused Crop decoding support
|
||||
|
||||
#### Changed
|
||||
* CropResize - updates and fixes
|
||||
* Packaging - Remove Meta Package dependency for HIP
|
||||
|
||||
#### Resolved issues
|
||||
* OpenMP - dependency linking errors resolved.
|
||||
* Bugfix - memory leaks in rocAL.
|
||||
|
||||
#### Known issues
|
||||
* Package installation on SLES requires manually installing `TurboJPEG`.
|
||||
* Package installation on RedHat and SLES requires manually installing the `FFMPEG Dev` package.
|
||||
|
||||
### **rocALUTION** (4.0.1)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for gfx950.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Updated the default build standard to C++17 when compiling rocALUTION from source (previously C++14).
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Improved and expanded user documentation.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed a bug in the GPU hashing algorithm that occurred when not compiling with -O2/-O3.
|
||||
* Fixed an issue with the SPAI preconditioner when using complex numbers.
|
||||
|
||||
### **rocBLAS** (5.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Sample for clients using OpenMP threads calling rocBLAS functions.
|
||||
* gfx1150 and gfx1151 enabled.
|
||||
|
||||
#### Changed
|
||||
|
||||
* By default, the Tensile build is no longer based on `tensile_tag.txt` but uses the same commit from shared/tensile in the rocm-libraries repository. The rmake or install `-t` option can build from another local path with a different commit.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Improved the performance of Level 2 gemv transposed (`TransA != N`) for the problem sizes where `m` is small and `n` is large on gfx90a and gfx942.
|
||||
|
||||
### **ROCdbgapi** (0.77.4)
|
||||
|
||||
#### Added
|
||||
|
||||
* gfx1150 and gfx1151 enabled.
|
||||
|
||||
### **rocDecode** (1.4.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* AV1 12-bit decode support on VA-API version 1.23.0 and later.
|
||||
* rocdecode-host V1.0.0 library for software decode
|
||||
* FFmpeg version support for 5.1 and 6.1
|
||||
* Find package - rocdecode-host
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* rocdecode-host - failure to build debuginfo packages without FFmpeg resolved.
|
||||
* Fix a memory leak for rocDecodeNegativeTests
|
||||
|
||||
#### Changed
|
||||
|
||||
* HIP meta package changed - Use hip-dev/devel to bring required hip dev deps
|
||||
* rocdecode host - linking updates to rocdecode-host library
|
||||
|
||||
### **rocFFT** (1.0.35)
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Implemented single-kernel plans for some 2D problem sizes, on devices with at least 160KiB of LDS.
|
||||
* Improved performance of unit-strided, complex-interleaved, forward/inverse FFTs for lengths: (64,64,128), (64,64,52), (60,60,60)
|
||||
, (32,32,128), (32,32,64), (64,32,128)
|
||||
* Improved performance of 3D MPI pencil decompositions by using sub-communicators for global transpose operations.
|
||||
|
||||
### **rocJPEG** (1.2.0)
|
||||
|
||||
#### Changed
|
||||
* HIP meta package has been changed. Use `hip-dev/devel` to bring required hip dev deps.
|
||||
|
||||
#### Resolved issues
|
||||
* Fixed an issue where extra padding was incorrectly included when saving decoded JPEG images to files.
|
||||
* Resolved a memory leak in the jpegDecode application.
|
||||
|
||||
### **ROCm Compute Profiler** (3.3.0)
|
||||
|
||||
#### Added
|
||||
* Dynamic process attachment feature that allows coupling with a workload process, without controlling its start or end.
|
||||
* Use '--attach-pid' to specify the target process ID.
|
||||
* Use '--attach-duration-msec' to specify time duration.
|
||||
* `rocpd` choice for `--format-rocprof-output` option in profile mode.
|
||||
* `--retain-rocpd-output` option in profile mode to save large raw rocpd databases in workload directory.
|
||||
* Feature to show description of metrics during analysis.
|
||||
* Use `--include-cols Description` to show the Description column, which is excluded by default from the
|
||||
ROCm Compute Profiler CLI output.
|
||||
* `--set` filtering option in profile mode to enable single-pass counter collection for predefined subsets of metrics.
|
||||
* `--list-sets` filtering option in profile mode to list the sets available for single pass counter collection.
|
||||
* Missing counters based on register specification which enables missing metrics.
|
||||
* Enabled `SQC_DCACHE_INFLIGHT_LEVEL` counter and associated metrics.
|
||||
* Enabled `TCP_TCP_LATENCY` counter and associated counter for all GPUs except MI300.
|
||||
* Interactive metric descriptions in TUI analyze mode.
|
||||
* You can now left click on any metric cell to view detailed descriptions in the dedicated `METRIC DESCRIPTION` tab.
|
||||
* Support for analysis report output as a SQLite database using ``--output-format db`` analysis mode option.
|
||||
* `Compute Throughput` panel to TUI's `High Level Analysis` category with the following metrics: VALU FLOPs, VALU IOPs, MFMA FLOPs (F8), MFMA FLOPs (BF16), MFMA FLOPs (F16), MFMA FLOPs (F32), MFMA FLOPs (F64), MFMA FLOPs (F6F4) (in gfx950), MFMA IOPs (Int8), SALU Utilization, VALU Utilization, MFMA Utilization, VMEM Utilization, Branch Utilization, IPC
|
||||
|
||||
* `Memory Throughput` panel to TUI's `High Level Analysis` category with the following metrics: vL1D Cache BW, vL1D Cache Utilization, Theoretical LDS Bandwidth, LDS Utilization, L2 Cache BW, L2 Cache Utilization, L2-Fabric Read BW, L2-Fabric Write BW, sL1D Cache BW, L1I BW, Address Processing Unit Busy, Data-Return Busy, L1I-L2 Bandwidth, sL1D-L2 BW
|
||||
* Roofline support for Debian 12 and Azure Linux 3.0.
|
||||
* Notice for change in default output format to `rocpd` in a future release
|
||||
* This is displayed when `--format-rocprof-output rocpd` is not used in profile mode
|
||||
|
||||
#### Changed
|
||||
|
||||
* In the memory chart, long string of numbers are now displayed as scientific notation. It also solves the issue of overflow of displaying long number
|
||||
* When `--format-rocprof-output rocpd` is used, only `pmc_perf.csv` will be written to workload directory instead of multiple CSV files.
|
||||
* CLI analysis mode baseline comparison will now only compare common metrics across workloads and will not show the Metric ID.
|
||||
* Removed metrics from analysis configuration files which are explicitly marked as empty or None.
|
||||
* Changed the basic (default) view of TUI from aggregated analysis data to individual kernel analysis data.
|
||||
* Updated `Unit` of the following `Bandwidth` related metrics to `Gbps` instead of `Bytes per Normalization Unit`:
|
||||
* Theoretical Bandwidth (section 1202)
|
||||
* L1I-L2 Bandwidth (section 1303)
|
||||
* sL1D-L2 BW (section 1403)
|
||||
* Cache BW (section 1603)
|
||||
* L1-L2 BW (section 1603)
|
||||
* Read BW (section 1702)
|
||||
* Write and Atomic BW (section 1702)
|
||||
* Bandwidth (section 1703)
|
||||
* Atomic/Read/Write Bandwidth (section 1703)
|
||||
* Atomic/Read/Write Bandwidth - (HBM/PCIe/Infinity Fabric) (section 1706)
|
||||
* Updated the metric name for the following `Bandwidth` related metrics whose `Unit` is `Percent` by adding `Utilization`:
|
||||
* Theoretical Bandwidth Utilization (section 1201)
|
||||
* L1I-L2 Bandwidth Utilization (section 1301)
|
||||
* Bandwidth Utilization (section 1301)
|
||||
* Bandwidth Utilization (section 1401)
|
||||
* sL1D-L2 BW Utilization (section 1401)
|
||||
* Bandwidth Utilization (section 1601)
|
||||
* Updated `System Speed-of-Light` panel to `GPU Speed-of-Light` in TUI for the following metrics:
|
||||
* Theoretical LDS Bandwidth
|
||||
* vL1D Cache BW
|
||||
* L2 Cache BW
|
||||
* L2-Fabric Read BW
|
||||
* L2-Fabric Write BW
|
||||
* Kernel Time
|
||||
* Kernel Time (Cycles)
|
||||
* SIMD Utilization
|
||||
* Clock Rate
|
||||
* Analysis output:
|
||||
* Replaced `-o / --output` analyze mode option with `--output-format` and `--output-name`.
|
||||
* Use ``--output-format`` analysis mode option to select the output format of the analysis report.
|
||||
* Use ``--output-name`` analysis mode option to override the default file/folder name.
|
||||
* Replaced `--save-dfs` analyze mode option with `--output-format csv`.
|
||||
* Command-line options:
|
||||
* `--list-metrics` and `--config-dir` options moved to general command-line options.
|
||||
* `--list-metrics` option cannot be used without GPU architecture argument.
|
||||
* `--list-metrics` option do not show number of L2 channels.
|
||||
* `--list-available-metrics` profile mode option to display the metrics available for profiling in current GPU.
|
||||
* `--list-available-metrics` analyze mode option to display the metrics available for analysis.
|
||||
* `--block` option cannot be used with `--list-metrics` and `--list-available-metrics`options.
|
||||
* Default `rocprof` interface changed from `rocprofv3` to `rocprofiler-sdk`
|
||||
* Use ROCPROF=rocprofv3 to use rocprofv3 interface
|
||||
* Updated metric names for better alignment between analysis configuration and documentation.
|
||||
|
||||
#### Removed
|
||||
|
||||
* Usage of `rocm-smi` in favor of `amd-smi`.
|
||||
* Hardware IP block-based filtering has been removed in favor of analysis report block-based filtering.
|
||||
* Aggregated analysis view from TUI analyze mode.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Improved `--time-unit` option in analyze mode to apply time unit conversion across all analysis sections, not just kernel top stats.
|
||||
* Improved logic to obtain rocprof-supported counters, which prevents unnecessary warnings.
|
||||
* Improved post-analysis runtime performance by caching and multi-processing.
|
||||
* Improve analysis block based filtering to accept metric ID level filtering.
|
||||
* This can be used to collect individual metrics from various sections of the analysis config.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed an issue of not detecting the memory clock when using `amd-smi`.
|
||||
* Fixed standalone GUI crashing.
|
||||
* Fixed L2 read/write/atomic bandwidths on AMD Instinct MI350 Series GPUs.
|
||||
* Fixed an issue where accumulation counters could not be collected on AMD Instinct MI100.
|
||||
* Fixed an issue of kernel filtering not working in the roofline chart.
|
||||
|
||||
#### Known issues
|
||||
|
||||
* MI300A/X L2-Fabric 64B read counter may display negative values - The rocprof-compute metric 17.6.1 (Read 64B) can report negative values due to incorrect calculation when TCC_BUBBLE_sum + TCC_EA0_RDREQ_32B_sum exceeds TCC_EA0_RDREQ_sum.
|
||||
* A workaround has been implemented using max(0, calculated_value) to prevent negative display values while the root cause is under investigation.
|
||||
* The profile mode crashes when `--format-rocprof-output json` is selected.
|
||||
* As a workaround, this option should either not be provided or should be set to `csv` instead of `json`. This issue does not affect the profiling results since both `csv` and `json` output formats lead to the same profiling data.
|
||||
|
||||
### **ROCm Data Center Tool** (1.2.0)
|
||||
|
||||
#### Added
|
||||
|
||||
- CPU monitoring support with 30+ CPU field definitions through AMD SMI integration.
|
||||
- CPU partition format support (c0.0, c1.0) for monitoring AMD EPYC processors.
|
||||
- Mixed GPU/CPU monitoring in single `rdci dmon` command.
|
||||
|
||||
#### Optimized
|
||||
|
||||
- Improved profiler metrics path detection for counter definitions.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
- Group management issues with listing created/non-created groups.
|
||||
- ECC_UNCORRECT field behavior.
|
||||
|
||||
### **ROCm Debugger (ROCgdb)** (16.3)
|
||||
|
||||
#### Added
|
||||
|
||||
* gfx1150 and gfx1151 support enabled.
|
||||
|
||||
### **ROCm Systems Profiler** (1.2.0)
|
||||
|
||||
#### Added
|
||||
|
||||
- ``ROCPROFSYS_ROCM_GROUP_BY_QUEUE`` configuration setting to allow grouping of events by hardware queue, instead of the default grouping.
|
||||
- Support for `rocpd` database output with the `ROCPROFSYS_USE_ROCPD` configuration setting.
|
||||
- Support for profiling PyTorch workloads using the `rocpd` output database.
|
||||
- Support for tracing OpenMP API in Fortran applications.
|
||||
- An error warning is triggered if the profiler application fails because SELinux enforcement is enabled. The warning includes steps to disable SELinux enforcement.
|
||||
|
||||
#### Changed
|
||||
|
||||
- Updated the grouping of "kernel dispatch" and "memory copy" events in Perfetto traces. They are now grouped together by HIP Stream rather than separately and by hardware queue.
|
||||
- Updated PAPI module to v7.2.0b2.
|
||||
- ROCprofiler-SDK is now used for tracing OMPT API calls.
|
||||
|
||||
#### Known issues
|
||||
|
||||
* Profiling PyTorch and other AI workloads might fail because it is unable to find the libraries in the default linker path. As a workaround, you need to explicitly add the library path to ``LD_LIBRARY_PATH``. For example, when using PyTorch with Python 3.10, add the following to the environment:
|
||||
|
||||
```
|
||||
export LD_LIBRARY_PATH=:/opt/venv/lib/python3.10/site-packages/torch/lib:$LD_LIBRARY_PATH
|
||||
```
|
||||
|
||||
### **rocPRIM** (4.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* `get_sreg_lanemask_lt`, `get_sreg_lanemask_le`, `get_sreg_lanemask_gt` and `get_sreg_lanemask_ge`.
|
||||
* `rocprim::transform_output_iterator` and `rocprim::make_transform_output_iterator`.
|
||||
* Experimental support for SPIR-V, to use the correct tuned config for part of the appliable algorithms.
|
||||
* A new cmake option, `BUILD_OFFLOAD_COMPRESS`. When rocPRIM is build with this option enabled, the `--offload-compress` switch is passed to the compiler. This causes the compiler to compress the binary that it generates. Compression can be useful in cases where you are compiling for a large number of targets, since this often results in a large binary. Without compression, in some cases, the generated binary may become so large symbols are placed out of range, resulting in linking errors. The new `BUILD_OFFLOAD_COMPRESS` option is set to `ON` by default.
|
||||
* A new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from `ROCm` libraries provided by the system.
|
||||
* `rocprim::apply` which applies a function to a `rocprim::tuple`.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Changed tests to support `ptr-to-const` output in `/test/rocprim/test_device_batch_memcpy.cpp`.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Improved performance of many algorithms by updating their tuned configs.
|
||||
* 891 specializations have been improved.
|
||||
* 399 specializations have been added.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed `device_select`, `device_merge`, and `device_merge_sort` not allocating the correct amount of virtual shared memory on the host.
|
||||
* Fixed the `->` operator for the `transform_iterator`, the `texture_cache_iterator`, and the `arg_index_iterator`, by now returning a proxy pointer.
|
||||
* The `arg_index_iterator` also now only returns the internal iterator for the `->`.
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* Deprecated the `->` operator for the `zip_iterator`.
|
||||
|
||||
### **ROCProfiler** (2.0.0)
|
||||
|
||||
#### Removed
|
||||
|
||||
* `rocprofv2` doesn't support gfx12XX Series GPUs. For gfx12XX Series GPUs, use `rocprofv3` tool.
|
||||
|
||||
### **ROCprofiler-SDK** (1.0.0)
|
||||
|
||||
#### Added
|
||||
* Dynamic process attachment- ROCprofiler-SDK and `rocprofv3` now facilitate dynamic profiling of a running GPU application by attaching to its process ID (PID), rather than launching the application through the profiler itself.
|
||||
* Scratch-memory trace information to the Perfetto output in `rocprofv3`.
|
||||
* New capabilities to the thread trace support in `rocprofv3`:
|
||||
* Real-time clock support for thread trace alignment on gfx9XX architecture. This enables high-resolution clock computation and better synchronization across shader engines.
|
||||
* `MultiKernelDispatch` thread trace support is now available across all ASICs.
|
||||
* Documentation for dynamic process attachment.
|
||||
* Documentation for `rocpd` summaries.
|
||||
|
||||
#### Optimized
|
||||
* Improved the stability and robustness of the `rocpd` output.
|
||||
|
||||
### **rocPyDecode** (0.7.0)
|
||||
|
||||
#### Added
|
||||
* rocPyJpegPerfSample - samples for JPEG decode
|
||||
|
||||
#### Changed
|
||||
* Package - rocjpeg set as required dependency.
|
||||
* rocDecode host - rocdecode host linking updates
|
||||
|
||||
#### Resolved issues
|
||||
* rocJPEG Bindings - bug fixes
|
||||
* Test package - find dependencies updated
|
||||
|
||||
### **rocRAND** (4.1.0)
|
||||
|
||||
#### Changed
|
||||
|
||||
* Changed the `USE_DEVICE_DISPATCH` flag so it can turn device dispatch off by setting it to zero. Device dispatch should be turned off when building for SPIRV.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Updated error handling for several rocRAND unit tests to accommodate the new `hipGetLastError` behavior that was introduced in ROCm 7.0.
|
||||
As of ROCm 7.0, the internal error state is cleared on each call to `hipGetLastError` rather than on every HIP API call.
|
||||
|
||||
### **rocSOLVER** (3.31.0)
|
||||
|
||||
#### Optimized
|
||||
|
||||
Improved the performance of:
|
||||
|
||||
* LARF, LARFT, GEQR2, and downstream functions such as GEQRF.
|
||||
* STEDC and divide and conquer Eigensolvers.
|
||||
|
||||
### **rocSPARSE** (4.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Brain half float mixed precision for the following routines:
|
||||
* `rocsparse_axpby` where X and Y use bfloat16 and result and the compute type use float.
|
||||
* `rocsparse_spvv` where X and Y use bfloat16 and result and the compute type use float.
|
||||
* `rocsparse_spmv` where A and X use bfloat16 and Y and the compute type use float.
|
||||
* `rocsparse_spmm` where A and B use bfloat16 and C and the compute type use float.
|
||||
* `rocsparse_sddmm` where A and B use bfloat16 and C and the compute type use float.
|
||||
* `rocsparse_sddmm` where A and B and C use bfloat16 and the compute type use float.
|
||||
* Half float mixed precision to `rocsparse_sddmm` where A and B and C use float16 and the compute type use float.
|
||||
* Brain half float uniform precision to `rocsparse_scatter` and `rocsparse_gather` routines.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Improved the user documentation.
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* Deprecate trace, debug, and bench logging using the environment variable `ROCSPARSE_LAYER`.
|
||||
|
||||
### **rocThrust** (4.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* A new CMake option `-DSQLITE_USE_SYSTEM_PACKAGE` to allow SQLite to be provided by the system.
|
||||
* Introduced `libhipcxx` as a soft dependency. When `libhipcxx` can be included, rocThrust can use structs and methods defined in `libhipcxx`. This allows for a more complete behavior parity with CCCL and mirrors CCCL's thrust own dependency on `libcudacxx`.
|
||||
* Added a new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from `ROCm` libraries provided by the system.
|
||||
|
||||
#### Changed
|
||||
|
||||
* The previously hidden cmake build option `FORCE_DEPENDENCIES_DOWNLOAD` has been unhidden and renamed `EXTERNAL_DEPS_FORCE_DOWNLOAD` to differentiate it from the new rocPRIM and rocRAND dependency options described above. Its behavior remains the same - it forces non-ROCm dependencies (Google Benchmark, Google Test, and SQLite) to be downloaded instead of searching for existing installed packages. This option defaults to `OFF`.
|
||||
|
||||
#### Removed
|
||||
|
||||
* The previous dependency-related build options `DOWNLOAD_ROCPRIM` and `DOWNLOAD_ROCRAND` have been removed. Use `ROCPRIM_FETCH_METHOD=DOWNLOAD` and `ROCRAND_FETCH_METHOD=DOWNLOAD` instead.
|
||||
|
||||
#### Known issues
|
||||
|
||||
* `event` test is failing on CI and local runs on MI300, MI250 and MI210.
|
||||
|
||||
* rocThrust, as well as its dependencies rocPRIM and rocRAND have been moved into the new `rocm-libraries` monorepo repository (https://github.com/ROCm/rocm-libraries). This repository contains several ROCm libraries that are frequently used together.
|
||||
* The repository migration requires a few changes to the way that rocThrust's ROCm library dependencies are fetched.
|
||||
* There are new cmake options for obtaining rocPRIM and (optionally, if BUILD_BENCHMARKS is enabled) rocRAND.
|
||||
* cmake build options `ROCPRIM_FETCH_METHOD` and `ROCRAND_FETCH_METHOD` may be set to one of the following:
|
||||
* `PACKAGE` - (default) searches for a preinstalled packaged version of the dependency. If it's not found, the build will fall back using option `DOWNLOAD`, described below.
|
||||
* `DOWNLOAD` - downloads the dependency from the rocm-libraries repository. If git >= 2.25 is present, this option uses a sparse checkout that avoids downloading more than it needs to. If not, the whole monorepo is downloaded (this may take some time).
|
||||
* `MONOREPO` - this option is intended to be used if you are building rocThrust from within a copy of the rocm-libraries repository that you have cloned (and therefore already contains the dependencies rocPRIM and rocRAND). When selected, the build will try to find the dependency in the local repository tree. If it can't be found, the build will attempt to add it to the local tree using a sparse-checkout. If that also fails, it will fall back to using the `DOWNLOAD` option.
|
||||
|
||||
### **RPP** (2.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Solarize augmentation for HOST and HIP.
|
||||
* Hue and Saturation adjustment augmentations for HOST and HIP.
|
||||
* Find RPP - cmake module.
|
||||
* Posterize augmentation for HOST and HIP.
|
||||
|
||||
#### Changed
|
||||
|
||||
* HALF - Fix `half.hpp` path updates.
|
||||
* Box filter - padding updates.
|
||||
|
||||
#### Removed
|
||||
|
||||
* Packaging - Removed Meta Package dependency for HIP.
|
||||
* SLES 15 SP6 support.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Test Suite - Fixes for accuracy.
|
||||
* HIP Backend - Check return status warning fixes.
|
||||
* Bug fix - HIP vector types init.
|
||||
|
||||
## ROCm 7.0.2
|
||||
|
||||
See the [ROCm 7.0.2 release notes](https://rocm.docs.amd.com/en/docs-7.0.2/about/release-notes.html#rocm-7-0-2-release-notes)
|
||||
@@ -1236,6 +265,10 @@ for a complete overview of this release.
|
||||
|
||||
- `amd-smi monitor` on Linux Guest systems triggers an attribute error.
|
||||
|
||||
```{note}
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.0/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
||||
```
|
||||
|
||||
### **Composable Kernel** (1.1.0)
|
||||
|
||||
#### Added
|
||||
@@ -2552,7 +1585,7 @@ The previous default accumulator types could lead to situations in which unexpec
|
||||
|
||||
#### Added
|
||||
|
||||
* Hybrid computation support for existing STEQR routines.
|
||||
* Hybrid computation support for existing routines: STEQR
|
||||
|
||||
#### Optimized
|
||||
|
||||
|
||||
745
RELEASE.md
745
RELEASE.md
File diff suppressed because it is too large
Load Diff
26
default.xml
26
default.xml
@@ -1,17 +1,33 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<manifest>
|
||||
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
|
||||
<default revision="refs/tags/rocm-7.1.1"
|
||||
<default revision="refs/tags/rocm-7.0.2"
|
||||
remote="rocm-org"
|
||||
sync-c="true"
|
||||
sync-j="4" />
|
||||
<!--list of projects for ROCm-->
|
||||
<project name="ROCK-Kernel-Driver" />
|
||||
<project name="ROCR-Runtime" />
|
||||
<project name="amdsmi" />
|
||||
<project name="aqlprofile" />
|
||||
<project name="rdc" />
|
||||
<project name="rocm_bandwidth_test" />
|
||||
<project name="rocm_smi_lib" />
|
||||
<project name="rocm-core" />
|
||||
<project name="rocm-examples" />
|
||||
<project name="rocminfo" />
|
||||
<project name="rocprofiler" />
|
||||
<project name="rocprofiler-register" />
|
||||
<project name="rocprofiler-sdk" />
|
||||
<project name="rocprofiler-compute" />
|
||||
<project name="rocprofiler-systems" />
|
||||
<project name="roctracer" />
|
||||
<!--HIP Projects-->
|
||||
<project name="hip" />
|
||||
<project name="hip-tests" />
|
||||
<project name="HIPIFY" />
|
||||
<project name="clr" />
|
||||
<project name="hipother" />
|
||||
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
|
||||
<project name="half" />
|
||||
<project name="llvm-project" />
|
||||
@@ -25,7 +41,6 @@
|
||||
<project groups="mathlibs" name="MIVisionX" />
|
||||
<project groups="mathlibs" name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="composable_kernel" />
|
||||
<project groups="mathlibs" name="hipSOLVER" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipfort" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
@@ -39,14 +54,7 @@
|
||||
MIOpen rocBLAS rocFFT rocPRIM rocRAND
|
||||
rocSPARSE rocThrust Tensile -->
|
||||
<project groups="mathlibs" name="rocm-libraries" />
|
||||
<!-- The following components have been migrated to rocm-systems:
|
||||
aqlprofile clr hip hip-tests hipother
|
||||
rdc rocm-core rocm_smi_lib rocminfo rocprofiler-compute
|
||||
rocprofiler-register rocprofiler-sdk rocprofiler-systems
|
||||
rocprofiler rocr-runtime roctracer -->
|
||||
<project groups="mathlibs" name="rocm-systems" />
|
||||
<project groups="mathlibs" name="rocPyDecode" />
|
||||
<project groups="mathlibs" name="rocSOLVER" />
|
||||
<project groups="mathlibs" name="rocSHMEM" />
|
||||
<project groups="mathlibs" name="rocWMMA" />
|
||||
<project groups="mathlibs" name="rocm-cmake" />
|
||||
|
||||
@@ -25,69 +25,70 @@ additional licenses. Please review individual repositories for more information.
|
||||
<!-- spellcheck-disable -->
|
||||
| Component | License |
|
||||
|:---------------------|:-------------------------|
|
||||
| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/LICENSE.md) |
|
||||
| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/LICENSE.txt) |
|
||||
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
|
||||
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
|
||||
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
|
||||
| [AQLprofile](https://github.com/ROCm/rocm-systems/tree/develop/projects/aqlprofile/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/aqlprofile/LICENSE.md) |
|
||||
| [AQLprofile](https://github.com/rocm/aqlprofile/) | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
|
||||
| [Cluster Validation Suite](https://github.com/ROCm/cvs) | [MIT](https://github.com/ROCm/cvs/blob/main/LICENSE) |
|
||||
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
|
||||
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
|
||||
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
|
||||
| [HIP](https://github.com/ROCm/rocm-systems/tree/develop/projects/hip/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/hip/LICENSE.md) |
|
||||
| [hipamd](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/hipamd/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/hipamd/LICENSE.md) |
|
||||
| [hipBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblas/LICENSE.md) |
|
||||
| [hipBLASLt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/LICENSE.md) |
|
||||
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/amd-staging/LICENSE.txt) |
|
||||
| [hipamd](https://github.com/ROCm/clr/tree/amd-staging/hipamd) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/hipamd/LICENSE.txt) |
|
||||
| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
|
||||
| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
|
||||
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
|
||||
| [hipCUB](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub/) | [Custom](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipcub/LICENSE.txt) |
|
||||
| [hipFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/LICENSE.md) |
|
||||
| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
|
||||
| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
|
||||
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
|
||||
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
|
||||
| [hipRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiprand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiprand/LICENSE.md) |
|
||||
| [hipSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsolver/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsolver/LICENSE.md) |
|
||||
| [hipSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparse/LICENSE.md) |
|
||||
| [hipSPARSELt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparselt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparselt/LICENSE.md) |
|
||||
| [hipTensor](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiptensor/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiptensor/LICENSE) |
|
||||
| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
|
||||
| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
|
||||
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
|
||||
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
|
||||
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
|
||||
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
|
||||
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
|
||||
| [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
|
||||
| [MIOpen](https://github.com/ROCm/rocm-libraries/tree/develop/projects/miopen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
|
||||
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
|
||||
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
|
||||
| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
|
||||
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
|
||||
| [rocBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocblas/LICENSE.md) |
|
||||
| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
|
||||
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
|
||||
| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
|
||||
| [rocFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/LICENSE.md) |
|
||||
| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
|
||||
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v3.0](https://github.com/ROCm/ROCgdb/blob/amd-staging/COPYING3) |
|
||||
| [rocJPEG](https://github.com/ROCm/rocJPEG/) | [MIT](https://github.com/ROCm/rocJPEG/blob/develop/LICENSE) |
|
||||
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
|
||||
| [rocminfo](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocminfo/License.txt) |
|
||||
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
|
||||
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [MIT](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
|
||||
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
|
||||
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
|
||||
| [ROCm-Core](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-core/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-core/LICENSE.md) |
|
||||
| [ROCm Compute Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-compute/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute/LICENSE.md) |
|
||||
| [ROCm Data Center (RDC)](https://github.com/ROCm/rocm-systems/tree/develop/projects/rdc/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rdc/LICENSE.md) |
|
||||
| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
|
||||
| [ROCm Compute Profiler](https://github.com/ROCm/rocprofiler-compute) | [MIT](https://github.com/ROCm/rocprofiler-compute/blob/amd-staging/LICENSE) |
|
||||
| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE.md) |
|
||||
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
|
||||
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/opencl/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/opencl/LICENSE.md) |
|
||||
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/amd-staging/opencl) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/opencl/LICENSE.txt) |
|
||||
| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
|
||||
| [ROCm SMI Lib](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-smi-lib/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-smi-lib/LICENSE.md) |
|
||||
| [ROCm Systems Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-systems/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-systems/LICENSE.md) |
|
||||
| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/LICENSE.md) |
|
||||
| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE.md) |
|
||||
| [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
|
||||
| [rocPRIM](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocprim/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocprim/LICENSE.md) |
|
||||
| [ROCProfiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler/LICENSE.md) |
|
||||
| [ROCprofiler-SDK](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-sdk/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-sdk/LICENSE.md) |
|
||||
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
|
||||
| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE.md) |
|
||||
| [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
|
||||
| [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
|
||||
| [rocRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocrand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocrand/LICENSE.md) |
|
||||
| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
|
||||
| [ROCr Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
|
||||
| [ROCR-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocr-runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocr-runtime/LICENSE.txt) |
|
||||
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/amd-staging/LICENSE.txt) |
|
||||
| [rocSHMEM](https://github.com/ROCm/rocSHMEM/) | [MIT](https://github.com/ROCm/rocSHMEM/blob/develop/LICENSE.md) |
|
||||
| [rocSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsolver/) | [BSD-2-Clause](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsolver/LICENSE.md) |
|
||||
| [rocSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsparse/LICENSE.md) |
|
||||
| [rocThrust](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust/) | [Apache 2.0](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocthrust/LICENSE) |
|
||||
| [ROCTracer](https://github.com/ROCm/rocm-systems/tree/develop/projects/roctracer/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/roctracer/LICENSE.md) |
|
||||
| [rocWMMA](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocwmma/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocwmma/LICENSE.md) |
|
||||
| [Tensile](https://github.com/ROCm/rocm-libraries/tree/develop/shared/tensile/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/shared/tensile/LICENSE.md) |
|
||||
| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
|
||||
| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
|
||||
| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
|
||||
| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
|
||||
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
|
||||
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
|
||||
| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |
|
||||
|
||||
Open sourced ROCm components are released via public GitHub
|
||||
|
||||
@@ -1,137 +1,137 @@
|
||||
ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility-past-60]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
|
||||
,"Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
|
||||
,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
|
||||
,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
|
||||
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility-past-60]_,gfx950,gfx950,gfx950,gfx950,,,,,,,,,,,,,,,,,,
|
||||
,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,,,,,,,,,,,,,,,
|
||||
,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,,,,,,,,,,,,,,,
|
||||
,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,,,,,,,,,,,,,,,
|
||||
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942, gfx942, gfx942, gfx942, gfx942, gfx942, gfx942
|
||||
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8","2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
Thrust,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,2.1.0,2.1.0,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
ROCm Version,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 10.0 [#rhel-10-702-past-60]_, 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,SLES 15 SP7 [#sles-db-700-past-60]_,SLES 15 SP7 [#sles-db-700-past-60]_,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 10, 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
|
||||
,"Debian 13 [#db-mi300x-past-60]_, 12 [#sles-db-700-past-60]_",Debian 12 [#sles-db-700-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
|
||||
,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,,,,,,,,,,,,
|
||||
,Rocky Linux 9 [#rl-700-past-60]_,Rocky Linux 9 [#rl-700-past-60]_,,,,,,,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
|
||||
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os-past-60]_,gfx950 [#mi350x-os-past-60]_,,,,,,,,,,,,,,,,,,
|
||||
,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
|
||||
,gfx90a [#mi200x-os-past-60]_,gfx90a [#mi200x-os-past-60]_,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908 [#mi100-os-past-60]_,gfx908 [#mi100-os-past-60]_,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,b6356,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
Thrust,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocAL <rocal:index>`,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipfort <hipfort:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
|
||||
|
@@ -12,7 +12,7 @@ You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-
|
||||
|
||||
GPUs listed in the following table support compute workloads (no display
|
||||
information or graphics). If you’re using ROCm with AMD Radeon GPUs or Ryzen APUs for graphics
|
||||
workloads, see the :doc:`Use ROCm on Radeon and Ryzen <radeon:index>` to verify
|
||||
workloads, see the :docs:`Use ROCm on Radeon and Ryzen <radeon:index.html>` to verify
|
||||
compatibility and system requirements.
|
||||
|
||||
.. |br| raw:: html
|
||||
@@ -22,18 +22,18 @@ compatibility and system requirements.
|
||||
.. container:: format-big-table
|
||||
|
||||
.. csv-table::
|
||||
:header: "ROCm Version", "7.1.1", "7.1.0", "6.4.0"
|
||||
:header: "ROCm Version", "7.0.2", "7.0.1/7.0.0", "6.4.0"
|
||||
:stub-columns: 1
|
||||
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
|
||||
,"RHEL 10.1, 10.0, 9.7, |br| 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.5, 9.4"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP6
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8"
|
||||
,"Debian 13, 12","Debian 13, 12",Debian 12
|
||||
,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0
|
||||
,Rocky Linux 9,Rocky Linux 9,
|
||||
,"RHEL 10.0 [#rhel-10-702]_, 9.6 [#rhel-10-702]_, 9.4 [#rhel-94-702]_","RHEL 9.6 [#rhel-10-702]_, 9.4 [#rhel-94-702]_","RHEL 9.5, 9.4"
|
||||
,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_,RHEL 8.10
|
||||
,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP6
|
||||
,"Oracle Linux 10, 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-mi300x]_"
|
||||
,"Debian 13 [#db-mi300x]_, 12 [#sles-db-700]_",Debian 12 [#sles-db-700]_,Debian 12 [#single-node]_
|
||||
,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_
|
||||
,Rocky Linux 9 [#rl-700]_,Rocky Linux 9 [#rl-700]_,
|
||||
,.. _architecture-support-compatibility-matrix:,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
|
||||
,CDNA3,CDNA3,CDNA3
|
||||
@@ -43,22 +43,22 @@ compatibility and system requirements.
|
||||
,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix:,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility]_,gfx950,gfx950,
|
||||
,gfx1201,gfx1201,
|
||||
,gfx1200,gfx1200,
|
||||
,gfx1101,gfx1101,
|
||||
,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942
|
||||
,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os]_,gfx950 [#mi350x-os]_,
|
||||
,gfx1201 [#RDNA-OS-700]_,gfx1201 [#RDNA-OS-700]_,
|
||||
,gfx1200 [#RDNA-OS-700]_,gfx1200 [#RDNA-OS-700]_,
|
||||
,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,
|
||||
,gfx1100 [#RDNA-OS-700]_,gfx1100 [#RDNA-OS-700]_,gfx1100
|
||||
,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030
|
||||
,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942
|
||||
,gfx90a [#mi200x-os]_,gfx90a [#mi200x-os]_,gfx90a
|
||||
,gfx908 [#mi100-os]_,gfx908 [#mi100-os]_,gfx908
|
||||
,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8","2.8, 2.7, 2.6","2.6, 2.5, 2.4, 2.3"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.4.35
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.18.1, 2.17.1, 2.16.2"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.6.0,0.4.35
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,b5997
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,b6356,b5997
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0
|
||||
,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
|
||||
@@ -66,76 +66,77 @@ compatibility and system requirements.
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0
|
||||
,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
|
||||
Thrust,2.8.5,2.8.5,2.5.0
|
||||
CUB,2.8.5,2.8.5,2.5.0
|
||||
Thrust,2.6.0,2.6.0,2.5.0
|
||||
CUB,2.6.0,2.6.0,2.5.0
|
||||
,,,
|
||||
DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","30.20.0 [#mi325x_KVM]_, 30.10.2, |br| 30.10.1 [#driver_patch]_, 30.10, 6.4.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch]_, 30.10, |br| 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.12.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.4.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.2.0
|
||||
:doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.2.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,0.10.0
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,0.8.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.3.1
|
||||
:doc:`RPP <rpp:index>`,2.1.0,2.1.0,1.9.10
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.13.0,2.12.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.0,3.5.0,3.4.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.3.0,3.3.0,3.2.0
|
||||
:doc:`rocAL <rocal:index>`,2.3.0,2.3.0,2.2.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.0.0,1.0.0,0.10.0
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.1.0,1.1.0,0.8.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.6.0,0.3.1
|
||||
:doc:`RPP <rpp:index>`,2.0.0,2.0.0,1.9.10
|
||||
,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.22.3
|
||||
:doc:`RCCL <rccl:index>`,2.26.6,2.26.6,2.22.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.0
|
||||
,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,2.4.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,0.12.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.18
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.6.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,2.12.0
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,2.4.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,3.2.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.3
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,3.2.2
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,4.4.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.32
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,3.3.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.28.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.0.2,3.0.0,2.4.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.0.0,1.0.0,0.12.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.20,1.0.20,1.0.18
|
||||
:doc:`hipfort <hipfort:index>`,0.7.0,0.7.0,0.6.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.0.0,3.0.0,2.12.0
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.0.0,3.0.0,2.4.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.0.1,4.0.1,3.2.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.4,0.2.3
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.0,4.0.0,3.2.2
|
||||
:doc:`rocBLAS <rocblas:index>`,5.0.2,5.0.0,4.4.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.34,1.0.34,1.0.32
|
||||
:doc:`rocRAND <rocrand:index>`,4.0.0,4.0.0,3.3.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.30.1,3.30.0,3.28.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.0.2,4.0.2,3.4.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
|
||||
,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`hipCUB <hipcub:index>`,4.0.0,4.0.0,3.4.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,3.3.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.0.1,4.0.0,3.4.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,6.4.0
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
|
||||
,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,25.3.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,0.3.0
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.0.2,26.0.0,25.3.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,1.1.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0
|
||||
:doc:`Cluster Validation Suite <cvs:index>`,1.0.0,1.0.0,N/A
|
||||
,,,
|
||||
PERFORMANCE TOOLS,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.60400
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.2.3,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.1,1.1.0,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70002,2.0.70000,2.0.60400
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.60400
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70002,4.1.70000,4.1.60400
|
||||
,,,
|
||||
DEVELOPMENT TOOLS,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.2
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.3,0.77.2
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4
|
||||
@@ -143,27 +144,45 @@ compatibility and system requirements.
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix:,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.025444,20.0.025425,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.025444,20.0.025425,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025444,20.0.025425,19.0.0.25133
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25385,20.0.0.25314,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25385,20.0.0.25314,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25385,20.0.0.25314,19.0.0.25133
|
||||
,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.1.52802,7.1.25424,6.4.43482
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51831,7.0.51830,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.0.51831,7.0.51830,6.4.43482
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#os-compatibility] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`_, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`_, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`_.
|
||||
.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`_, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`_, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`_.
|
||||
.. [#dgl_compat] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#rhel-10-702] RHEL 10.0 and RHEL 9.6 are supported on all listed :ref:`supported_GPUs` except AMD Radeon PRO V620 GPU.
|
||||
.. [#rhel-94-702] RHEL 9.4 is supported on all AMD Instinct GPUs listed under :ref:`supported_GPUs`.
|
||||
.. [#rhel-700] RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#ol-700-mi300x] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#ol-mi300x] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#db-mi300x] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#sles-db-700] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#az-mi300x] Starting ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
|
||||
.. [#rl-700] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
|
||||
.. [#single-node] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
|
||||
.. [#mi350x-os] AMD Instinct MI355X (gfx950) and MI350X(gfx950) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, Oracle Linux 10, and Oracle Linux 9.
|
||||
.. [#RDNA-OS-700] **For ROCm 7.0.x** - AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 9060 (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, and RHEL 9.6.
|
||||
.. [#rd-v710] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and Azure Linux 3.0.
|
||||
.. [#rd-v620] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) GPUs are supported only on Ubuntu 24.04.3 and Ubuntu 22.04.5.
|
||||
.. [#mi325x-os] **For ROCm 7.0.x** - AMD Instinct MI325X GPUs (gfx942) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#mi300x-os] **For ROCm 7.0.x** - AMD Instinct MI300X GPUs (gfx942) are supported on all listed :ref:`supported_distributions`.
|
||||
.. [#mi300A-os] **For ROCm 7.0.x** - AMD Instinct MI300A GPUs (gfx942) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
|
||||
.. [#mi200x-os] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
|
||||
.. [#mi100-os] **For ROCm 7.0.x** - AMD Instinct MI100 GPUs (gfx908) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
|
||||
.. [#tf-mi350] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#dgl_compat] DGL is supported only on ROCm 6.4.0.
|
||||
.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
.. [#mi325x_KVM] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
|
||||
.. _OS-kernel-versions:
|
||||
|
||||
Operating systems, kernel and Glibc versions
|
||||
@@ -182,11 +201,9 @@ Use this lookup table to confirm which operating system and kernel versions are
|
||||
,,
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.1, 6.12.0-124, 2.39
|
||||
,10.0, 6.12.0-55, 2.39
|
||||
`Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.0, 6.12.0-55, 2.39
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.7, 5.14.0-611, 2.34
|
||||
,9.6, 5.14.0-570, 2.34
|
||||
`Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14.0-570, 2.34
|
||||
,9.5, 5.14+, 2.34
|
||||
,9.4, 5.14.0-427, 2.34
|
||||
,,
|
||||
@@ -238,18 +255,46 @@ Expand for full historical view of:
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see :ref:`supported_distributions` and select the required ROCm version for version specific support.
|
||||
.. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see :ref:`supported_GPUs` and select the required ROCm version for version specific support.
|
||||
.. [#rhel-10-702-past-60] RHEL 10.0 and RHEL 9.6 are supported on all listed :ref:`supported_GPUs` except AMD Radeon PRO V620 GPU.
|
||||
.. [#rhel-94-702-past-60] RHEL 9.4 is supported on all AMD Instinct GPUs listed under :ref:`supported_GPUs`.
|
||||
.. [#rhel-700-past-60] **For ROCm 7.0.x** - RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#ol-700-mi300x-past-60] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#mi300x-past-60] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#db-mi300x-past-60] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#sles-db-700-past-60] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
|
||||
.. [#az-mi300x-past-60] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
|
||||
.. [#az-mi300x-630-past-60] **Prior ROCm 6.4.0**- Azure Linux 3.0 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#rl-700-past-60] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
|
||||
.. [#mi350x-os-past-60] AMD Instinct MI355X (gfx950) and MI350X(gfx950) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and Oracle Linux 9.
|
||||
.. [#RDNA-OS-700-past-60] **For ROCm 7.0.x** AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 9060 (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, Oracle Linux 10, and Oracle Linux 9.
|
||||
.. [#RDNA-OS-past-60] **Prior ROCm 7.0.0** - Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#rd-v710-past-60] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and Azure Linux 3.0.
|
||||
.. [#rd-v620-past-60] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) is supported only on Ubuntu 24.04.3 and Ubuntu 22.04.5.
|
||||
.. [#mi325x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI325X GPU (gfx942) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#mi300x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300X GPU (gfx942) is supported on all listed :ref:`supported_distributions`.
|
||||
.. [#mi300A-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300A GPU (gfx942) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
|
||||
.. [#mi200x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
|
||||
.. [#mi100-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI100 GPU (gfx908) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
|
||||
.. [#7700XT-OS-past-60] **Prior to ROCm 7.0.0** - Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
|
||||
.. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_622-past-60] **For ROCm 6.2.2** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_621-past-60] **For ROCm 6.2.1** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_620-past-60] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4.
|
||||
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
|
||||
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
|
||||
.. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#dgl_compat-past-60] DGL is supported only on ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
|
||||
.. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
|
||||
.. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
|
||||
.. [#mi325x_KVM-past-60] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Deep Graph Library (DGL) compatibility
|
||||
:keywords: GPU, CPU, deep graph library, DGL, deep learning, framework compatibility
|
||||
:keywords: GPU, DGL compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,42 +10,24 @@
|
||||
DGL compatibility
|
||||
********************************************************************************
|
||||
|
||||
Deep Graph Library (`DGL <https://www.dgl.ai/>`__) is an easy-to-use, high-performance, and scalable
|
||||
Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance and scalable
|
||||
Python package for deep learning on graphs. DGL is framework agnostic, meaning
|
||||
that if a deep graph model is a component in an end-to-end application, the rest of
|
||||
if a deep graph model is a component in an end-to-end application, the rest of
|
||||
the logic is implemented using PyTorch.
|
||||
|
||||
DGL provides a high-performance graph object that can reside on either CPUs or GPUs.
|
||||
It bundles structural data features for better control and provides a variety of functions
|
||||
for computing with graph objects, including efficient and customizable message passing
|
||||
primitives for Graph Neural Networks.
|
||||
* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
|
||||
* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
to install and get started.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of DGL is maintained in the official `https://github.com/ROCm/dgl
|
||||
<https://github.com/ROCm/dgl>`__ repository, which differs from the
|
||||
`https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`__ upstream repository.
|
||||
|
||||
- To get started and install DGL on ROCm, use the prebuilt :ref:`Docker images <dgl-docker-compat>`,
|
||||
which include ROCm, DGL, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
DGL is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__,
|
||||
`ROCm 6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__, and `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipblaslt)
|
||||
- **Partially Supported**: TF32 with AMD Instinct MI250X
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI250X
|
||||
|
||||
.. _dgl-recommendations:
|
||||
|
||||
@@ -53,42 +35,23 @@ Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
DGL can be used for Graph Learning, and building popular graph models like
|
||||
GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:
|
||||
GAT, GCN and GraphSage. Using these we can support a variety of use-cases such as:
|
||||
|
||||
- Recommender systems
|
||||
- Network Optimization and Analysis
|
||||
- 1D (Temporal) and 2D (Image) Classification
|
||||
- Drug Discovery
|
||||
|
||||
For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.
|
||||
Multiple use cases of DGL have been tested and verified.
|
||||
However, a recommended example follows a drug discovery pipeline using the ``SE3Transformer``.
|
||||
Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_,
|
||||
where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs.
|
||||
|
||||
* Although multiple use cases of DGL have been tested and verified, a few have been
|
||||
outlined in the `DGL in the Real World: Running GNNs on Real Use Cases
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog
|
||||
post, which walks through four real-world graph neural network (GNN) workloads
|
||||
implemented with the Deep Graph Library on ROCm. It covers tasks ranging from
|
||||
heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph
|
||||
regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use
|
||||
case, the authors detail: the dataset and task, how DGL is used, and their experience
|
||||
porting to ROCm. It is shown that DGL codebases often run without modification, with
|
||||
seamless integration of graph operations, message passing, sampling, and convolution.
|
||||
Coverage includes:
|
||||
|
||||
* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__
|
||||
blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform,
|
||||
bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges
|
||||
the gap between dense tensor frameworks and the irregular nature of graph data through a
|
||||
graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and
|
||||
interoperability across frameworks like PyTorch and TensorFlow. AMD’s ROCm integration
|
||||
enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers
|
||||
and open-source repositories. This marks a major step in AMD's mission to advance open,
|
||||
scalable AI ecosystems beyond traditional architectures.
|
||||
- Single-GPU training/inference
|
||||
- Multi-GPU training
|
||||
|
||||
You can pre-process datasets and begin training on AMD GPUs through:
|
||||
|
||||
* Single-GPU training/inference
|
||||
* Multi-GPU training
|
||||
|
||||
.. _dgl-docker-compat:
|
||||
|
||||
@@ -99,17 +62,16 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest available DGL version from the official Docker Hub.
|
||||
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
|
||||
Click the |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
.. list-table:: DGL Docker image components
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
* - Docker
|
||||
- DGL
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
@@ -117,195 +79,130 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.8.0/images/sha256-943698ddf54c22a7bcad2e5b4ff467752e29e4ba6d0c926789ae7b242cbd92dd"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.8.0 <https://github.com/pytorch/pytorch/releases/tag/v2.8.0>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-b2ec286a035eb7d0a6aab069561914d21a3cac462281e9c024501ba5ccedfbf7"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu22.04_py3.10_pytorch_2.7.1/images/sha256-d27aee16df922ccf0bcd9107bfcb6d20d34235445d456c637e33ca6f19d11a51"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.7.1 <https://github.com/pytorch/pytorch/releases/tag/v2.7.1>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm6.4.3_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-f3ba6a3c9ec9f6c1cde28449dc9780e0c4c16c4140f4b23f158565fbfd422d6b"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.3.0 <https://github.com/pytorch/pytorch/releases/tag/v2.3.0>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
|
||||
|
||||
Key ROCm libraries for DGL
|
||||
================================================================================
|
||||
|
||||
DGL on ROCm depends on specific libraries that affect its features and performance.
|
||||
Using the DGL Docker container or building it with the provided Docker file or a ROCm base image is recommended.
|
||||
Using the DGL Docker container or building it with the provided docker file or a ROCm base image is recommended.
|
||||
If you prefer to build it yourself, ensure the following dependencies are installed:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- ROCm 7.0.0 Version
|
||||
- ROCm 6.4.x Version
|
||||
- Version
|
||||
- Purpose
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- 1.1.0
|
||||
- 1.1.0
|
||||
- :version-ref:`"Composable Kernel" rocm_version`
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- 3.0.0
|
||||
- 2.4.0
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- 1.0.0
|
||||
- 0.12.0
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- 4.0.0
|
||||
- 3.4.0
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- 1.0.20
|
||||
- 1.0.18
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- 3.0.0
|
||||
- 2.12.0
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- Provides fast random number generation for GPUs.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- 3.0.0
|
||||
- 2.4.0
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- 4.0.1
|
||||
- 3.2.0
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- 0.2.4
|
||||
- 0.2.3
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- 2.0.0
|
||||
- 1.5.0
|
||||
- :version-ref:`hipTensor rocm_version`
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- 3.5.0
|
||||
- 3.4.0
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- 2.13.0
|
||||
- 2.12.0
|
||||
- :version-ref:`MIGraphX rocm_version`
|
||||
- Adds graph-level optimizations, ONNX models and mixed precision support
|
||||
and enable Ahead-of-Time (AOT) Compilation.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- 3.3.0
|
||||
- 3.2.0
|
||||
- :version-ref:`MIVisionX rocm_version`
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
- 3.3.0
|
||||
- 2.2.0
|
||||
- :version-ref:`rocAL rocm_version`
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- 2.26.6
|
||||
- 2.22.3
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- 1.0.0
|
||||
- 0.10.0
|
||||
- :version-ref:`rocDecode rocm_version`
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- 1.1.0
|
||||
- 0.8.0
|
||||
- :version-ref:`rocJPEG rocm_version`
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- 2.0.0
|
||||
- 1.9.10
|
||||
- :version-ref:`RPP rocm_version`
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- 4.0.0
|
||||
- 3.3.0
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- 2.0.0
|
||||
- 1.7.0
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
@@ -314,14 +211,14 @@ If you prefer to build it yourself, ensure the following dependencies are instal
|
||||
Supported features
|
||||
================================================================================
|
||||
|
||||
Many functions and methods available upstream are also supported in DGL on ROCm.
|
||||
Many functions and methods available in DGL Upstream are also supported in DGL ROCm.
|
||||
Instead of listing them all, support is grouped into the following categories to provide a general overview.
|
||||
|
||||
* DGL Base
|
||||
* DGL Backend
|
||||
* DGL Data
|
||||
* DGL Dataloading
|
||||
* DGL Graph
|
||||
* DGL DGLGraph
|
||||
* DGL Function
|
||||
* DGL Ops
|
||||
* DGL Sampling
|
||||
@@ -333,29 +230,26 @@ Instead of listing them all, support is grouped into the following categories to
|
||||
* DGL NN
|
||||
* DGL Optim
|
||||
* DGL Sparse
|
||||
* GraphBolt
|
||||
|
||||
|
||||
Unsupported features
|
||||
================================================================================
|
||||
|
||||
* TF32 Support (only supported for PyTorch 2.7 and above)
|
||||
* Kineto/ROCTracer integration
|
||||
* Graphbolt
|
||||
* Partial TF32 Support (MI250x only)
|
||||
* Kineto/ ROCTracer integration
|
||||
|
||||
|
||||
Unsupported functions
|
||||
================================================================================
|
||||
|
||||
* ``bfs``
|
||||
* ``more_nnz``
|
||||
* ``format``
|
||||
* ``multiprocess_sparse_adam_state_dict``
|
||||
* ``record_stream_ndarray``
|
||||
* ``half_spmm``
|
||||
* ``segment_mm``
|
||||
* ``gather_mm_idx_b``
|
||||
* ``pgexplainer``
|
||||
* ``sample_labors_prob``
|
||||
* ``sample_labors_noprob``
|
||||
* ``sparse_admin``
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/dgl-history` to find documentation for previous releases
|
||||
of the ``ROCm/dgl`` Docker image.
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: FlashInfer compatibility
|
||||
:keywords: GPU, LLM, FlashInfer, deep learning, framework compatibility
|
||||
:description: FlashInfer deep learning framework compatibility
|
||||
:keywords: GPU, LLM, FlashInfer, compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -11,7 +11,7 @@ FlashInfer compatibility
|
||||
********************************************************************************
|
||||
|
||||
`FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator
|
||||
for Large Language Models (LLMs) that provides a high-performance implementation of graphics
|
||||
for Large Language Models (LLMs) that provides high-performance implementation of graphics
|
||||
processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well
|
||||
as advanced performance across diverse scenarios.
|
||||
|
||||
@@ -25,30 +25,28 @@ offers high-performance LLM-specific operators, with easy integration through Py
|
||||
For the latest feature compatibility matrix, refer to the ``README`` of the
|
||||
`https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
Support for the ROCm port of FlashInfer is available as follows:
|
||||
|
||||
- The ROCm-supported version of FlashInfer is maintained in the official `https://github.com/ROCm/flashinfer
|
||||
<https://github.com/ROCm/flashinfer>`__ repository, which differs from the
|
||||
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`__
|
||||
- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer
|
||||
<https://github.com/ROCm/flashinfer>`__ repository. This location differs from the
|
||||
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`_
|
||||
upstream repository.
|
||||
|
||||
- To get started and install FlashInfer on ROCm, use the prebuilt :ref:`Docker images <flashinfer-docker-compat>`,
|
||||
which include ROCm, FlashInfer, and all required dependencies.
|
||||
- To install FlashInfer, use the prebuilt :ref:`Docker image <flashinfer-docker-compat>`,
|
||||
which includes ROCm, FlashInfer, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
|
||||
for installation and setup instructions.
|
||||
to install and get started.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__
|
||||
for additional context.
|
||||
- See the `Installation guide <https://docs.flashinfer.ai/installation.html>`__
|
||||
in the upstream FlashInfer documentation.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
.. note::
|
||||
|
||||
FlashInfer is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
Flashinfer is supported on ROCm 6.4.1.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X
|
||||
|
||||
@@ -80,9 +78,10 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available FlashInfer version from the official Docker Hub.
|
||||
AMD validates and publishes `ROCm FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the FlashInfer version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: JAX compatibility
|
||||
:keywords: GPU, JAX, deep learning, framework compatibility
|
||||
:keywords: GPU, JAX compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,58 +10,42 @@
|
||||
JAX compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`JAX <https://docs.jax.dev/en/latest/notebooks/thinking_in_jax.html>`__ is a library
|
||||
for array-oriented numerical computation (similar to NumPy), with automatic differentiation
|
||||
and just-in-time (JIT) compilation to enable high-performance machine learning research.
|
||||
JAX provides a NumPy-like API, which combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale.
|
||||
|
||||
JAX provides an API that combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale. JAX uses composable transformations of Python and NumPy through
|
||||
JIT compilation, automatic vectorization, and parallelization.
|
||||
JAX uses composable transformations of Python and NumPy through just-in-time
|
||||
(JIT) compilation, automatic vectorization, and parallelization. To learn about
|
||||
JAX, including profiling and optimizations, see the official `JAX documentation
|
||||
<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
ROCm support for JAX is upstreamed, and users can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- The ROCm-supported version of JAX is maintained in the official `https://github.com/ROCm/rocm-jax
|
||||
<https://github.com/ROCm/rocm-jax>`__ repository, which differs from the
|
||||
`https://github.com/jax-ml/jax <https://github.com/jax-ml/jax>`__ upstream repository.
|
||||
- ROCm JAX release:
|
||||
|
||||
- To get started and install JAX on ROCm, use the prebuilt :ref:`Docker images <jax-docker-compat>`,
|
||||
which include ROCm, JAX, and all required dependencies.
|
||||
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
|
||||
with ROCm and JAX preinstalled.
|
||||
|
||||
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
for installation and setup instructions.
|
||||
- ROCm JAX repository: `ROCm/rocm-jax <https://github.com/ROCm/rocm-jax>`_
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`__
|
||||
for additional context.
|
||||
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
to get started.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
- Official JAX release:
|
||||
|
||||
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax/tags>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community/tags>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
- Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_
|
||||
|
||||
JAX Plugin-PJRT with JAX/JAXLIB compatibility
|
||||
================================================================================
|
||||
- See the `AMD GPU (Linux) installation section
|
||||
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
|
||||
the JAX documentation.
|
||||
|
||||
Portable JIT Runtime (PJRT) is an open, stable interface for device runtime and
|
||||
compiler. The following table details the ROCm version compatibility matrix
|
||||
between JAX Plugin–PJRT and JAX/JAXLIB.
|
||||
.. note::
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - JAX Plugin-PJRT
|
||||
- JAX/JAXLIB
|
||||
- ROCm
|
||||
* - 0.7.1
|
||||
- 0.7.1
|
||||
- 7.1.1, 7.1.0
|
||||
* - 0.6.0
|
||||
- 0.6.2, 0.6.0
|
||||
- 7.0.2, 7.0.1, 7.0.0
|
||||
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
@@ -87,7 +71,7 @@ Use cases and recommendations
|
||||
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
|
||||
outlines the process of fine-tuning a Bidirectional Encoder Representations
|
||||
from Transformers (BERT)-based large language model (LLM) using JAX for a text
|
||||
classification task. The blog post discusses techniques for parallelizing the
|
||||
classification task. The blog post discuss techniques for parallelizing the
|
||||
fine-tuning across multiple AMD GPUs and assess the model's performance on a
|
||||
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
|
||||
and the General Language Understanding Evaluation (GLUE) benchmark dataset was
|
||||
@@ -106,9 +90,9 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD validates and publishes `JAX images <https://hub.docker.com/r/rocm/jax/tags>`__
|
||||
with ROCm backends on Docker Hub.
|
||||
|
||||
AMD provides preconfigured Docker images with JAX and the ROCm backend.
|
||||
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/jax>`__ and are the
|
||||
recommended way to get started with deep learning with JAX on ROCm.
|
||||
For ``jax-community`` images, see `rocm/jax-community
|
||||
<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
|
||||
|
||||
@@ -250,7 +234,7 @@ The ROCm supported data types in JAX are collected in the following table.
|
||||
|
||||
.. note::
|
||||
|
||||
JAX data type support is affected by the :ref:`key_rocm_libraries` and it's
|
||||
JAX data type support is effected by the :ref:`key_rocm_libraries` and it's
|
||||
collected on :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
|
||||
page.
|
||||
|
||||
@@ -269,33 +253,6 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
|
||||
JAX API modules are maintained by the JAX project and is subject to change.
|
||||
Refer to the official Jax documentation for the most up-to-date information.
|
||||
|
||||
Key features and enhancements for ROCm 7.1
|
||||
===============================================================================
|
||||
|
||||
- Enabled compilation of multihost HLO runner Python bindings.
|
||||
|
||||
- Backported multihost HLO runner bindings and some related changes to
|
||||
:code:`FunctionalHloRunner`.
|
||||
|
||||
- Added :code:`requirements_lock_3_12` to enable building for Python 3.12.
|
||||
|
||||
- Removed hardcoded NHWC convolution layout for ``fp16`` precision to address the performance drops for ``fp16`` precision on gfx12xx GPUs.
|
||||
|
||||
|
||||
- ROCprofiler-SDK integration:
|
||||
|
||||
- Integrated ROCprofiler-SDK (v3) to XLA to improve profiling of GPU events,
|
||||
support both time-based and step-based profiling.
|
||||
|
||||
- Added unit tests for :code:`rocm_collector` and :code:`rocm_tracer`.
|
||||
|
||||
- Added Triton unsupported conversion from ``f8E4M3FNUZ`` to ``fp16`` with
|
||||
rounding mode.
|
||||
|
||||
- Introduced :code:`CudnnFusedConvDecomposer` to revert fused convolutions
|
||||
when :code:`ConvAlgorithmPicker` fails to find a fused algorithm, and removed
|
||||
unfused fallback paths from :code:`RocmFusedConvRunner`.
|
||||
|
||||
Key features and enhancements for ROCm 7.0
|
||||
===============================================================================
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: llama.cpp compatibility
|
||||
:keywords: GPU, GGML, llama.cpp, deep learning, framework compatibility
|
||||
:description: llama.cpp deep learning framework compatibility
|
||||
:keywords: GPU, GGML, llama.cpp compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -20,32 +20,33 @@ to accelerate inference and reduce memory usage. Originally built as a CPU-first
|
||||
llama.cpp is easy to integrate with other programming environments and is widely
|
||||
adopted across diverse platforms, including consumer devices.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
ROCm support for llama.cpp is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- The ROCm-supported version of llama.cpp is maintained in the official `https://github.com/ROCm/llama.cpp
|
||||
<https://github.com/ROCm/llama.cpp>`__ repository, which differs from the
|
||||
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`__ upstream repository.
|
||||
- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp
|
||||
<https://github.com/ROCm/llama.cpp>`_ repository.
|
||||
|
||||
- To get started and install llama.cpp on ROCm, use the prebuilt :ref:`Docker images <llama-cpp-docker-compat>`,
|
||||
which include ROCm, llama.cpp, and all required dependencies.
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
|
||||
|
||||
- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`,
|
||||
which includes ROCm, llama.cpp, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
||||
for installation and setup instructions.
|
||||
to install and get started.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__
|
||||
for additional context.
|
||||
- See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__
|
||||
in the upstream llama.cpp documentation.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
.. note::
|
||||
|
||||
llama.cpp is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and
|
||||
`ROCm 6.4.x <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
llama.cpp is supported on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI325X, MI300X, MI210
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
@@ -83,9 +84,9 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `llama.cpp images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
|
||||
AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest available llama.cpp versions from the official Docker Hub.
|
||||
inventories represent the available llama.cpp versions from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. important::
|
||||
@@ -109,27 +110,27 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_full/images/sha256-a94f0c7a598cc6504ff9e8371c016d7a2f93e69bf54a36c870f9522567201f10g"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_full/images/sha256-a2ecd635eaa65bb289a9041330128677f3ae88bee6fee0597424b17e38d4903c"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server/images/sha256-be175932c3c96e882dfbc7e20e0e834f58c89c2925f48b222837ee929dfc47ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_server/images/sha256-cb46b47df415addb5ceb6e6fdf0be70bf9d7f6863bbe6e10c2441ecb84246d52"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_light/images/sha256-d8ba0c70603da502c879b1f8010b439c8e7fa9f6cbdac8bbbbbba97cb41ebc9e"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_light/images/sha256-8f8536eec4b05c0ff1c022f9fc6c527ad1c89e6c1ca0906e4d39e4de73edbde9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- 24.04
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_full/images/sha256-37582168984f25dce636cc7288298e06d94472ea35f65346b3541e6422b678ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_full/images/sha256-f36de2a3b03ae53e81c85422cb3780368c9891e1ac7884b04403a921fe2ea45d"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_server/images/sha256-7e70578e6c3530c6591cc2c26da24a9ee68a20d318e12241de93c83224f83720"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_server/images/sha256-df15e8ab11a6837cd3736644fec1e047465d49e37d610ab0b79df000371327df"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_light/images/sha256-9a5231acf88b4a229677bc2c636ea3fe78a7a80f558bd80910b919855de93ad5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_light/images/sha256-4ea2d5bb7964f0ee3ea9b30ba7f343edd6ddfab1b1037669ca7eafad2e3c2bd7"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- 22.04
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Megablocks compatibility
|
||||
:keywords: GPU, megablocks, deep learning, framework compatibility
|
||||
:keywords: GPU, megablocks, compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,42 +10,28 @@
|
||||
Megablocks compatibility
|
||||
********************************************************************************
|
||||
|
||||
`Megablocks <https://github.com/databricks/megablocks>`__ is a lightweight library
|
||||
for mixture-of-experts `(MoE) <https://huggingface.co/blog/moe>`__ training.
|
||||
Megablocks is a light-weight library for mixture-of-experts (MoE) training.
|
||||
The core of the system is efficient "dropless-MoE" and standard MoE layers.
|
||||
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM
|
||||
<https://github.com/stanford-futuredata/Megatron-LM>`__,
|
||||
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_,
|
||||
where data and pipeline parallel training of MoEs is supported.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks <https://github.com/ROCm/megablocks>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <megablocks-docker-compat>` with ROCm, PyTorch, and Megablocks preinstalled.
|
||||
* See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` to install and get started.
|
||||
|
||||
- The ROCm-supported version of Megablocks is maintained in the official `https://github.com/ROCm/megablocks
|
||||
<https://github.com/ROCm/megablocks>`__ repository, which differs from the
|
||||
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
|
||||
.. note::
|
||||
|
||||
- To get started and install Megablocks on ROCm, use the prebuilt :ref:`Docker image <megablocks-docker-compat>`,
|
||||
which includes ROCm, Megablocks, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Megablocks is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
|
||||
Megablocks is supported on ROCm 6.3.0.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
|
||||
- **Officially Supported**: AMD Instinct MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
|
||||
|
||||
Supported models and features
|
||||
--------------------------------------------------------------------------------
|
||||
================================================================================
|
||||
|
||||
This section summarizes the Megablocks features supported by ROCm.
|
||||
|
||||
@@ -55,28 +41,20 @@ This section summarizes the Megablocks features supported by ROCm.
|
||||
* Mixture-of-Experts
|
||||
* dropless-Mixture-of-Experts
|
||||
|
||||
|
||||
.. _megablocks-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
|
||||
blog post guides how to leverage the ROCm platform for pre-training using the
|
||||
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
|
||||
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
|
||||
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
|
||||
training. The guide provides step-by-step instructions for setting up the environment,
|
||||
including cloning the repository, building the Docker image, and running the training container.
|
||||
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
|
||||
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
|
||||
training workflows for large-scale transformer models.
|
||||
|
||||
The `ROCm Megablocks blog posts <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_
|
||||
guide how to leverage the ROCm platform for pre-training using the Megablocks framework.
|
||||
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
|
||||
.. _megablocks-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
@@ -86,9 +64,10 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available Megablocks version from the official Docker Hub.
|
||||
AMD validates and publishes `ROCm Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Megatron-LM version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch, deep learning, framework compatibility
|
||||
:keywords: GPU, PyTorch compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -15,42 +15,40 @@ deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
|
||||
using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
|
||||
`RCCL <https://github.com/ROCm/rccl>`__ libraries.
|
||||
|
||||
PyTorch provides two high-level features:
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due
|
||||
to independent compatibility considerations, this results in two distinct
|
||||
release cycles for PyTorch on ROCm:
|
||||
|
||||
- Tensor computation (like NumPy) with strong GPU acceleration
|
||||
- ROCm PyTorch release:
|
||||
|
||||
- Deep neural networks built on a tape-based autograd system (rapid computation
|
||||
of multiple partial derivatives or gradients)
|
||||
- Provides the latest version of ROCm but might not necessarily support the
|
||||
latest stable PyTorch version.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
- Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
|
||||
preinstalled.
|
||||
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository.
|
||||
ROCm development is aligned with the stable release of PyTorch, while upstream
|
||||
PyTorch testing uses the stable release of ROCm to maintain consistency:
|
||||
- ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`__
|
||||
|
||||
- The ROCm-supported version of PyTorch is maintained in the official `https://github.com/ROCm/pytorch
|
||||
<https://github.com/ROCm/pytorch>`__ repository, which differs from the
|
||||
`https://github.com/pytorch/pytorch <https://github.com/pytorch/pytorch>`__ upstream repository.
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
to get started.
|
||||
|
||||
- To get started and install PyTorch on ROCm, use the prebuilt :ref:`Docker images <pytorch-docker-compat>`,
|
||||
which include ROCm, PyTorch, and all required dependencies.
|
||||
- Official PyTorch release:
|
||||
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
for installation and setup instructions.
|
||||
- Provides the latest stable version of PyTorch but might not necessarily
|
||||
support the latest ROCm version.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://pytorch.org/get-started/locally/>`__ or
|
||||
`Previous versions <https://pytorch.org/get-started/previous-versions/>`__ for additional context.
|
||||
- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
|
||||
|
||||
- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`__
|
||||
or `Previous versions <https://pytorch.org/get-started/previous-versions/>`__
|
||||
to get started.
|
||||
|
||||
PyTorch includes tooling that generates HIP source code from the CUDA backend.
|
||||
This approach allows PyTorch to support ROCm without requiring manual code
|
||||
modifications. For more information, see :doc:`HIPIFY <hipify:index>`.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
AMD releases official `ROCm PyTorch Docker images <https://hub.docker.com/r/rocm/pytorch/tags>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
ROCm development is aligned with the stable release of PyTorch, while upstream
|
||||
PyTorch testing uses the stable release of ROCm to maintain consistency.
|
||||
|
||||
.. _pytorch-recommendations:
|
||||
|
||||
@@ -80,7 +78,7 @@ Use cases and recommendations
|
||||
GPU.
|
||||
|
||||
* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
|
||||
describes how PyTorch integrates with ROCm for AI workloads. It outlines the
|
||||
describes how PyTorch integrates with ROCm for AI workloads It outlines the
|
||||
use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
|
||||
GPU hardware for training and inference tasks in AI applications.
|
||||
|
||||
@@ -91,8 +89,9 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch/tags>`__
|
||||
with ROCm backends on Docker Hub.
|
||||
AMD provides preconfigured Docker images with PyTorch and the ROCm backend.
|
||||
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/pytorch>`__ and are the
|
||||
recommended way to get started with deep learning with PyTorch on ROCm.
|
||||
|
||||
To find the right image tag, see the :ref:`PyTorch on ROCm installation
|
||||
documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
|
||||
@@ -361,6 +360,15 @@ with ROCm.
|
||||
popular datasets, model architectures, and common image transformations
|
||||
for computer vision applications.
|
||||
|
||||
* - `torchtext <https://docs.pytorch.org/text/stable/index.html>`_
|
||||
- Text processing library for PyTorch. Provides data processing utilities
|
||||
and popular datasets for natural language processing, including
|
||||
tokenization, vocabulary management, and text embeddings.
|
||||
|
||||
**Note:** ``torchtext`` does not implement ROCm-specific kernels.
|
||||
ROCm acceleration is provided through the underlying PyTorch framework
|
||||
and ROCm library integration. Only official release exists.
|
||||
|
||||
* - `torchdata <https://meta-pytorch.org/data/beta/index.html#torchdata>`_
|
||||
- Beta library of common modular data loading primitives for easily
|
||||
constructing flexible and performant data pipelines, with features still
|
||||
@@ -399,40 +407,7 @@ with ROCm.
|
||||
|
||||
**Note:** Only official release exists.
|
||||
|
||||
Key features and enhancements for PyTorch 2.9 with ROCm 7.1.1
|
||||
================================================================================
|
||||
- Scaled Dot Product Attention (SDPA) upgraded to use AOTriton version 0.11b.
|
||||
|
||||
- Default hipBLASLt support enabled for gfx908 architecture on ROCm 6.3 and later.
|
||||
|
||||
- MIOpen now supports channels last memory format for 3D convolutions and batch normalization.
|
||||
|
||||
- NHWC convolution operations in MIOpen optimized by eliminating unnecessary transpose operations.
|
||||
|
||||
- Improved tensor.item() performance by removing redundant synchronization.
|
||||
|
||||
- Enhanced performance for element-wise operations and reduction kernels.
|
||||
|
||||
- Added support for grouped GEMM operations through fbgemm_gpu generative AI components.
|
||||
|
||||
- Resolved device error in Inductor when using CUDA graph trees with HIP.
|
||||
|
||||
- Corrected logsumexp scaling in AOTriton-based SDPA implementation.
|
||||
|
||||
- Added stream graph capture status validation in memory copy synchronization functions.
|
||||
|
||||
Key features and enhancements for PyTorch 2.8 with ROCm 7.1
|
||||
================================================================================
|
||||
|
||||
- MIOpen deep learning optimizations: Further optimized NHWC BatchNorm feature.
|
||||
|
||||
- Added float8 support for the DeepSpeed extension, allowing for decreased
|
||||
memory footprint and increased throughput in training and inference workloads.
|
||||
|
||||
- ``torch.nn.functional.scaled_dot_product_attention`` now calling optimized
|
||||
flash attention kernel automatically.
|
||||
|
||||
Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
Key features and enhancements for PyTorch 2.7 with ROCm 7.0
|
||||
================================================================================
|
||||
|
||||
- Enhanced TunableOp framework: Introduces ``tensorfloat32`` support for
|
||||
@@ -467,6 +442,10 @@ Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
ROCm-specific test conditions, and enhanced unit test coverage for Flash
|
||||
Attention and Memory Efficient operations.
|
||||
|
||||
- Build system and infrastructure improvements: Provides updated CentOS Stream 9
|
||||
support, improved Docker configuration, migration to public MAGMA repository,
|
||||
and enhanced QA automation scripts for PyTorch unit testing.
|
||||
|
||||
- Composable Kernel (CK) updates: Features updated CK submodule integration with
|
||||
the latest optimizations and performance improvements for core mathematical
|
||||
operations.
|
||||
@@ -488,7 +467,7 @@ Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
network training or inference. For AMD platforms, ``amdclang++`` has been
|
||||
validated as the supported compiler for building these extensions.
|
||||
|
||||
Known issues and notes for PyTorch 2.7/2.8 with ROCm 7.0 and ROCm 7.1
|
||||
Known issues and notes for PyTorch 2.7 with ROCm 7.0
|
||||
================================================================================
|
||||
|
||||
- The ``matmul.allow_fp16_reduced_precision_reduction`` and
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Ray compatibility
|
||||
:keywords: GPU, Ray, deep learning, framework compatibility
|
||||
:description: Ray deep learning framework compatibility
|
||||
:keywords: GPU, Ray compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -19,36 +19,37 @@ simplifying machine learning computations.
|
||||
Ray is a general-purpose framework that runs many types of workloads efficiently.
|
||||
Any Python application can be scaled with Ray, without extra infrastructure.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
ROCm support for Ray is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- The ROCm-supported version of Ray is maintained in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`__ repository, which differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`__ upstream repository.
|
||||
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`_ repository.
|
||||
|
||||
- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`,
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
|
||||
|
||||
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels
|
||||
<https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for instructions to get started.
|
||||
|
||||
- See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_
|
||||
in the upstream Ray documentation.
|
||||
|
||||
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
|
||||
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for installation and setup instructions.
|
||||
.. note::
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Ray is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
Ray is supported on ROCm 6.4.1.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI210
|
||||
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
@@ -87,15 +88,15 @@ Docker image compatibility
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub.
|
||||
Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
associated inventories represent the latest Ray version from the official Docker Hub and are validated for
|
||||
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
|
||||
icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Ray
|
||||
- Pytorch
|
||||
- Ubuntu
|
||||
@@ -104,7 +105,6 @@ Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
|
||||
- 2.6.0+git684f6f2
|
||||
- 24.04
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Stanford Megatron-LM compatibility
|
||||
:keywords: Stanford, Megatron-LM, deep learning, framework compatibility
|
||||
:keywords: Stanford, Megatron-LM, compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,50 +10,34 @@
|
||||
Stanford Megatron-LM compatibility
|
||||
********************************************************************************
|
||||
|
||||
Stanford Megatron-LM is a large-scale language model training framework developed
|
||||
by NVIDIA at `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_.
|
||||
It is designed to train massive transformer-based language models efficiently by model
|
||||
and data parallelism.
|
||||
Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
|
||||
designed to train massive transformer-based language models efficiently by model and data parallelism.
|
||||
|
||||
It provides efficient tensor, pipeline, and sequence-based model parallelism for
|
||||
pre-training transformer-based language models such as GPT (Decoder Only), BERT
|
||||
(Encoder Only), and T5 (Encoder-Decoder).
|
||||
* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled.
|
||||
* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
|
||||
|
||||
Support overview
|
||||
.. note::
|
||||
|
||||
Stanford Megatron-LM is supported on ROCm 6.3.0.
|
||||
|
||||
|
||||
Supported Devices
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of Stanford Megatron-LM is maintained in the official `https://github.com/ROCm/Stanford-Megatron-LM
|
||||
<https://github.com/ROCm/Stanford-Megatron-LM>`__ repository, which differs from the
|
||||
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
|
||||
- **Officially Supported**: AMD Instinct MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
|
||||
|
||||
- To get started and install Stanford Megatron-LM on ROCm, use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>`,
|
||||
which includes ROCm, Stanford Megatron-LM, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Stanford Megatron-LM is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
|
||||
|
||||
Supported models and features
|
||||
--------------------------------------------------------------------------------
|
||||
================================================================================
|
||||
|
||||
This section details models & features that are supported by the ROCm version on Stanford Megatron-LM.
|
||||
|
||||
Models:
|
||||
|
||||
* BERT
|
||||
* Bert
|
||||
* GPT
|
||||
* T5
|
||||
* ICT
|
||||
@@ -70,24 +54,13 @@ Features:
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The following blog post mentions Megablocks, but you can run Stanford Megatron-LM with the same steps to pre-process datasets on AMD GPUs:
|
||||
See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ post
|
||||
to leverage the ROCm platform for pre-training by using the Stanford Megatron-LM framework of pre-processing datasets on AMD GPUs.
|
||||
Coverage includes:
|
||||
|
||||
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
|
||||
blog post guides how to leverage the ROCm platform for pre-training using the
|
||||
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
|
||||
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
|
||||
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
|
||||
training. The guide provides step-by-step instructions for setting up the environment,
|
||||
including cloning the repository, building the Docker image, and running the training container.
|
||||
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
|
||||
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
|
||||
training workflows for large-scale transformer models.
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
.. _megatron-lm-docker-compat:
|
||||
|
||||
@@ -98,9 +71,10 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
|
||||
inventories represent the latest Megatron-LM version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
@@ -108,7 +82,6 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Stanford Megatron-LM
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
@@ -118,7 +91,6 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Taichi compatibility
|
||||
:keywords: GPU, Taichi, deep learning, framework compatibility
|
||||
:keywords: GPU, Taichi compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -19,52 +19,28 @@ Taichi is widely used across various domains, including real-time physical simul
|
||||
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
|
||||
visual effects in film and gaming, and general-purpose computing.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
|
||||
* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.
|
||||
|
||||
- The ROCm-supported version of Taichi is maintained in the official `https://github.com/ROCm/taichi
|
||||
<https://github.com/ROCm/taichi>`__ repository, which differs from the
|
||||
`https://github.com/taichi-dev/taichi <https://github.com/taichi-dev/taichi>`__ upstream repository.
|
||||
.. note::
|
||||
|
||||
- To get started and install Taichi on ROCm, use the prebuilt :ref:`Docker image <taichi-docker-compat>`,
|
||||
which includes ROCm, Taichi, and all required dependencies.
|
||||
Taichi is supported on ROCm 6.3.2.
|
||||
|
||||
- See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/taichi-dev/taichi>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Taichi is supported on `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI250X, MI210X (with the exception of Taichi’s GPU rendering system, CGUI)
|
||||
- **Upcoming Support**: AMD Instinct™ MI300X
|
||||
Supported devices and features
|
||||
===============================================================================
|
||||
There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X Series GPUs with the exception of Taichi’s GPU rendering system, CGUI.
|
||||
AMD Instinct MI300X Series GPUs will be supported by November.
|
||||
|
||||
.. _taichi-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Accelerating Parallel Programming in Python with Taichi Lang on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/taichi/README.html>`__
|
||||
blog highlights Taichi as an open-source programming language designed for high-performance
|
||||
numerical computation, particularly in domains like real-time physical simulation,
|
||||
artificial intelligence, computer vision, robotics, and visual effects. Taichi
|
||||
is embedded in Python and uses just-in-time (JIT) compilation frameworks like
|
||||
LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility
|
||||
of Taichi in enabling complex simulations and numerical algorithms, making
|
||||
it ideal for developers working on compute-intensive tasks. Developers are
|
||||
encouraged to follow recommended coding patterns and utilize Taichi decorators
|
||||
for performance optimization, with examples available in the `https://github.com/ROCm/taichi_examples
|
||||
<https://github.com/ROCm/taichi_examples>`_ repository. Prebuilt Docker images
|
||||
integrating ROCm, PyTorch, and Taichi are provided for simplified installation
|
||||
and deployment, making it easier to leverage Taichi for advanced computational workloads.
|
||||
To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators.
|
||||
A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository,
|
||||
providing practical insights and foundational knowledge for working with the Taichi programming language.
|
||||
You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
|
||||
|
||||
.. _taichi-docker-compat:
|
||||
|
||||
@@ -76,8 +52,9 @@ Docker image compatibility
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories
|
||||
represent the latest Taichi version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow, deep learning, framework compatibility
|
||||
:keywords: GPU, TensorFlow compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -12,33 +12,37 @@ TensorFlow compatibility
|
||||
|
||||
`TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
|
||||
solving machine learning, deep learning, and AI problems. It can solve many
|
||||
problems across different sectors and industries, but primarily focuses on
|
||||
neural network training and inference. It is one of the most popular deep
|
||||
learning frameworks and is very active in open-source development.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of TensorFlow is maintained in the official `https://github.com/ROCm/tensorflow-upstream
|
||||
<https://github.com/ROCm/tensorflow-upstream>`__ repository, which differs from the
|
||||
`https://github.com/tensorflow/tensorflow <https://github.com/tensorflow/tensorflow>`__ upstream repository.
|
||||
|
||||
- To get started and install TensorFlow on ROCm, use the prebuilt :ref:`Docker images <tensorflow-docker-compat>`,
|
||||
which include ROCm, TensorFlow, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
problems across different sectors and industries but primarily focuses on
|
||||
neural network training and inference. It is one of the most popular and
|
||||
in-demand frameworks and is very active in open-source contribution and
|
||||
development.
|
||||
|
||||
The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
|
||||
includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
|
||||
<http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
|
||||
fixes, updates, and support for the latest ROCm versions.
|
||||
fixes, updates, and support for the latest ROCM versions.
|
||||
|
||||
- ROCm TensorFlow release:
|
||||
|
||||
- Offers :ref:`Docker images <tensorflow-docker-compat>` with
|
||||
ROCm and TensorFlow pre-installed.
|
||||
|
||||
- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__
|
||||
|
||||
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
to get started.
|
||||
|
||||
- Official TensorFlow release:
|
||||
|
||||
- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__
|
||||
|
||||
- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.
|
||||
|
||||
.. note::
|
||||
|
||||
The official TensorFlow documentation does not cover ROCm support. Use the
|
||||
ROCm documentation for installation instructions for Tensorflow on ROCm.
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/tensorflow-install`.
|
||||
|
||||
.. _tensorflow-docker-compat:
|
||||
|
||||
@@ -136,7 +140,7 @@ The following section maps supported data types and GPU-accelerated TensorFlow
|
||||
features to their minimum supported ROCm and TensorFlow versions.
|
||||
|
||||
Data types
|
||||
---------------
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The data type of a tensor is specified using the ``dtype`` attribute or
|
||||
argument, and TensorFlow supports a wide range of data types for different use
|
||||
@@ -254,7 +258,7 @@ are as follows:
|
||||
- 1.7
|
||||
|
||||
Features
|
||||
---------------
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This table provides an overview of key features in TensorFlow and their
|
||||
availability in ROCm.
|
||||
@@ -346,7 +350,7 @@ availability in ROCm.
|
||||
- 1.9.2
|
||||
|
||||
Distributed library features
|
||||
-----------------------------------
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Enables developers to scale computations across multiple devices on a single machine or
|
||||
across multiple machines.
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: verl compatibility
|
||||
:keywords: GPU, verl, deep learning, framework compatibility
|
||||
:keywords: GPU, verl compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,58 +10,24 @@
|
||||
verl compatibility
|
||||
*******************************************************************************
|
||||
|
||||
Volcano Engine Reinforcement Learning for LLMs (`verl <https://verl.readthedocs.io/en/latest/>`__)
|
||||
is a reinforcement learning framework designed for large language models (LLMs).
|
||||
verl offers a scalable, open-source fine-tuning solution by using a hybrid programming model
|
||||
that makes it easy to define and run complex post-training dataflows efficiently.
|
||||
Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs).
|
||||
verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
|
||||
|
||||
Its modular APIs separate computation from data, allowing smooth integration with other frameworks.
|
||||
It also supports flexible model placement across GPUs for efficient scaling on different cluster sizes.
|
||||
verl achieves high training and generation throughput by building on existing LLM frameworks.
|
||||
Its 3D-HybridEngine reduces memory use and communication overhead when switching between training
|
||||
and inference, improving overall performance.
|
||||
* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl.
|
||||
* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
|
||||
* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled.
|
||||
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to install and get started.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
.. note::
|
||||
|
||||
- The ROCm-supported version of verl is maintained in the official `https://github.com/ROCm/verl
|
||||
<https://github.com/ROCm/verl>`__ repository, which differs from the
|
||||
`https://github.com/volcengine/verl <https://github.com/volcengine/verl>`__ upstream repository.
|
||||
|
||||
- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`,
|
||||
which includes ROCm, verl, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
verl is supported on `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X
|
||||
verl is supported on ROCm 6.2.0.
|
||||
|
||||
.. _verl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The benefits of verl in large-scale reinforcement learning from human feedback
|
||||
(RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD
|
||||
GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog. The blog post outlines how the Volcano Engine Reinforcement Learning
|
||||
(verl) framework integrates with the AMD ROCm platform to optimize training on
|
||||
Instinct™ MI300X GPUs. The guide details the process of building a Docker image,
|
||||
setting up single-node and multi-node training environments, and highlights
|
||||
performance benchmarks demonstrating improved throughput and convergence accuracy.
|
||||
This resource serves as a comprehensive starting point for deploying verl on AMD GPUs,
|
||||
facilitating efficient RLHF training workflows.
|
||||
The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.
|
||||
|
||||
.. _verl-supported_features:
|
||||
|
||||
@@ -95,10 +61,8 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest verl version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the available verl versions from the official Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
@@ -34,7 +34,7 @@ Runtime
|
||||
|
||||
```{code-block} shell
|
||||
:caption: Example to expose the 1. device and a device based on UUID.
|
||||
export ROCR_VISIBLE_DEVICES="0,GPU-4b2c1a9f-8d3e-6f7a-b5c9-2e4d8a1f6c3b"
|
||||
export ROCR_VISIBLE_DEVICES="0,GPU-DEADBEEFDEADBEEF"
|
||||
```
|
||||
|
||||
### `GPU_DEVICE_ORDINAL`
|
||||
|
||||
23
docs/conf.py
23
docs/conf.py
@@ -8,7 +8,6 @@ import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from subprocess import run
|
||||
|
||||
gh_release_path = os.path.join("..", "RELEASE.md")
|
||||
gh_changelog_path = os.path.join("..", "CHANGELOG.md")
|
||||
@@ -81,27 +80,24 @@ latex_elements = {
|
||||
}
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "rocm.docs.amd.com")
|
||||
html_context = {"docs_header_version": "7.1.1"}
|
||||
html_context = {}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
# Check if the branch is a docs/ branch
|
||||
official_branch = run(["git", "rev-parse", "--abbrev-ref", "HEAD"], capture_output=True, text=True).stdout.find("docs/")
|
||||
|
||||
# configurations for PDF output by Read the Docs
|
||||
project = "ROCm Documentation"
|
||||
project_path = os.path.abspath(".").replace("\\", "/")
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "7.1.1"
|
||||
release = "7.1.1"
|
||||
version = "7.0.2"
|
||||
release = "7.0.2"
|
||||
setting_all_article_info = True
|
||||
all_article_info_os = ["linux", "windows"]
|
||||
all_article_info_author = ""
|
||||
|
||||
# pages with specific settings
|
||||
article_pages = [
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-11-26"},
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-10-10"},
|
||||
{"file": "release/changelog", "os": ["linux"],},
|
||||
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
|
||||
@@ -206,7 +202,7 @@ external_toc_path = "./sphinx/_toc.yml"
|
||||
# Add the _extensions directory to Python's search path
|
||||
sys.path.append(str(Path(__file__).parent / 'extension'))
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "remote-content", "version-ref", "csv-to-list-table"]
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "version-ref", "csv-to-list-table"]
|
||||
|
||||
compatibility_matrix_file = str(Path(__file__).parent / 'compatibility/compatibility-matrix-historical-6.0.csv')
|
||||
|
||||
@@ -216,14 +212,10 @@ external_projects_current_project = "rocm"
|
||||
# external_projects_remote_repository = ""
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://rocm-stg.amd.com/")
|
||||
html_context = {"docs_header_version": "7.1.0"}
|
||||
html_context = {}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
html_context["official_branch"] = official_branch
|
||||
html_context["version"] = version
|
||||
html_context["release"] = release
|
||||
|
||||
html_theme = "rocm_docs_theme"
|
||||
html_theme_options = {"flavor": "rocm-docs-home"}
|
||||
|
||||
@@ -249,6 +241,3 @@ html_context = {
|
||||
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
|
||||
"scope_type" : [('Device', 'device'), ('System', 'system')]
|
||||
}
|
||||
|
||||
# Disable figure and table numbering
|
||||
numfig = False
|
||||
|
||||
@@ -1,316 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 790d22168820507f3105fef29596549378cfe399
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_seqs: 1024
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32b
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
@@ -1,13 +1,13 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
|
||||
vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
|
||||
commit: 790d22168820507f3105fef29596549378cfe399
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
from docutils import nodes
|
||||
from docutils.parsers.rst import Directive
|
||||
from docutils.statemachine import ViewList
|
||||
from sphinx.util import logging
|
||||
from sphinx.util.nodes import nested_parse_with_titles
|
||||
import requests
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class BranchAwareRemoteContent(Directive):
|
||||
"""
|
||||
Directive that downloads and includes content from other repositories,
|
||||
matching the branch/tag of the current documentation build.
|
||||
|
||||
Usage:
|
||||
.. remote-content::
|
||||
:repo: owner/repository
|
||||
:path: path/to/file.rst
|
||||
:default_branch: docs/develop # Branch to use when not on a release
|
||||
:tag_prefix: Docs/ # Optional
|
||||
"""
|
||||
|
||||
required_arguments = 0
|
||||
optional_arguments = 0
|
||||
final_argument_whitespace = True
|
||||
has_content = False
|
||||
option_spec = {
|
||||
'repo': str,
|
||||
'path': str,
|
||||
'default_branch': str, # Branch to use when not on a release tag
|
||||
'start_line': int, # Include the file from a specific line
|
||||
'tag_prefix': str, # Prefix for release tags (e.g., 'Docs/')
|
||||
}
|
||||
|
||||
def get_current_version(self):
|
||||
"""Get current version/branch being built"""
|
||||
env = self.state.document.settings.env
|
||||
html_context = env.config.html_context
|
||||
|
||||
# Check if building from a tag
|
||||
if "official_branch" in html_context:
|
||||
if html_context["official_branch"] == 0:
|
||||
if "version" in html_context:
|
||||
# Remove any 'v' prefix
|
||||
version = html_context["version"]
|
||||
if re.match(r'^\d+\.\d+\.\d+$', version):
|
||||
return version
|
||||
|
||||
# Not a version tag, so we'll use the default branch
|
||||
return None
|
||||
|
||||
def get_target_ref(self):
|
||||
"""Get target reference for the remote repository"""
|
||||
current_version = self.get_current_version()
|
||||
|
||||
# If it's a version number, use tag prefix and version
|
||||
if current_version:
|
||||
tag_prefix = self.options.get('tag_prefix', '')
|
||||
return f'{tag_prefix}{current_version}'
|
||||
|
||||
# For any other case, use the specified default branch
|
||||
if 'default_branch' not in self.options:
|
||||
logger.warning('No default_branch specified and not building from a version tag')
|
||||
return None
|
||||
|
||||
return self.options['default_branch']
|
||||
|
||||
def construct_raw_url(self, repo, path, ref):
|
||||
"""Construct the raw.githubusercontent.com URL"""
|
||||
return f'https://raw.githubusercontent.com/{repo}/{ref}/{path}'
|
||||
|
||||
def fetch_and_parse_content(self, url, source_path):
|
||||
"""Fetch content and parse it as RST"""
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
content = response.text
|
||||
|
||||
start_line = self.options.get('start_line', 0)
|
||||
|
||||
# Create ViewList for parsing
|
||||
line_count = 0
|
||||
content_list = ViewList()
|
||||
for line_no, line in enumerate(content.splitlines()):
|
||||
if line_count >= start_line:
|
||||
content_list.append(line, source_path, line_no)
|
||||
line_count+=1
|
||||
|
||||
# Create a section node and parse content
|
||||
node = nodes.section()
|
||||
nested_parse_with_titles(self.state, content_list, node)
|
||||
|
||||
return node.children
|
||||
|
||||
def run(self):
|
||||
if 'repo' not in self.options or 'path' not in self.options:
|
||||
logger.warning('Both repo and path options are required')
|
||||
return []
|
||||
|
||||
target_ref = self.get_target_ref()
|
||||
if not target_ref:
|
||||
return []
|
||||
|
||||
raw_url = self.construct_raw_url(
|
||||
self.options['repo'],
|
||||
self.options['path'],
|
||||
target_ref
|
||||
)
|
||||
|
||||
try:
|
||||
logger.info(f'Attempting to fetch content from {raw_url}')
|
||||
return self.fetch_and_parse_content(raw_url, self.options['path'])
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.warning(f'Failed to fetch content from {raw_url}: {str(e)}')
|
||||
|
||||
# If we failed on a tag, try falling back to default_branch
|
||||
if re.match(r'^\d+\.\d+\.\d+$', target_ref) or target_ref.startswith('Docs/'):
|
||||
if 'default_branch' in self.options:
|
||||
try:
|
||||
fallback_ref = self.options['default_branch']
|
||||
logger.info(f'Attempting fallback to {fallback_ref}...')
|
||||
|
||||
fallback_url = self.construct_raw_url(
|
||||
self.options['repo'],
|
||||
self.options['path'],
|
||||
fallback_ref
|
||||
)
|
||||
|
||||
return self.fetch_and_parse_content(fallback_url, self.options['path'])
|
||||
except requests.exceptions.RequestException as e2:
|
||||
logger.warning(f'Fallback also failed: {str(e2)}')
|
||||
|
||||
return []
|
||||
|
||||
def setup(app):
|
||||
app.add_directive('remote-content', BranchAwareRemoteContent)
|
||||
|
||||
return {
|
||||
'parallel_read_safe': True,
|
||||
'parallel_write_safe': True,
|
||||
}
|
||||
@@ -84,8 +84,6 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-wheels-package>`__
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -15,9 +15,10 @@ using PyTorch. It delves into specific workloads such as
|
||||
:ref:`model inference <mi300x-vllm-optimization>`, offering strategies to
|
||||
enhance efficiency.
|
||||
|
||||
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>` as
|
||||
well as :ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>`
|
||||
for meticulous tuning.
|
||||
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>`
|
||||
that streamline optimization as well as advanced techniques like
|
||||
:ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>` for
|
||||
meticulous tuning.
|
||||
|
||||
Workload tuning strategy
|
||||
========================
|
||||
@@ -85,28 +86,27 @@ Optimize model inference with vLLM
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
vLLM provides tools and techniques specifically designed for efficient model
|
||||
inference on AMD Instinct GPUs. See the official `vLLM installation docs
|
||||
<https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html>`__ for
|
||||
installation guidance. Optimizing performance with vLLM involves configuring
|
||||
tensor parallelism, leveraging advanced features, and ensuring efficient
|
||||
execution.
|
||||
inference on AMD Instinct MI300X GPUs. See :ref:`fine-tuning-llms-vllm`
|
||||
for installation guidance. Optimizing performance with vLLM
|
||||
involves configuring tensor parallelism, leveraging advanced features, and
|
||||
ensuring efficient execution. Here’s how to optimize vLLM performance:
|
||||
|
||||
* Configuration for vLLM: Set engine arguments according to workload
|
||||
requirements.
|
||||
* Tensor parallelism: Configure the
|
||||
:ref:`tensor-parallel-size parameter <mi300x-vllm-multiple-gpus>` to distribute
|
||||
tensor computations across multiple GPUs. Adjust parameters such as
|
||||
``batch-size``, ``input-len``, and ``output-len`` based on your workload.
|
||||
|
||||
* Configuration for vLLM: Set :ref:`parameters <mi300x-vllm-optimization>`
|
||||
according to workload requirements. Benchmark performance to understand
|
||||
characteristics and identify bottlenecks.
|
||||
|
||||
* Benchmarking and performance metrics: Measure latency and throughput to
|
||||
evaluate performance.
|
||||
|
||||
.. seealso::
|
||||
|
||||
See :doc:`vllm-optimization` to learn more about vLLM performance
|
||||
optimization techniques.
|
||||
|
||||
.. _mi300x-auto-tune:
|
||||
|
||||
Auto-tunable configurations
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Auto-tunable configurations can significantly streamline performance
|
||||
optimization by automatically adjusting parameters based on workload
|
||||
characteristics. For example:
|
||||
@@ -120,7 +120,8 @@ characteristics. For example:
|
||||
your specific hardware.
|
||||
|
||||
* Triton: Use :ref:`Triton’s auto-tuning features <mi300x-autotunable-kernel-config>`
|
||||
to explore various kernel configurations and select the best-performing ones.
|
||||
to explore various kernel configurations and automatically select the
|
||||
best-performing ones.
|
||||
|
||||
Manual tuning
|
||||
^^^^^^^^^^^^^
|
||||
@@ -327,21 +328,380 @@ hardware counters are also included.
|
||||
|
||||
ROCm Systems Profiler timeline trace example.
|
||||
|
||||
.. _mi300x-vllm-optimization:
|
||||
|
||||
vLLM performance optimization
|
||||
=============================
|
||||
|
||||
vLLM is a high-throughput and memory efficient inference and serving engine for
|
||||
large language models that has gained traction in the AI community for its
|
||||
performance and ease of use. See :doc:`vllm-optimization`, where you'll learn
|
||||
how to:
|
||||
vLLM is a high-throughput and memory efficient inference and serving engine for large language models that has gained traction in the AI community for
|
||||
its performance and ease of use. See :ref:`fine-tuning-llms-vllm` for a primer on vLLM with ROCm.
|
||||
|
||||
Performance environment variables
|
||||
---------------------------------
|
||||
|
||||
The following performance tips are not *specific* to vLLM -- they are general
|
||||
but relevant in this context. You can tune the following vLLM parameters to
|
||||
achieve optimal request latency and throughput performance.
|
||||
|
||||
* As described in `Environment variables (MI300X)
|
||||
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
|
||||
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
|
||||
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
|
||||
|
||||
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
|
||||
to ``112`` to increase the number of channels on MI300X to potentially improve
|
||||
performance.
|
||||
|
||||
* Set the environment variable ``TORCH_BLAS_PREFER_HIPBLASLT=1`` to use hipBLASLt to improve performance.
|
||||
|
||||
Auto-tuning using PyTorch TunableOp
|
||||
------------------------------------
|
||||
|
||||
Since vLLM is based on the PyTorch framework, PyTorch TunableOp can be used for auto-tuning.
|
||||
You can run auto-tuning with TunableOp in two simple steps without modifying your code:
|
||||
|
||||
* Enable TunableOp and tuning. Optionally, enable verbose mode:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_VERBOSE=1 your_vllm_script.sh
|
||||
|
||||
* Enable TunableOp and disable tuning and measure.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_vllm_script.sh
|
||||
|
||||
Learn more about TunableOp in the :ref:`PyTorch TunableOp <mi300x-tunableop>` section.
|
||||
|
||||
Performance tuning based on vLLM engine configurations
|
||||
-------------------------------------------------------
|
||||
|
||||
The following subsections describe vLLM-specific configurations for performance tuning.
|
||||
You can tune the following vLLM parameters to achieve optimal performance.
|
||||
|
||||
* ``tensor_parallel_size``
|
||||
|
||||
* ``gpu_memory_utilization``
|
||||
|
||||
* ``dtype``
|
||||
|
||||
* ``enforce_eager``
|
||||
|
||||
* ``kv_cache_dtype``
|
||||
|
||||
* ``input_len``
|
||||
|
||||
* ``output_len``
|
||||
|
||||
* ``max_num_seqs``
|
||||
|
||||
* ``num_scheduler_steps``
|
||||
|
||||
* ``max_model_len``
|
||||
|
||||
* ``enable_chunked_prefill``
|
||||
|
||||
* ``distributed_executor_backend``
|
||||
|
||||
* ``max_seq_len_to_capture``
|
||||
|
||||
Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.html>`_
|
||||
for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
|
||||
usage with ROCm.
|
||||
|
||||
ROCm provides a prebuilt optimized Docker image for validating the performance
|
||||
of LLM inference with vLLM on MI300X Series GPUs. The Docker image includes
|
||||
ROCm, vLLM, and PyTorch. For more information, see
|
||||
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
|
||||
|
||||
.. _mi300x-vllm-throughput-measurement:
|
||||
|
||||
Evaluating performance by throughput measurement
|
||||
-------------------------------------------------
|
||||
|
||||
This tuning guide evaluates the performance of LLM inference workloads by measuring throughput in tokens per second (TPS). Throughput can be assessed using both real-world and synthetic data, depending on your evaluation goals.
|
||||
|
||||
Refer to the benchmarking script located at ``benchmarks/benchmark_throughput.py`` in the `vLLM repository <https://github.com/ROCm/vllm/blob/main/benchmarks/benchmark_throughput.py>`_.
|
||||
Use this script to measure throughput effectively. You can assess throughput using real-world and synthetic data, depending on your evaluation goals.
|
||||
|
||||
* For realistic performance evaluation, you can use datasets like Hugging Face's
|
||||
``ShareGPT_V3_unfiltered_cleaned_split.json``. This dataset includes real-world conversational
|
||||
data, making it a good representation of typical use cases for language models. Download it using
|
||||
the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
* For standardized benchmarking, you can set fixed input and output token
|
||||
lengths. Synthetic prompts provide consistent benchmarking runs, making it
|
||||
easier to compare performance across different models or configurations.
|
||||
Additionally, a controlled environment simplifies analysis.
|
||||
|
||||
By balancing real-world data and synthetic data approaches, you can get a well-rounded understanding of model performance in varied scenarios.
|
||||
|
||||
.. _mi300x-vllm-single-node:
|
||||
|
||||
Maximizing vLLM instances on a single node
|
||||
------------------------------------------
|
||||
|
||||
The general guideline is to maximize per-node throughput by running as many vLLM instances as possible.
|
||||
However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance.
|
||||
|
||||
The Instinct MI300X GPU is equipped with 192 GB of HBM3 memory capacity and bandwidth.
|
||||
For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances
|
||||
simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment
|
||||
variable ``CUDA_VISIBLE_DEVICES``.
|
||||
|
||||
For example, this script runs eight instances of vLLM for throughput benchmarking at the same time
|
||||
with a model that can fit in one GPU:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
for i in $(seq 0 7);
|
||||
do
|
||||
CUDA_VISIBLE_DEVICES="$i" python3 /app/vllm/benchmarks/benchmark_throughput.py -tp 1 --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
done
|
||||
|
||||
The total throughput achieved by running ``N`` instances of vLLM is generally much higher than running a
|
||||
single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or
|
||||
using the ``-tp`` N option, where ``1 < N ≤ 8``).
|
||||
|
||||
vLLM on MI300X GPUs can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
|
||||
Notable configurations include Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model can fit on one single node with 8 MI300X GPUs.
|
||||
|
||||
.. _mi300x-vllm-gpu-memory-utilization:
|
||||
|
||||
Configure the gpu_memory_utilization parameter
|
||||
----------------------------------------------
|
||||
|
||||
There are two ways to increase throughput by configuring ``gpu-memory-utilization`` parameter.
|
||||
|
||||
1. Increase ``gpu-memory-utilization`` to improve the throughput for a single instance as long as
|
||||
it does not incur HIP or CUDA Out Of Memory. The default ``gpu-memory-utilization`` is 0.9.
|
||||
You can set it to ``>0.9`` and ``<1``.
|
||||
|
||||
For example, below benchmarking command set the ``gpu-memory-utilization`` as 0.98, or 98%.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
/vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.98 --input-len 1024 --output-len 128 --model /path/to/model
|
||||
|
||||
2. Decrease ``gpu-memory-utilization`` to maximize the number of vLLM instances on the same GPU.
|
||||
|
||||
Specify GPU memory utilization to run as many instances of vLLM as possible on a single
|
||||
GPU. However, too many instances can result in no memory for KV-cache. For small models, run
|
||||
multiple instances of vLLM on the same GPU by specifying a smaller ``gpu-memory-utilization`` -- as
|
||||
long as it would not cause HIP Out Of Memory.
|
||||
|
||||
For example, run two instances of the Llama3-8b model at the same time on a single GPU by specifying
|
||||
``--gpu-memory-utilization`` to 0.4 (40%) as follows (on GPU ``0``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4
|
||||
--dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4
|
||||
--dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
|
||||
See :ref:`vllm-engine-args` for other performance suggestions.
|
||||
|
||||
.. _mi300x-vllm-multiple-gpus:
|
||||
|
||||
Run vLLM on multiple GPUs
|
||||
-------------------------
|
||||
|
||||
The two main reasons to use multiple GPUs are:
|
||||
|
||||
* The model size is too big to run vLLM using one GPU as it results HIP Out of Memory.
|
||||
|
||||
* To achieve better latency when using a single GPU is not desirable.
|
||||
|
||||
To run one vLLM instance on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` option to
|
||||
specify multiple GPUs. Optionally, use the ``CUDA_VISIBLE_DEVICES`` environment variable to specify
|
||||
the GPUs.
|
||||
|
||||
For example, you can use two GPUs to start an API server on port 8000:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python -m vllm.entrypoints.api_server --model /path/to/model --dtype
|
||||
float16 -tp 2 --port 8000 &
|
||||
|
||||
To achieve both latency and throughput performance for serving, you can run multiple API servers on
|
||||
different GPUs by specifying different ports for each server and use ``CUDA_VISIBLE_DEVICES`` to
|
||||
specify the GPUs for each server, for example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model
|
||||
/path/to/model --dtype float16 -tp 2 --port 8000 &
|
||||
|
||||
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model
|
||||
/path/to/model --dtype float16 -tp 2 --port 8001 &
|
||||
|
||||
Choose an attention backend
|
||||
---------------------------
|
||||
|
||||
vLLM on ROCm supports two attention backends, each suitable for different use cases and performance
|
||||
requirements:
|
||||
|
||||
- **Triton Flash Attention** - For benchmarking, run vLLM scripts at
|
||||
least once as a warm-up step so Triton can perform auto-tuning before
|
||||
collecting benchmarking numbers. This is the default setting.
|
||||
|
||||
- **Composable Kernel (CK) Flash Attention** - To use CK Flash Attention, specify
|
||||
the environment variable as ``export VLLM_USE_TRITON_FLASH_ATTN=0``.
|
||||
|
||||
|
||||
Refer to :ref:`Model acceleration libraries <acceleration-flash-attention>`
|
||||
to learn more about Flash Attention with Triton or CK backends.
|
||||
|
||||
.. _vllm-engine-args:
|
||||
|
||||
vLLM engine arguments
|
||||
---------------------
|
||||
|
||||
The following are configuration suggestions to potentially improve performance with vLLM. See
|
||||
`vLLM's engine arguments documentation <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_
|
||||
for a full list of configurable engine arguments.
|
||||
|
||||
Configure the max-num-seqs parameter
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Increase the ``max-num-seqs`` parameter from the default ``256`` to ``512`` (``--max-num-seqs
|
||||
512``). This increases the maximum number of sequences per iteration and can improve throughput.
|
||||
|
||||
Use the float16 dtype
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The default data type (``dtype``) is specified in the model’s configuration file. For instance, some models use ``torch.bfloat16`` as their default ``dtype``.
|
||||
Use float16 (``--dtype float16``) for better performance.
|
||||
|
||||
Multi-step scheduling
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Setting ``num-scheduler-steps`` for multi-step scheduling can increase performance. Set it between 10 to 15 (``--num-scheduler-steps 10``).
|
||||
|
||||
Distributed executor backend
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The vLLM supports two modes of distributed executor backend: ``ray`` and ``mp``. When using the `<https://github.com/ROCm/vllm>`__ fork, using the ``mp``
|
||||
backend (``--distributed_executor_backend mp``) is recommended.
|
||||
|
||||
Graph mode max-seq-len-to-capture
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Maximum sequence length covered by CUDA graphs. In the default mode (where ``enforce_eager`` is ``False``), when a sequence has context length
|
||||
larger than this, vLLM engine falls back to eager mode. The default is 8192.
|
||||
|
||||
When working with models that support long context lengths, set the parameter ``--max-seq-len-to-capture`` to 16384.
|
||||
See this `vLLM blog <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`__ for details.
|
||||
|
||||
An example of long context length model is Qwen2-7b.
|
||||
|
||||
Whether to enable chunked prefill
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Another vLLM performance tip is to enable chunked prefill to improve
|
||||
throughput. Chunked prefill allows large prefills to be chunked into
|
||||
smaller chunks and batched together with decode requests.
|
||||
|
||||
You can enable the feature by specifying ``--enable-chunked-prefill`` in the
|
||||
command line or setting ``enable_chunked_prefill=True`` in the LLM
|
||||
constructor.
|
||||
|
||||
As stated in `vLLM's documentation, <https://docs.vllm.ai/en/latest/models/performance.html#chunked-prefill>`__,
|
||||
you can tune the performance by changing ``max_num_batched_tokens``. By
|
||||
default, it is set to 512 and optimized for ITL (inter-token latency).
|
||||
Smaller ``max_num_batched_tokens`` achieves better ITL because there are
|
||||
fewer prefills interrupting decodes.
|
||||
Higher ``max_num_batched_tokens`` achieves better TTFT (time to the first
|
||||
token) as you can put more prefill to the batch.
|
||||
|
||||
You might experience noticeable throughput improvements when
|
||||
benchmarking on a single GPU or 8 GPUs using the vLLM throughput
|
||||
benchmarking script along with the ShareGPT dataset as input.
|
||||
|
||||
In the case of fixed ``input-len``/``output-len``, for some configurations,
|
||||
enabling chunked prefill increases the throughput. For some other
|
||||
configurations, the throughput may be worse and elicit a need to tune
|
||||
parameter ``max_num_batched_tokens`` (for example, increasing ``max_num_batched_tokens`` value to 4096 or larger).
|
||||
|
||||
.. note::
|
||||
|
||||
Chunked prefill is no longer recommended. See the vLLM blog: `Serving LLMs on AMD MI300X: Best Practices <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`_ (October 2024).
|
||||
|
||||
Quantization support
|
||||
---------------------
|
||||
|
||||
Quantization reduces the precision of the model’s weights and activations, which significantly decreases the memory footprint.
|
||||
``fp8(w8a8)`` and ``AWQ`` quantization are supported for ROCm.
|
||||
|
||||
FP8 quantization
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
`<https://github.com/ROCm/vllm>`__ supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on the Instinct MI300X.
|
||||
Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
|
||||
|
||||
AMD publishes Quark Quantized OCP FP8 models on Hugging Face. For example:
|
||||
|
||||
* `Llama-3.1-8B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV>`__
|
||||
* `Llama-3.1-70B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`__
|
||||
* `Llama-3.1-405B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`__
|
||||
* `Mixtral-8x7B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`__
|
||||
* `Mixtral-8x22B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`__
|
||||
|
||||
To enable vLLM benchmarking to run on fp8 quantized models, use the ``--quantization`` parameter with value ``fp8`` (``--quantization fp8``).
|
||||
|
||||
AWQ quantization
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
You can quantize your own models by installing AutoAWQ or picking one of the 400+ models on Hugging Face. Be aware that
|
||||
that AWQ support in vLLM is currently underoptimized.
|
||||
|
||||
To enable vLLM to run on ``awq`` quantized models, using ``--quantization`` parameter with ``awq`` (``--quantization awq``).
|
||||
|
||||
You can find more specifics in the `vLLM AutoAWQ documentation <https://docs.vllm.ai/en/stable/quantization/auto_awq.html>`_.
|
||||
|
||||
fp8 kv-cached-dtype
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Using ``fp8 kv-cache dtype`` can improve performance as it reduces the size
|
||||
of ``kv-cache``. As a result, it reduces the cost required for reading and
|
||||
writing the ``kv-cache``.
|
||||
|
||||
To use this feature, specify ``--kv-cache-dtype`` as ``fp8``.
|
||||
|
||||
To specify the quantization scaling config, use the
|
||||
``--quantization-param-path`` parameter. If the parameter is not specified,
|
||||
the default scaling factor of ``1`` is used, which can lead to less accurate
|
||||
results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
|
||||
Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
|
||||
in the vLLM GitHub repository.
|
||||
|
||||
Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
|
||||
``llama2-7b``.
|
||||
|
||||
If building the vLLM using
|
||||
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm>`_
|
||||
for ``llama2-70b`` scale config, find the file at
|
||||
``/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json`` at
|
||||
runtime.
|
||||
|
||||
Below is a sample command to run benchmarking with this feature enabled
|
||||
for the ``llama2-70b`` model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 /vllm-workspace/benchmarks/benchmark_throughput.py --model \
|
||||
/path/to/llama2-70b-model --kv-cache-dtype "fp8" \
|
||||
--quantization-param-path \
|
||||
"/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json" \
|
||||
--input-len 512 --output-len 256 --num-prompts 500
|
||||
|
||||
* Enable AITER (AI Tensor Engine for ROCm) to speed up on LLM models.
|
||||
* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance.
|
||||
* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton).
|
||||
* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments.
|
||||
* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss.
|
||||
* Tune engine arguments (batch size, memory utilization, graph modes) for your use case.
|
||||
* Benchmark and scale across single-node and multi-node configurations.
|
||||
|
||||
.. _mi300x-tunableop:
|
||||
|
||||
@@ -586,33 +946,33 @@ for details.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
|
||||
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
|
||||
--a_type f16_r --b_type f8_r --compute_type f32_f16_r \
|
||||
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
|
||||
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
|
||||
|
||||
* Example 2: Benchmark forward epilogues and backward epilogues
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
|
||||
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU": --activation_type gelu --gradient";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
|
||||
|
||||
|
||||
hipBLASLt auto-tuning using hipblaslt-bench
|
||||
@@ -671,26 +1031,26 @@ The tuning tool is a two-step tool. It first runs the benchmark, then it creates
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
defaultBenchOptions = {"ProblemType": {
|
||||
"TransposeA": 0,
|
||||
"TransposeB": 0,
|
||||
"ComputeInputDataType": "s",
|
||||
"ComputeDataType": "s",
|
||||
"DataTypeC": "s",
|
||||
"DataTypeD": "s",
|
||||
"UseBias": False
|
||||
}, "TestConfig": {
|
||||
"ColdIter": 20,
|
||||
"Iter": 100,
|
||||
"AlgoMethod": "all",
|
||||
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
|
||||
"SolutionIndex": None, # Only works in AlgoMethod index
|
||||
"ApiMethod": "cpp",
|
||||
"RotatingBuffer": 0,
|
||||
}, "TuningParameters": {
|
||||
"SplitK": [0]
|
||||
}, "ProblemSizes": []}
|
||||
defaultCreateLogicOptions = {} # Currently unused
|
||||
defaultBenchOptions = {"ProblemType": {
|
||||
"TransposeA": 0,
|
||||
"TransposeB": 0,
|
||||
"ComputeInputDataType": "s",
|
||||
"ComputeDataType": "s",
|
||||
"DataTypeC": "s",
|
||||
"DataTypeD": "s",
|
||||
"UseBias": False
|
||||
}, "TestConfig": {
|
||||
"ColdIter": 20,
|
||||
"Iter": 100,
|
||||
"AlgoMethod": "all",
|
||||
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
|
||||
"SolutionIndex": None, # Only works in AlgoMethod index
|
||||
"ApiMethod": "cpp",
|
||||
"RotatingBuffer": 0,
|
||||
}, "TuningParameters": {
|
||||
"SplitK": [0]
|
||||
}, "ProblemSizes": []}
|
||||
defaultCreateLogicOptions = {} # Currently unused
|
||||
|
||||
* ``TestConfig``
|
||||
1. ``ColdIter``: This is number the warm-up iterations before starting the kernel benchmark.
|
||||
@@ -870,7 +1230,7 @@ command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
merge.py original_dir new_tuned_yaml_dir output_dir
|
||||
merge.py original_dir new_tuned_yaml_dir output_dir
|
||||
|
||||
The following table describes the logic YAML files.
|
||||
|
||||
@@ -1473,7 +1833,7 @@ de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``.
|
||||
|
||||
From the IR snippet, you can see ``i32`` data is loaded from global memory to
|
||||
registers (``%190``). With a few element-wise operations in registers, it is
|
||||
stored in shared memory (``%269``) for the transpose operation (``%270``), which
|
||||
stored in shared memory (``%269``) for the transpose operation (``%270``), which
|
||||
needs data movement across different threads. With the transpose done, it is
|
||||
loaded from LDS to register again (``%276``), and with a few more
|
||||
element-wise operations, it is stored to LDS again (``%298``). The last step
|
||||
@@ -1607,7 +1967,7 @@ something similar to the following:
|
||||
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
|
||||
|
||||
The kernel name and the code object file should be listed. In the
|
||||
example above, the kernel name is vector_add_assert_trap, but this might
|
||||
example above, the kernel name is vector_add_assert_trap, but this might
|
||||
also look like:
|
||||
|
||||
.. code-block:: text
|
||||
@@ -1721,8 +2081,3 @@ Hardware efficiency is maximized with 4 or fewer HIP streams. These environment
|
||||
configuration to two compute streams and two RCCL streams, aligning with this best practice.
|
||||
Additionally, RCCL is often pre-optimized for MI300 systems in production by querying the node
|
||||
topology during startup, reducing the need for extensive manual tuning.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
* :doc:`vllm-optimization`
|
||||
|
||||
@@ -1,482 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker-930:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
|
||||
prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X
|
||||
GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
|
||||
specifically for AMD data center GPUs and includes the following components:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-930>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||
|
||||
* Added support for AMD Instinct MI355X and MI350X GPUs.
|
||||
|
||||
* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
|
||||
|
||||
* Llama 4 Scout and Maverick
|
||||
|
||||
* DeepSeek R1 0528 FP8
|
||||
|
||||
* MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
|
||||
|
||||
* GPT OSS 20B and 120B
|
||||
|
||||
* Qwen 3 32B, 30B-A3B, and 235B-A22B
|
||||
|
||||
* Removed the deprecated ``--max-seq-len-to-capture`` flag.
|
||||
|
||||
* ``--gpu-memory-utilization`` is now configurable via the `configuration files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
|
||||
repository.
|
||||
|
||||
.. _vllm-benchmark-supported-models-930:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-930:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
documentation might vary by model -- select one to get started. MXFP4 models
|
||||
are only supported on MI355X and MI350X GPUs.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-930:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
|
||||
{% if model.precision == "float4" %}
|
||||
.. important::
|
||||
|
||||
MXFP4 is supported only on MI355X and MI350X GPUs.
|
||||
{% endif %}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
{% if model.precision == "float4" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-930:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-930:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node with the
|
||||
:literal:`{{model.precision}}` data type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-930>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
|
||||
{% if model.tunableop %}
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||
operators to find the fastest one for your hardware.
|
||||
|
||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||
the ``--tunableop on`` argument in your run.
|
||||
|
||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||
performance-collection run.
|
||||
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
For more information on configuration, see the `config files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
|
||||
in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
|
||||
for descriptions of available configuration options
|
||||
and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
|
||||
additional benchmarking information.
|
||||
|
||||
.. rubric:: Launch the container
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ docker.docker_hub_url }}>`_ as shown
|
||||
in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. rubric:: Throughput command
|
||||
|
||||
Use the following command to start the throughput benchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
num_prompts={{ model.config.num_prompts | default(1024) }}
|
||||
in={{ model.config.in | default(128) }}
|
||||
out={{ model.config.in | default(128) }}
|
||||
dtype={{ model.config.dtype | default("auto") }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm bench throughput --model $model \
|
||||
-tp $tp \
|
||||
--num-prompts $num_prompts \
|
||||
--input-len $in \
|
||||
--output-len $out \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--trust-remote-code \
|
||||
--output-json ${model}_throughput.json \
|
||||
--gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
|
||||
|
||||
.. rubric:: Serving command
|
||||
|
||||
1. Start the server using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
dtype={{ model.config.dtype }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs=256
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm serve $model \
|
||||
-tp $tp \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--no-enable-prefix-caching \
|
||||
--swap-space 16 \
|
||||
--disable-log-requests \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9
|
||||
|
||||
Wait until the model has loaded and the server is ready to accept requests.
|
||||
|
||||
2. On another terminal on the same machine, run the benchmark:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Connect to the container
|
||||
docker exec -it test bash
|
||||
|
||||
# Wait for the server to start
|
||||
until curl -s http://localhost:8000/v1/models; do sleep 30; done
|
||||
|
||||
# Run the benchmark
|
||||
model={{ model.model_repo }}
|
||||
max_concurrency=1
|
||||
num_prompts=10
|
||||
in=128
|
||||
out=128
|
||||
vllm bench serve --model $model \
|
||||
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||
--dataset-name random \
|
||||
--ignore-eos \
|
||||
--max-concurrency $max_concurrency \
|
||||
--num-prompts $num_prompts \
|
||||
--random-input-len $in \
|
||||
--random-output-len $out \
|
||||
--trust-remote-code \
|
||||
--save-result \
|
||||
--result-filename ${model}_serving.json
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
|
||||
try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block::
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Advanced usage
|
||||
==============
|
||||
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
|
||||
To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
|
||||
|
||||
1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
|
||||
2. Use the following command to build the image directly from the specified commit.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
.. code-block:: shell
|
||||
|
||||
docker build -f docker/Dockerfile.rocm \
|
||||
--build-arg REMOTE_VLLM=1 \
|
||||
--build-arg VLLM_REPO=https://github.com/ROCm/vllm \
|
||||
--build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
|
||||
-t vllm-rocm .
|
||||
|
||||
.. tip::
|
||||
|
||||
Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
a brief introduction to vLLM and optimization strategies.
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||
@@ -16,23 +16,14 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251024``
|
||||
(latest)
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.11.1
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <../vllm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5>`__
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``
|
||||
(latest)
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.10.2
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <vllm-0.10.2-20251006>`
|
||||
* :doc:`Documentation <../vllm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5>`__
|
||||
|
||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909``
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. _vllm-benchmark-unified-docker-1024:
|
||||
.. _vllm-benchmark-unified-docker-930:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
@@ -34,7 +34,7 @@ vLLM inference performance testing
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-1024>` for
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-930>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
|
||||
@@ -42,13 +42,27 @@ What's new
|
||||
|
||||
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||
|
||||
* Enabled :ref:`AITER <vllm-optimization-aiter-switches>` by default.
|
||||
* Added support for AMD Instinct MI355X and MI350X GPUs.
|
||||
|
||||
* Fixed ``rms_norm`` segfault issue with Qwen 3 235B.
|
||||
* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
|
||||
|
||||
* Known performance degradation on Llama 4 models due to `an upstream vLLM issue <https://github.com/vllm-project/vllm/issues/26320>`_.
|
||||
* Llama 4 Scout and Maverick
|
||||
|
||||
.. _vllm-benchmark-supported-models-1024:
|
||||
* DeepSeek R1 0528 FP8
|
||||
|
||||
* MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
|
||||
|
||||
* GPT OSS 20B and 120B
|
||||
|
||||
* Qwen 3 32B, 30B-A3B, and 235B-A22B
|
||||
|
||||
* Removed the deprecated ``--max-seq-len-to-capture`` flag.
|
||||
|
||||
* ``--gpu-memory-utilization`` is now configurable via the `configuration files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
|
||||
repository.
|
||||
|
||||
.. _vllm-benchmark-supported-models-930:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -58,7 +72,7 @@ Supported models
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-1024:
|
||||
.. _vllm-benchmark-available-models-930:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
@@ -94,7 +108,7 @@ Supported models
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-1024:
|
||||
.. _vllm-benchmark-vllm-930:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -122,7 +136,7 @@ Supported models
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-1024:
|
||||
.. _vllm-benchmark-performance-measurements-930:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -178,7 +192,7 @@ Benchmarking
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-1024:
|
||||
.. _vllm-benchmark-mad-930:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -190,7 +204,7 @@ Benchmarking
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -219,7 +233,7 @@ Benchmarking
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-1024>` are preconfigured to collect
|
||||
<vllm-benchmark-available-models-930>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
@@ -244,7 +258,7 @@ Benchmarking
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
@@ -405,10 +419,6 @@ Advanced usage
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
If you’re using this Docker image on other AMD GPUs such as the AMD Instinct MI200 Series or Radeon, add ``export VLLM_ROCM_USE_AITER=0`` to your command, since AITER is only supported on gfx942 and gfx950 architectures.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ See the `GitHub repository <https://github.com/vllm-project/vllm>`_ and `officia
|
||||
<https://docs.vllm.ai/>`_ for more information.
|
||||
|
||||
For guidance on using vLLM with ROCm, refer to `Installation with ROCm
|
||||
<https://docs.vllm.ai/en/stable/getting_started/installation/gpu.html#amd-rocm>`__.
|
||||
<https://docs.vllm.ai/en/latest/getting_started/amd-installation.html>`_.
|
||||
|
||||
vLLM installation
|
||||
-----------------
|
||||
|
||||
@@ -254,7 +254,7 @@ PyTorch training
|
||||
The ROCm PyTorch Training Docker image now focuses on :doc:`Training a model
|
||||
with Primus and PyTorch <../training/benchmark-docker/primus-pytorch>`. The
|
||||
following example refers to the legacy workflow :ref:`Training a
|
||||
model with PyTorch <amd-pytorch-training-multinode-examples-v259>`.
|
||||
model with PyTorch <amd-pytorch-training-multinode-examples>`.
|
||||
|
||||
1. Download the ``run_multinode_train.sh`` benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/pytorch_train>`__.
|
||||
|
||||
@@ -277,7 +277,7 @@ PyTorch training
|
||||
|
||||
.. seealso::
|
||||
|
||||
See :ref:`Training a model with PyTorch <amd-pytorch-training-multinode-examples-v259>` for more examples and information.
|
||||
See :ref:`Training a model with PyTorch <amd-pytorch-multinode-examples>` for more examples and information.
|
||||
|
||||
Megatron-LM
|
||||
-----------
|
||||
|
||||
@@ -92,7 +92,7 @@ GPUs, which can impact end-to-end latency.
|
||||
.. _healthcheck-install-transferbench:
|
||||
|
||||
1. To get started, use the instructions in the `TransferBench documentation
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`__
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
|
||||
or use the following commands:
|
||||
|
||||
.. code:: shell
|
||||
@@ -102,5 +102,5 @@ GPUs, which can impact end-to-end latency.
|
||||
CC=hipcc make
|
||||
|
||||
2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#transferbench>`__
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
|
||||
in the Instinct performance benchmarking documentation for instructions.
|
||||
|
||||
@@ -14,7 +14,7 @@ Training a model with Megatron-LM on ROCm
|
||||
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
|
||||
including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
|
||||
|
||||
Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
|
||||
To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
|
||||
|
||||
@@ -18,7 +18,7 @@ model training. Performance acceleration is powered by `Primus Turbo
|
||||
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
|
||||
including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
|
||||
|
||||
Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM
|
||||
training <megatron-lm>` workflow. To learn how to migrate workloads from
|
||||
@@ -183,7 +183,7 @@ Configuration
|
||||
=============
|
||||
|
||||
Primus defines a training configuration in YAML for each model in
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/rss/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ Primus now supports the PyTorch torchtitan backend.
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <pytorch-training>` workflow. See
|
||||
|
||||
@@ -14,7 +14,7 @@ Training a model with PyTorch on ROCm
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
|
||||
|
||||
See :doc:`primus-pytorch` for details.
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ In DDP training, each process or worker owns a replica of the model and processe
|
||||
|
||||
See the following developer blogs for more in-depth explanations and examples.
|
||||
|
||||
* `Multi GPU training with DDP — PyTorch Tutorials <https://docs.pytorch.org/tutorials/beginner/ddp_series_multigpu.html>`__
|
||||
* `Multi GPU training with DDP — PyTorch Tutorials <https://pytorch.org/tutorials/beginner/ddp_Series_multigpu.html>`_
|
||||
|
||||
* `Building a decoder transformer model on AMD GPUs — ROCm Blogs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/decoder-transformer/README.html#distributed-training-on-multiple-gpus>`_
|
||||
|
||||
@@ -65,8 +65,6 @@ ROCm documentation is organized into the following categories:
|
||||
* [ROCm libraries](./reference/api-libraries.md)
|
||||
* [ROCm tools, compilers, and runtimes](./reference/rocm-tools.md)
|
||||
* [GPU hardware specifications](./reference/gpu-arch-specs.rst)
|
||||
* [Hardware atomics operation support](./reference/gpu-atomics-operation.rst)
|
||||
* [Environment variables](./reference/env-variables.rst)
|
||||
* [Data types and precision support](./reference/precision-support.rst)
|
||||
* [Graph safe support](./reference/graph-safe-support.rst)
|
||||
<!-- markdownlint-enable MD051 -->
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
.. meta::
|
||||
:description: Environment variables reference
|
||||
:keywords: AMD, ROCm, environment variables, environment, reference, settings
|
||||
|
||||
.. role:: cpp(code)
|
||||
:language: cpp
|
||||
|
||||
.. _env-variables-reference:
|
||||
|
||||
*************************************************************
|
||||
ROCm environment variables
|
||||
*************************************************************
|
||||
|
||||
ROCm provides a set of environment variables that allow users to configure and optimize their development
|
||||
and runtime experience. These variables define key settings such as installation paths, platform selection,
|
||||
and runtime behavior for applications running on AMD accelerators and GPUs.
|
||||
|
||||
This page outlines commonly used environment variables across different components of the ROCm software stack,
|
||||
including HIP and ROCR-Runtime. Understanding these variables can help streamline software development and
|
||||
execution in ROCm-based environments.
|
||||
|
||||
HIP environment variables
|
||||
=========================
|
||||
|
||||
The following tables list the HIP environment variables.
|
||||
|
||||
GPU isolation variables
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
.. remote-content::
|
||||
:repo: ROCm/rocm-systems
|
||||
:path: /projects/hip/docs/reference/env_variables/gpu_isolation_hip_env.rst
|
||||
:default_branch: develop
|
||||
:tag_prefix: docs/
|
||||
|
||||
|
||||
Profiling variables
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
.. remote-content::
|
||||
:repo: ROCm/rocm-systems
|
||||
:path: /projects/hip/docs/reference/env_variables/profiling_hip_env.rst
|
||||
:default_branch: develop
|
||||
:tag_prefix: docs/
|
||||
|
||||
|
||||
Debug variables
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
.. remote-content::
|
||||
:repo: ROCm/rocm-systems
|
||||
:path: /projects/hip/docs/reference/env_variables/debug_hip_env.rst
|
||||
:default_branch: develop
|
||||
:tag_prefix: docs/
|
||||
|
||||
Memory management related variables
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
.. remote-content::
|
||||
:repo: ROCm/rocm-systems
|
||||
:path: /projects/hip/docs/reference/env_variables/memory_management_hip_env.rst
|
||||
:default_branch: develop
|
||||
:tag_prefix: docs/
|
||||
|
||||
Other useful variables
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
.. remote-content::
|
||||
:repo: ROCm/rocm-systems
|
||||
:path: /projects/hip/docs/reference/env_variables/miscellaneous_hip_env.rst
|
||||
:default_branch: develop
|
||||
:tag_prefix: docs/
|
||||
|
||||
ROCR-Runtime environment variables
|
||||
==================================
|
||||
|
||||
The following table lists the ROCR-Runtime environment variables:
|
||||
|
||||
.. remote-content::
|
||||
:repo: ROCm/rocm-systems
|
||||
:path: /projects/rocr-runtime/runtime/docs/data/env_variables.rst
|
||||
:default_branch: develop
|
||||
:tag_prefix: docs/
|
||||
|
||||
HIPCC environment variables
|
||||
===========================
|
||||
|
||||
This topic provides descriptions of the HIPCC environment variables.
|
||||
|
||||
.. remote-content::
|
||||
:repo: ROCm/llvm-project
|
||||
:path: amd/hipcc/docs/env.rst
|
||||
:default_branch: amd-staging
|
||||
:start_line: 14
|
||||
:tag_prefix: docs/
|
||||
|
||||
Environment variables in ROCm libraries
|
||||
=======================================
|
||||
|
||||
Many ROCm libraries define environment variables for specific tuning, debugging,
|
||||
or behavioral control. The table below provides an overview and links to further
|
||||
documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 30, 70
|
||||
|
||||
* - Library
|
||||
- Purpose of Environment Variables
|
||||
|
||||
* - :doc:`hipBLASLt <hipblaslt:reference/env-variables>`
|
||||
- Manage logging, debugging, offline tuning, and stream-K configuration
|
||||
for hipBLASLt.
|
||||
|
||||
* - :doc:`hipSPARSELt <hipsparselt:reference/env-variables>`
|
||||
- Control logging, debugging and performance monitoring of hipSPARSELt.
|
||||
|
||||
* - :doc:`rocBLAS <rocblas:reference/env-variables>`
|
||||
- Performance tuning, kernel selection, logging, and debugging for BLAS
|
||||
operations.
|
||||
|
||||
* - :doc:`rocSolver <rocsolver:reference/env_variables>`
|
||||
- Control logging of rocSolver.
|
||||
|
||||
* - :doc:`rocSPARSE <rocsparse:reference/env_variables>`
|
||||
- Control logging of rocSPARSE.
|
||||
|
||||
* - :doc:`MIGraphX <amdmigraphx:reference/MIGraphX-dev-env-vars>`
|
||||
- Control debugging, testing, and model performance tuning options for
|
||||
MIGraphX.
|
||||
|
||||
* - :doc:`MIOpen <miopen:reference/env_variables>`
|
||||
- Control MIOpen logging and debugging, find mode and algorithm behavior
|
||||
and others.
|
||||
|
||||
* - :doc:`MIVisionX <mivisionx:reference/MIVisionX-env-variables>`
|
||||
- Control core OpenVX, GPU/device and debugging/profiling, stitching and
|
||||
chroma key configurations, file I/O operations, model deployment, and
|
||||
neural network parameters of MIVisionX.
|
||||
|
||||
* - :doc:`RCCL <rccl:api-reference/env-variables>`
|
||||
- Control the logging, debugging, compiler and assembly behavior, and
|
||||
cache of RPP.
|
||||
|
||||
* - :doc:`RPP <rpp:reference/rpp-env-variables>`
|
||||
- Logging, debugging, compiler and assembly management, and cache control in RPP
|
||||
|
||||
* - `Tensile <https://rocm.docs.amd.com/projects/Tensile/en/latest/src/reference/environment-variables.html>`_
|
||||
- Enable testing, debugging, and experimental features for Tensile clients and applications
|
||||
|
||||
Key single-variable details
|
||||
===========================
|
||||
|
||||
This section provides detailed descriptions, in the standard format, for ROCm
|
||||
libraries that feature a single, key environment variable (or a very minimal set)
|
||||
which is documented directly on this page for convenience.
|
||||
|
||||
.. _rocalution-vars-detail:
|
||||
|
||||
rocALUTION
|
||||
----------
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 70,30
|
||||
|
||||
* - Environment variable
|
||||
- Value
|
||||
|
||||
* - | ``ROCALUTION_LAYER``
|
||||
| If set to ``1``, enable file logging. Logs each rocALUTION function call including object constructor/destructor, address of the object, memory allocation, data transfers, all function calls for matrices, vectors, solvers, and preconditioners. The log file is placed in the working directory.
|
||||
- | ``1`` (Enable trace file logging)
|
||||
| Default: Not set.
|
||||
@@ -93,7 +93,7 @@ The following table shows whether a ROCm library is graph-safe.
|
||||
- ⚠️ (experimental)
|
||||
*
|
||||
- `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- ❌
|
||||
- ❌ (see :doc:`details <rocthrust:hipgraph-support>`)
|
||||
*
|
||||
- `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- ❌
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
* {doc}`rocminfo <rocminfo:index>`
|
||||
* {doc}`ROCm SMI <rocm_smi_lib:index>`
|
||||
* {doc}`ROCm Validation Suite <rocmvalidationsuite:index>`
|
||||
* {doc}`Cluster Validation Suite <cvs:index>`
|
||||
:::
|
||||
|
||||
:::{grid-item-card} Performance
|
||||
|
||||
@@ -10,8 +10,6 @@
|
||||
|
||||
| Version | Release date |
|
||||
| ------- | ------------ |
|
||||
| [7.1.1](https://rocm.docs.amd.com/en/docs-7.1.1/) | November 26, 2025 |
|
||||
| [7.1.0](https://rocm.docs.amd.com/en/docs-7.1.0/) | October 30, 2025 |
|
||||
| [7.0.2](https://rocm.docs.amd.com/en/docs-7.0.2/) | October 10, 2025 |
|
||||
| [7.0.1](https://rocm.docs.amd.com/en/docs-7.0.1/) | September 17, 2025 |
|
||||
| [7.0.0](https://rocm.docs.amd.com/en/docs-7.0.0/) | September 16, 2025 |
|
||||
|
||||
@@ -134,8 +134,6 @@ subtrees:
|
||||
title: Profile and debug
|
||||
- file: how-to/rocm-for-ai/inference-optimization/workload.rst
|
||||
title: Workload optimization
|
||||
- file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
|
||||
title: vLLM V1 performance optimization
|
||||
|
||||
- url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
|
||||
title: AI tutorials
|
||||
@@ -216,8 +214,6 @@ subtrees:
|
||||
title: ROCm tools, compilers, and runtimes
|
||||
- file: reference/gpu-arch-specs.rst
|
||||
- file: reference/gpu-atomics-operation.rst
|
||||
- file: reference/env-variables.rst
|
||||
title: Environment variables
|
||||
- file: reference/precision-support.rst
|
||||
title: Data types and precision support
|
||||
- file: reference/graph-safe-support.rst
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
rocm-docs-core==1.30.1
|
||||
rocm-docs-core==1.26.0
|
||||
sphinx-reredirects
|
||||
sphinx-sitemap
|
||||
sphinxcontrib.datatemplates==0.11.0
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements.in
|
||||
# pip-compile docs/sphinx/requirements.in
|
||||
#
|
||||
accessible-pygments==0.0.5
|
||||
# via pydata-sphinx-theme
|
||||
alabaster==1.0.0
|
||||
# via sphinx
|
||||
asttokens==3.0.1
|
||||
asttokens==3.0.0
|
||||
# via stack-data
|
||||
attrs==25.4.0
|
||||
# via
|
||||
@@ -23,21 +23,21 @@ beautifulsoup4==4.14.2
|
||||
# via pydata-sphinx-theme
|
||||
breathe==4.36.0
|
||||
# via rocm-docs-core
|
||||
certifi==2025.11.12
|
||||
certifi==2025.10.5
|
||||
# via requests
|
||||
cffi==2.0.0
|
||||
# via
|
||||
# cryptography
|
||||
# pynacl
|
||||
charset-normalizer==3.4.4
|
||||
charset-normalizer==3.4.3
|
||||
# via requests
|
||||
click==8.3.1
|
||||
click==8.3.0
|
||||
# via
|
||||
# jupyter-cache
|
||||
# sphinx-external-toc
|
||||
comm==0.2.3
|
||||
# via ipykernel
|
||||
cryptography==46.0.3
|
||||
cryptography==46.0.2
|
||||
# via pyjwt
|
||||
debugpy==1.8.17
|
||||
# via ipykernel
|
||||
@@ -50,7 +50,7 @@ docutils==0.21.2
|
||||
# myst-parser
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
exceptiongroup==1.3.1
|
||||
exceptiongroup==1.3.0
|
||||
# via ipython
|
||||
executing==2.2.1
|
||||
# via stack-data
|
||||
@@ -64,7 +64,7 @@ gitpython==3.1.45
|
||||
# via rocm-docs-core
|
||||
greenlet==3.2.4
|
||||
# via sqlalchemy
|
||||
idna==3.11
|
||||
idna==3.10
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
@@ -72,7 +72,7 @@ importlib-metadata==8.7.0
|
||||
# via
|
||||
# jupyter-cache
|
||||
# myst-nb
|
||||
ipykernel==7.1.0
|
||||
ipykernel==6.30.1
|
||||
# via myst-nb
|
||||
ipython==8.37.0
|
||||
# via
|
||||
@@ -94,7 +94,7 @@ jupyter-client==8.6.3
|
||||
# via
|
||||
# ipykernel
|
||||
# nbclient
|
||||
jupyter-core==5.9.1
|
||||
jupyter-core==5.8.1
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-client
|
||||
@@ -106,7 +106,7 @@ markdown-it-py==3.0.0
|
||||
# myst-parser
|
||||
markupsafe==3.0.3
|
||||
# via jinja2
|
||||
matplotlib-inline==0.2.1
|
||||
matplotlib-inline==0.1.7
|
||||
# via
|
||||
# ipykernel
|
||||
# ipython
|
||||
@@ -137,11 +137,11 @@ parso==0.8.5
|
||||
# via jedi
|
||||
pexpect==4.9.0
|
||||
# via ipython
|
||||
platformdirs==4.5.0
|
||||
platformdirs==4.4.0
|
||||
# via jupyter-core
|
||||
prompt-toolkit==3.0.52
|
||||
# via ipython
|
||||
psutil==7.1.3
|
||||
psutil==7.1.0
|
||||
# via ipykernel
|
||||
ptyprocess==0.7.0
|
||||
# via pexpect
|
||||
@@ -163,7 +163,7 @@ pygments==2.19.2
|
||||
# sphinx
|
||||
pyjwt[crypto]==2.10.1
|
||||
# via pygithub
|
||||
pynacl==1.6.1
|
||||
pynacl==1.6.0
|
||||
# via pygithub
|
||||
python-dateutil==2.9.0.post0
|
||||
# via jupyter-client
|
||||
@@ -179,7 +179,7 @@ pyzmq==27.1.0
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-client
|
||||
referencing==0.37.0
|
||||
referencing==0.36.2
|
||||
# via
|
||||
# jsonschema
|
||||
# jsonschema-specifications
|
||||
@@ -187,9 +187,9 @@ requests==2.32.5
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.30.1
|
||||
# via -r requirements.in
|
||||
rpds-py==0.29.0
|
||||
rocm-docs-core==1.26.0
|
||||
# via -r docs/sphinx/requirements.in
|
||||
rpds-py==0.27.1
|
||||
# via
|
||||
# jsonschema
|
||||
# referencing
|
||||
@@ -230,13 +230,13 @@ sphinx-last-updated-by-git==0.3.8
|
||||
sphinx-notfound-page==1.1.0
|
||||
# via rocm-docs-core
|
||||
sphinx-reredirects==0.1.6
|
||||
# via -r requirements.in
|
||||
# via -r docs/sphinx/requirements.in
|
||||
sphinx-sitemap==2.9.0
|
||||
# via -r requirements.in
|
||||
# via -r docs/sphinx/requirements.in
|
||||
sphinxcontrib-applehelp==2.0.0
|
||||
# via sphinx
|
||||
sphinxcontrib-datatemplates==0.11.0
|
||||
# via -r requirements.in
|
||||
# via -r docs/sphinx/requirements.in
|
||||
sphinxcontrib-devhelp==2.0.0
|
||||
# via sphinx
|
||||
sphinxcontrib-htmlhelp==2.1.0
|
||||
@@ -249,13 +249,13 @@ sphinxcontrib-runcmd==0.2.0
|
||||
# via sphinxcontrib-datatemplates
|
||||
sphinxcontrib-serializinghtml==2.0.0
|
||||
# via sphinx
|
||||
sqlalchemy==2.0.44
|
||||
sqlalchemy==2.0.43
|
||||
# via jupyter-cache
|
||||
stack-data==0.6.3
|
||||
# via ipython
|
||||
tabulate==0.9.0
|
||||
# via jupyter-cache
|
||||
tomli==2.3.0
|
||||
tomli==2.2.1
|
||||
# via sphinx
|
||||
tornado==6.5.2
|
||||
# via
|
||||
|
||||
@@ -124,26 +124,3 @@
|
||||
#rocm-rn-components:has(tbody.rocm-components-runtimes td:hover) tr:hover > td {
|
||||
background-color: var(--pst-color-table-row-hover-bg);
|
||||
}
|
||||
|
||||
/* Left-align text + vertically center content for any table using this class */
|
||||
.table--middle-left {
|
||||
border-collapse: collapse; /* optional but typical for docs tables */
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.table--middle-left th,
|
||||
.table--middle-left td {
|
||||
text-align: left;
|
||||
vertical-align: middle !important; /* override Bootstrap/Sphinx defaults */
|
||||
padding: 0.5rem; /* optional: adjust to your spacing scale */
|
||||
}
|
||||
|
||||
/* Normalize paragraphs inside cells so margins don't disrupt centering */
|
||||
.table--middle-left th p,
|
||||
.table--middle-left td p {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
div.sd-row ul {
|
||||
padding-left: 2rem;
|
||||
}
|
||||
|
||||
@@ -107,6 +107,7 @@ System Management
|
||||
":doc:`rocminfo <rocminfo:index>`", "Reports system information"
|
||||
":doc:`ROCm SMI <rocm_smi_lib:index>`", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"
|
||||
":doc:`ROCm Validation Suite <rocmvalidationsuite:index>`", "Detects and troubleshoots common problems affecting AMD GPUs running in a high-performance computing environment"
|
||||
":doc:`Cluster Validation Suite <cvs:index>`", "CVS is a collection of test scripts that validate AMD AI clusters. Use CVS to verify cluster health, GPU/CPU node health, Host OS configuration checks, and NIC validations."
|
||||
|
||||
Performance
|
||||
^^^^^^^^^^^
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<manifest>
|
||||
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
|
||||
<default revision="refs/tags/rocm-7.1.0"
|
||||
remote="rocm-org"
|
||||
sync-c="true"
|
||||
sync-j="4" />
|
||||
<!--list of projects for ROCm-->
|
||||
<project name="ROCK-Kernel-Driver" />
|
||||
<project name="amdsmi" />
|
||||
<project name="rocm_bandwidth_test" />
|
||||
<project name="rocm-examples" />
|
||||
<!--HIP Projects-->
|
||||
<project name="HIPIFY" />
|
||||
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
|
||||
<project name="half" />
|
||||
<project name="llvm-project" />
|
||||
<project name="spirv-llvm-translator" />
|
||||
<!-- gdb projects -->
|
||||
<project name="ROCdbgapi" />
|
||||
<project name="ROCgdb" />
|
||||
<project name="rocr_debug_agent" />
|
||||
<!-- ROCm Libraries -->
|
||||
<project groups="mathlibs" name="AMDMIGraphX" />
|
||||
<project groups="mathlibs" name="MIVisionX" />
|
||||
<project groups="mathlibs" name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="composable_kernel" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipfort" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
<project groups="mathlibs" name="rocAL" />
|
||||
<project groups="mathlibs" name="rocALUTION" />
|
||||
<project groups="mathlibs" name="rocDecode" />
|
||||
<project groups="mathlibs" name="rocJPEG" />
|
||||
<!-- The following components have been migrated to rocm-libraries:
|
||||
hipBLAS-common hipBLAS hipBLASLt hipCUB
|
||||
hipFFT hipRAND hipSPARSE hipSPARSELt
|
||||
MIOpen rocBLAS rocFFT rocPRIM rocRAND
|
||||
rocSPARSE rocThrust Tensile -->
|
||||
<project groups="mathlibs" name="rocm-libraries" />
|
||||
<!-- The following components have been migrated to rocm-systems:
|
||||
aqlprofile clr hip hip-tests hipother
|
||||
rdc rocm-core rocm_smi_lib rocminfo rocprofiler-compute
|
||||
rocprofiler-register rocprofiler-sdk rocprofiler-systems
|
||||
rocprofiler rocr-runtime roctracer -->
|
||||
<project groups="mathlibs" name="rocm-systems" />
|
||||
<project groups="mathlibs" name="rocPyDecode" />
|
||||
<project groups="mathlibs" name="rocSHMEM" />
|
||||
<project groups="mathlibs" name="rocWMMA" />
|
||||
<project groups="mathlibs" name="rocm-cmake" />
|
||||
<project groups="mathlibs" name="rpp" />
|
||||
<project groups="mathlibs" name="TransferBench" />
|
||||
<!-- Projects for OpenMP-Extras -->
|
||||
<project name="aomp" path="openmp-extras/aomp" />
|
||||
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
|
||||
<project name="flang" path="openmp-extras/flang" />
|
||||
</manifest>
|
||||
@@ -1,60 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<manifest>
|
||||
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
|
||||
<default revision="refs/tags/rocm-7.1.1"
|
||||
remote="rocm-org"
|
||||
sync-c="true"
|
||||
sync-j="4" />
|
||||
<!--list of projects for ROCm-->
|
||||
<project name="ROCK-Kernel-Driver" />
|
||||
<project name="amdsmi" />
|
||||
<project name="rocm_bandwidth_test" />
|
||||
<project name="rocm-examples" />
|
||||
<!--HIP Projects-->
|
||||
<project name="HIPIFY" />
|
||||
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
|
||||
<project name="half" />
|
||||
<project name="llvm-project" />
|
||||
<project name="spirv-llvm-translator" />
|
||||
<!-- gdb projects -->
|
||||
<project name="ROCdbgapi" />
|
||||
<project name="ROCgdb" />
|
||||
<project name="rocr_debug_agent" />
|
||||
<!-- ROCm Libraries -->
|
||||
<project groups="mathlibs" name="AMDMIGraphX" />
|
||||
<project groups="mathlibs" name="MIVisionX" />
|
||||
<project groups="mathlibs" name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="composable_kernel" />
|
||||
<project groups="mathlibs" name="hipSOLVER" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipfort" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
<project groups="mathlibs" name="rocAL" />
|
||||
<project groups="mathlibs" name="rocALUTION" />
|
||||
<project groups="mathlibs" name="rocDecode" />
|
||||
<project groups="mathlibs" name="rocJPEG" />
|
||||
<!-- The following components have been migrated to rocm-libraries:
|
||||
hipBLAS-common hipBLAS hipBLASLt hipCUB
|
||||
hipFFT hipRAND hipSPARSE hipSPARSELt
|
||||
MIOpen rocBLAS rocFFT rocPRIM rocRAND
|
||||
rocSPARSE rocThrust Tensile -->
|
||||
<project groups="mathlibs" name="rocm-libraries" />
|
||||
<!-- The following components have been migrated to rocm-systems:
|
||||
aqlprofile clr hip hip-tests hipother
|
||||
rdc rocm-core rocm_smi_lib rocminfo rocprofiler-compute
|
||||
rocprofiler-register rocprofiler-sdk rocprofiler-systems
|
||||
rocprofiler rocr-runtime roctracer -->
|
||||
<project groups="mathlibs" name="rocm-systems" />
|
||||
<project groups="mathlibs" name="rocPyDecode" />
|
||||
<project groups="mathlibs" name="rocSHMEM" />
|
||||
<project groups="mathlibs" name="rocSOLVER" />
|
||||
<project groups="mathlibs" name="rocWMMA" />
|
||||
<project groups="mathlibs" name="rocm-cmake" />
|
||||
<project groups="mathlibs" name="rpp" />
|
||||
<project groups="mathlibs" name="TransferBench" />
|
||||
<!-- Projects for OpenMP-Extras -->
|
||||
<project name="aomp" path="openmp-extras/aomp" />
|
||||
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
|
||||
<project name="flang" path="openmp-extras/flang" />
|
||||
</manifest>
|
||||
Reference in New Issue
Block a user