mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-10 23:28:03 -05:00
Compare commits
24 Commits
mattwill-a
...
docs/7.1.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4e98d954af | ||
|
|
ad888e854d | ||
|
|
00df690446 | ||
|
|
b094c2ba1a | ||
|
|
64559321cb | ||
|
|
cd8d27ba40 | ||
|
|
e5c9382825 | ||
|
|
7a7c5b25a6 | ||
|
|
a1a33185b9 | ||
|
|
876da0b68b | ||
|
|
508cb9829e | ||
|
|
5e7a88f769 | ||
|
|
503d4491de | ||
|
|
a43cde5bd8 | ||
|
|
5a1494b7f5 | ||
|
|
b548b0edec | ||
|
|
472afbfa6f | ||
|
|
afd388aafb | ||
|
|
59135e73cd | ||
|
|
c4483379c6 | ||
|
|
fc3091be75 | ||
|
|
dfa04e5c21 | ||
|
|
2024aa839e | ||
|
|
45ab654f39 |
@@ -128,9 +128,6 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
cmakeVersion: '3.28.6'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
@@ -155,7 +152,6 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DGPU_TARGETS=${{ job.target }}
|
||||
-DAMDGPU_TARGETS=${{ job.target }}
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
|
||||
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
|
||||
@@ -196,9 +192,6 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
cmakeVersion: '3.28.6'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
@@ -224,7 +217,6 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DGPU_TARGETS=${{ job.target }}
|
||||
-DAMDGPU_TARGETS=${{ job.target }}
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
|
||||
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
|
||||
|
||||
@@ -1,29 +1,10 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: amdsmi
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -50,7 +31,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.os }}
|
||||
- job: amdsmi_build_${{ job.os }}
|
||||
pool:
|
||||
${{ if eq(job.os, 'ubuntu2404') }}:
|
||||
vmImage: 'ubuntu-24.04'
|
||||
@@ -74,7 +55,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
@@ -85,54 +65,50 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
componentName: ${{ parameters.componentName }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
# parameters:
|
||||
# aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
parameters:
|
||||
runRocminfo: false
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)'
|
||||
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: amdsmi_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: amdsmi_build_${{ job.os }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
parameters:
|
||||
runRocminfo: false
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: amdsmi
|
||||
testDir: '$(Agent.BuildDirectory)'
|
||||
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -1,29 +1,10 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: hipTensor
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -70,7 +51,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
- job: hipTensor_build_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -85,15 +66,12 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
extraBuildFlags: >-
|
||||
@@ -107,12 +85,9 @@ jobs:
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
@@ -120,47 +95,44 @@ jobs:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.target }}
|
||||
timeoutInMinutes: 90
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
|
||||
testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: hipTensor_test_${{ job.target }}
|
||||
timeoutInMinutes: 90
|
||||
dependsOn: hipTensor_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: hipTensor
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
|
||||
testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -1,29 +1,10 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: rocWMMA
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -85,11 +66,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.target }}
|
||||
- job: rocWMMA_build_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -104,7 +81,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
@@ -126,12 +102,9 @@ jobs:
|
||||
# gfx1030 not supported in documentation
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
@@ -139,45 +112,43 @@ jobs:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.target }}
|
||||
timeoutInMinutes: 350
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rocWMMA_test_${{ job.target }}
|
||||
timeoutInMinutes: 270
|
||||
dependsOn: rocWMMA_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rocWMMA
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -21,26 +21,11 @@ parameters:
|
||||
- libtbb-dev
|
||||
- libtiff-dev
|
||||
- libva-amdgpu-dev
|
||||
- libva2-amdgpu
|
||||
- mesa-amdgpu-va-drivers
|
||||
- libavcodec-dev
|
||||
- libavformat-dev
|
||||
- libavutil-dev
|
||||
- ninja-build
|
||||
- python3-pip
|
||||
- protobuf-compiler
|
||||
- libprotoc-dev
|
||||
- libopencv-dev
|
||||
- name: pipModules
|
||||
type: object
|
||||
default:
|
||||
- future==1.0.0
|
||||
- pytz==2022.1
|
||||
- numpy==1.23
|
||||
- google==3.0.0
|
||||
- protobuf==3.12.4
|
||||
- onnx==1.12.0
|
||||
- nnef==1.0.7
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
@@ -48,7 +33,6 @@ parameters:
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- half
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
@@ -63,9 +47,6 @@ parameters:
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rocm_smi_lib
|
||||
- rccl
|
||||
- rocAL
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
@@ -88,7 +69,6 @@ parameters:
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- half
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
@@ -103,9 +83,6 @@ parameters:
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rocm_smi_lib
|
||||
- rccl
|
||||
- rocAL
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
@@ -151,7 +128,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
@@ -251,6 +227,5 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -65,13 +65,6 @@ parameters:
|
||||
- pytest
|
||||
- pytest-cov
|
||||
- pytest-xdist
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- clr
|
||||
- llvm-project
|
||||
- ROCR-Runtime
|
||||
- rocprofiler-sdk
|
||||
- name: rocmTestDependencies
|
||||
type: object
|
||||
default:
|
||||
@@ -108,12 +101,10 @@ jobs:
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.target }}
|
||||
- ${{ build }}_${{ job.os }}_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
- name: ROCM_PATH
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
vmImage: ${{ variables.BASE_BUILD_POOL }}
|
||||
workspace:
|
||||
@@ -128,14 +119,6 @@ jobs:
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
extraBuildFlags: >-
|
||||
|
||||
@@ -63,7 +63,6 @@ parameters:
|
||||
libopenblas-dev: openblas-devel
|
||||
libopenmpi-dev: openmpi-devel
|
||||
libpci-dev: libpciaccess-devel
|
||||
libsimde-dev: simde-devel
|
||||
libssl-dev: openssl-devel
|
||||
# note: libstdc++-devel is in the base packages list
|
||||
libsystemd-dev: systemd-devel
|
||||
|
||||
@@ -35,8 +35,8 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
amdsmi:
|
||||
pipelineId: 376
|
||||
developBranch: develop
|
||||
pipelineId: 99
|
||||
developBranch: amd-staging
|
||||
hasGpuTarget: false
|
||||
aomp-extras:
|
||||
pipelineId: 111
|
||||
@@ -115,7 +115,7 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
hipTensor:
|
||||
pipelineId: 374
|
||||
pipelineId: 105
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
llvm-project:
|
||||
@@ -263,7 +263,7 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
rocWMMA:
|
||||
pipelineId: 370
|
||||
pipelineId: 109
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
rpp:
|
||||
|
||||
@@ -36,6 +36,7 @@ Andrej
|
||||
Arb
|
||||
Autocast
|
||||
autograd
|
||||
Backported
|
||||
BARs
|
||||
BatchNorm
|
||||
BLAS
|
||||
@@ -79,7 +80,6 @@ CX
|
||||
Cavium
|
||||
CentOS
|
||||
ChatGPT
|
||||
Cholesky
|
||||
CoRR
|
||||
Codespaces
|
||||
Commitizen
|
||||
@@ -138,7 +138,6 @@ ESXi
|
||||
EP
|
||||
EoS
|
||||
etcd
|
||||
equalto
|
||||
fas
|
||||
FBGEMM
|
||||
FiLM
|
||||
@@ -203,9 +202,11 @@ GenAI
|
||||
GenZ
|
||||
GitHub
|
||||
Gitpod
|
||||
hardcoded
|
||||
HBM
|
||||
HCA
|
||||
HGX
|
||||
HLO
|
||||
HIPCC
|
||||
hipDataType
|
||||
HIPExtension
|
||||
@@ -227,8 +228,6 @@ href
|
||||
Hyperparameters
|
||||
HybridEngine
|
||||
Huggingface
|
||||
Hunyuan
|
||||
HunyuanVideo
|
||||
IB
|
||||
ICD
|
||||
ICT
|
||||
@@ -280,7 +279,6 @@ LLM
|
||||
LLMs
|
||||
LLVM
|
||||
LM
|
||||
logsumexp
|
||||
LRU
|
||||
LSAN
|
||||
LSan
|
||||
@@ -332,8 +330,8 @@ MoEs
|
||||
Mooncake
|
||||
Mpops
|
||||
Multicore
|
||||
multihost
|
||||
Multithreaded
|
||||
mx
|
||||
MXFP
|
||||
MyEnvironment
|
||||
MyST
|
||||
@@ -520,8 +518,6 @@ TPS
|
||||
TPU
|
||||
TPUs
|
||||
TSME
|
||||
Taichi
|
||||
Taichi's
|
||||
Tagram
|
||||
TensileLite
|
||||
TensorBoard
|
||||
@@ -544,7 +540,6 @@ UAC
|
||||
UC
|
||||
UCC
|
||||
UCX
|
||||
ud
|
||||
UE
|
||||
UIF
|
||||
UMC
|
||||
@@ -856,7 +851,6 @@ pallas
|
||||
parallelization
|
||||
parallelizing
|
||||
param
|
||||
params
|
||||
parameterization
|
||||
passthrough
|
||||
pe
|
||||
@@ -903,7 +897,6 @@ querySelectorAll
|
||||
queueing
|
||||
qwen
|
||||
radeon
|
||||
rc
|
||||
rccl
|
||||
rdc
|
||||
rdma
|
||||
@@ -965,7 +958,6 @@ scalability
|
||||
scalable
|
||||
scipy
|
||||
seealso
|
||||
selectattr
|
||||
selectedTag
|
||||
sendmsg
|
||||
seqs
|
||||
@@ -1027,6 +1019,7 @@ uncacheable
|
||||
uncorrectable
|
||||
underoptimized
|
||||
unhandled
|
||||
unfused
|
||||
uninstallation
|
||||
unmapped
|
||||
unsqueeze
|
||||
@@ -1069,8 +1062,6 @@ writebacks
|
||||
wrreq
|
||||
wzo
|
||||
xargs
|
||||
xdit
|
||||
xDiT
|
||||
xGMI
|
||||
xPacked
|
||||
xz
|
||||
|
||||
192
CHANGELOG.md
192
CHANGELOG.md
@@ -4,194 +4,6 @@ This page is a historical overview of changes made to ROCm components. This
|
||||
consolidated changelog documents key modifications and improvements across
|
||||
different versions of the ROCm software stack and its components.
|
||||
|
||||
## ROCm 7.1.1
|
||||
|
||||
See the [ROCm 7.1.1 release notes](https://rocm.docs.amd.com/en/docs-7.1.1/about/release-notes.html#rocm-7-1-1-release-notes)
|
||||
for a complete overview of this release.
|
||||
|
||||
### **AMD SMI** (26.2.0)
|
||||
|
||||
#### Added
|
||||
|
||||
- Caching for repeated ASIC information calls.
|
||||
- The cache added to `amdsmi_get_gpu_asic_info` improves performance by avoiding redundant hardware queries.
|
||||
- The cache stores ASIC info for each GPU device with a configurable duration, defaulting to 10 seconds. Use the `AMDSMI_ASIC_INFO_CACHE_MS` environment variable for cache duration configuration for `amdsmi_get_gpu_asic_info` API calls.
|
||||
|
||||
- Support for GPU partition metrics.
|
||||
- Provides support for `xcp_metrics` v1.0 and extends support for v1.1 (dynamic metrics).
|
||||
- Added `amdsmi_get_gpu_partition_metrics_info`, which provides per XCP (partition) metrics.
|
||||
|
||||
- Support for displaying newer VRAM memory types in `amd-smi static --vram`.
|
||||
- The `amdsmi_get_gpu_vram_info()` API now supports detecting DDR5, LPDDR4, LPDDR5, and HBM3E memory types.
|
||||
|
||||
#### Changed
|
||||
|
||||
- Updated `amd-smi static --numa` socket affinity data structure. It now displays CPU affinity information in both hexadecimal bitmask format and expanded CPU core ranges, replacing the previous simplified socket enumeration approach.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
- Fixed incorrect topology weight calculations.
|
||||
- Out-of-bound writes caused corruption in the weights field.
|
||||
|
||||
- Fixed `amd-smi event` not respecting the Linux timeout command.
|
||||
|
||||
- Fixed an issue where `amdsmi_get_power_info` returned `AMDSMI_STATUS_API_FAILED`.
|
||||
- VMs were incorrectly reporting `AMDSMI_STATUS_API_FAILED` when unable to get the power cap within the `amdsmi_get_power_info`.
|
||||
- The API now returns `N/A` or `UINT_MAX` for values that can't be retrieved, instead of failing.
|
||||
|
||||
- Fixed output for `amd-smi xgmi -l --json`.
|
||||
|
||||
```{note}
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md#amd_smi_lib-for-rocm-711) for details, examples, and in-depth descriptions.
|
||||
```
|
||||
|
||||
### **Composable Kernel** (1.1.0)
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* Composable Kernel will adopt C++20 features in an upcoming ROCm release, updating the minimum compiler requirement to C++20. Ensure that your development environment meets this requirement to facilitate a seamless transition.
|
||||
|
||||
### **HIP** (7.1.1)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for the flag `hipHostRegisterIoMemory` in `hipHostRegister`, used to register I/O memory with HIP runtime so the GPU can access it.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Incorrect Compute Unit (CU) mask in logging. HIP runtime now correctly sets the field width for the output print operation. When logging is enabled via the environment variable `AMD_LOG_LEVEL`, the runtime logs the accurate CU mask.
|
||||
* A segmentation fault occurred when the dynamic queue management mechanism was enabled. HIP runtime now ensures GPU queues aren't `NULL` during marker submission, preventing crashes and improving robustness.
|
||||
* An error encountered on HIP tear-down after device reset in certain applications due to accessing stale memory objects. HIP runtime now properly releases memory associated with host calls, ensuring reliable device resets.
|
||||
* A race condition occurred in certain graph-related applications when pending asynchronous signal handlers referenced device memory that had already been released, leading to memory corruption. HIP runtime now uses a reference counting strategy to manage access to device objects in asynchronous event handlers, ensuring safe and reliable memory usage.
|
||||
|
||||
### **MIGraphX** (2.14.0)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed an error that resulted when running `make check` on systems running on a gfx1201 GPU.
|
||||
|
||||
### **RCCL** (2.27.7)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed a single-node data corruption issue in MSCCL on the AMD Instinct MI350X and MI355X GPUs for the LL protocol. This previously affected about two percent of the runs for single-node `AllReduce` with inputs smaller than 512 KiB.
|
||||
|
||||
### **rocBLAS** (5.1.1)
|
||||
|
||||
#### Changed
|
||||
* By default, rocBLAS will not use stream order allocation for its internal workspace. To enable this behavior, set the `ROCBLAS_STREAM_ORDER_ALLOC` environment variable.
|
||||
|
||||
### **ROCm Bandwidth Test** (2.6.0)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
- Test failure with error message `Cannot make canonical path`.
|
||||
- Healthcheck test failure with seg fault on gfx942.
|
||||
- Segmentation fault observed in `schmoo` and `one2all` when executed on `sgpu` setup.
|
||||
|
||||
#### Known issues
|
||||
|
||||
- `rocm-bandwidth-test` folder fails to be removed after driver uninstallation:
|
||||
* After running `amdgpu-uninstall`, the `rocm-bandwidth-test` folder and package are still present.
|
||||
* Workaround: Remove the package manually using:
|
||||
```
|
||||
sudo apt-get remove -y rocm-bandwidth-test
|
||||
```
|
||||
|
||||
### **ROCm Compute Profiler** (3.3.1)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for PC sampling of multi-kernel applications.
|
||||
* PC Sampling output instructions are displayed with the name of the kernel to which the individual instruction belongs.
|
||||
* Single kernel selection is supported so that the PC samples of the selected kernel can be displayed.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Roofline analysis now runs on GPU 0 by default instead of all GPUs.
|
||||
|
||||
#### Optimized
|
||||
|
||||
* Improved roofline benchmarking by updating the `flops_benchmark` calculation.
|
||||
|
||||
* Improved standalone roofline plots in profile mode (PDF output) and analyze mode (CLI and GUI visual plots):
|
||||
* Fixed the peak MFMA/VALU lines being cut off.
|
||||
* Cleaned up the overlapping roofline numeric values by moving them into the side legend.
|
||||
* Added AI points chart with respective values, cache level, and compute/memory bound status.
|
||||
* Added full kernel names to the symbol chart.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Resolved existing issues to improve stability.
|
||||
|
||||
### **ROCm Systems Profiler** (1.2.1)
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
- Fixed an issue of OpenMP Tools (OMPT) events, GPU performance counters, VA-API, MPI, and host events failing to be collected in the `rocpd` output.
|
||||
|
||||
### **ROCm Validation Suite** (1.3.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Support for different test levels with `-r` option for AMD Instinct MI3XXx GPUs.
|
||||
* Set compute type for DGEMM operations on AMD Instinct MI350X and MI355X GPUs.
|
||||
|
||||
### **rocSHMEM** (3.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* Allowed IPC, RO, and GDA backends to be selected at runtime.
|
||||
* GDA conduit for different NIC vendors:
|
||||
* Broadcom BNXT\_RE (Thor 2)
|
||||
* Mellanox MLX5 (IB and RoCE ConnectX-7)
|
||||
* New APIs:
|
||||
* `rocshmem_get_device_ctx`
|
||||
|
||||
#### Changed
|
||||
|
||||
* The following APIs have been deprecated:
|
||||
* `rocshmem_wg_init`
|
||||
* `rocshmem_wg_finalize`
|
||||
* `rocshmem_wg_init_thread`
|
||||
|
||||
* `rocshmem_ptr` can now return non-null pointer to a shared memory region when the IPC transport is available to reach that region. Previously, it would return a null pointer.
|
||||
* `ROCSHMEM_RO_DISABLE_IPC` is renamed to `ROCSHMEM_DISABLE_MIXED_IPC`.
|
||||
- This environment variable wasn't documented in earlier releases. It's now documented.
|
||||
|
||||
#### Removed
|
||||
|
||||
* rocSHMEM no longer requires rocPRIM and rocThrust as dependencies.
|
||||
* Removed MPI compile-time dependency.
|
||||
|
||||
#### Known issues
|
||||
|
||||
* Only a subset of rocSHMEM APIs are implemented for the GDA conduit.
|
||||
|
||||
### **rocWMMA** (2.1.0)
|
||||
|
||||
#### Added
|
||||
|
||||
* More unit tests to increase the code coverage.
|
||||
|
||||
#### Changed
|
||||
|
||||
* Increased compile timeout and improved visualization in `math-ci`.
|
||||
|
||||
#### Removed
|
||||
|
||||
* Absolute paths from the `RPATH` of sample and test binary files.
|
||||
|
||||
#### Resolved issues
|
||||
|
||||
* Fixed issues caused by HIP changes:
|
||||
* Removed the `.data` member from `HIP_vector_type`.
|
||||
* Broadcast constructor now only writes to the first vector element.
|
||||
* Fixed a bug related to `int32_t` usage in `hipRTC_gemm` for gfx942, caused by breaking changes in HIP.
|
||||
* Replaced `#pragma unroll` with `static for` to fix a bug caused by the upgraded compiler which no longer supports using `#pragma unroll` with template parameter indices.
|
||||
* Corrected test predicates for `BLK` and `VW` cooperative kernels.
|
||||
* Modified `compute_utils.sh` in `build-infra` to ensure rocWMMA is built with gfx1151 target for ROCm 7.0 and beyond.
|
||||
|
||||
## ROCm 7.1.0
|
||||
|
||||
See the [ROCm 7.1.0 release notes](https://rocm.docs.amd.com/en/docs-7.1.0/about/release-notes.html#rocm-7-1-0-release-notes)
|
||||
@@ -1734,8 +1546,8 @@ HIP runtime has the following functional improvements which improves runtime per
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* `__AMDGCN_WAVEFRONT_SIZE__` macro and HIP’s `warpSize` variable as `constexpr` are deprecated and will be disabled in a future release. Users are encouraged to update their code if needed to ensure future compatibility. For more information, see [AMDGCN_WAVEFRONT_SIZE deprecation](#amdgpu-wavefront-size-compiler-macro-deprecation).
|
||||
* The `roc-obj-ls` and `roc-obj-extract` tools are deprecated. To extract all Clang offload bundles into separate code objects use `llvm-objdump --offloading <file>`. For more information, see [Changes to ROCm Object Tooling](#changes-to-rocm-object-tooling).
|
||||
* `__AMDGCN_WAVEFRONT_SIZE__` macro and HIP’s `warpSize` variable as `constexpr` are deprecated and will be disabled in a future release. Users are encouraged to update their code if needed to ensure future compatibility. For more information, see [AMDGCN_WAVEFRONT_SIZE deprecation](https://rocm.docs.amd.com/en/docs-7.0.0/about/release-notes.html#amdgpu-wavefront-size-compiler-macro-deprecation).
|
||||
* The `roc-obj-ls` and `roc-obj-extract` tools are deprecated. To extract all Clang offload bundles into separate code objects use `llvm-objdump --offloading <file>`. For more information, see [Changes to ROCm Object Tooling](https://rocm.docs.amd.com/en/docs-7.0.0/about/release-notes.html#changes-to-rocm-object-tooling).
|
||||
|
||||
### **MIGraphX** (2.13.0)
|
||||
|
||||
|
||||
1438
RELEASE.md
1438
RELEASE.md
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<manifest>
|
||||
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
|
||||
<default revision="refs/tags/rocm-7.1.1"
|
||||
<default revision="refs/tags/rocm-7.1.0"
|
||||
remote="rocm-org"
|
||||
sync-c="true"
|
||||
sync-j="4" />
|
||||
@@ -25,7 +25,6 @@
|
||||
<project groups="mathlibs" name="MIVisionX" />
|
||||
<project groups="mathlibs" name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="composable_kernel" />
|
||||
<project groups="mathlibs" name="hipSOLVER" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipfort" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
@@ -46,7 +45,6 @@
|
||||
rocprofiler rocr-runtime roctracer -->
|
||||
<project groups="mathlibs" name="rocm-systems" />
|
||||
<project groups="mathlibs" name="rocPyDecode" />
|
||||
<project groups="mathlibs" name="rocSOLVER" />
|
||||
<project groups="mathlibs" name="rocSHMEM" />
|
||||
<project groups="mathlibs" name="rocWMMA" />
|
||||
<project groups="mathlibs" name="rocm-cmake" />
|
||||
|
||||
@@ -25,69 +25,69 @@ additional licenses. Please review individual repositories for more information.
|
||||
<!-- spellcheck-disable -->
|
||||
| Component | License |
|
||||
|:---------------------|:-------------------------|
|
||||
| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/LICENSE.md) |
|
||||
| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/LICENSE.txt) |
|
||||
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
|
||||
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
|
||||
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
|
||||
| [AQLprofile](https://github.com/ROCm/rocm-systems/tree/develop/projects/aqlprofile/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/aqlprofile/LICENSE.md) |
|
||||
| [AQLprofile](https://github.com/rocm/aqlprofile/) | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
|
||||
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
|
||||
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
|
||||
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
|
||||
| [HIP](https://github.com/ROCm/rocm-systems/tree/develop/projects/hip/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/hip/LICENSE.md) |
|
||||
| [hipamd](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/hipamd/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/hipamd/LICENSE.md) |
|
||||
| [hipBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblas/LICENSE.md) |
|
||||
| [hipBLASLt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/LICENSE.md) |
|
||||
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/amd-staging/LICENSE.txt) |
|
||||
| [hipamd](https://github.com/ROCm/clr/tree/amd-staging/hipamd) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/hipamd/LICENSE.txt) |
|
||||
| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
|
||||
| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
|
||||
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
|
||||
| [hipCUB](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub/) | [Custom](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipcub/LICENSE.txt) |
|
||||
| [hipFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/LICENSE.md) |
|
||||
| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
|
||||
| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
|
||||
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
|
||||
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
|
||||
| [hipRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiprand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiprand/LICENSE.md) |
|
||||
| [hipSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsolver/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsolver/LICENSE.md) |
|
||||
| [hipSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparse/LICENSE.md) |
|
||||
| [hipSPARSELt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparselt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparselt/LICENSE.md) |
|
||||
| [hipTensor](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiptensor/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiptensor/LICENSE) |
|
||||
| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
|
||||
| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
|
||||
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
|
||||
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
|
||||
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
|
||||
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
|
||||
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
|
||||
| [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
|
||||
| [MIOpen](https://github.com/ROCm/rocm-libraries/tree/develop/projects/miopen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
|
||||
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
|
||||
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
|
||||
| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
|
||||
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
|
||||
| [rocBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocblas/LICENSE.md) |
|
||||
| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
|
||||
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
|
||||
| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
|
||||
| [rocFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/LICENSE.md) |
|
||||
| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
|
||||
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v3.0](https://github.com/ROCm/ROCgdb/blob/amd-staging/COPYING3) |
|
||||
| [rocJPEG](https://github.com/ROCm/rocJPEG/) | [MIT](https://github.com/ROCm/rocJPEG/blob/develop/LICENSE) |
|
||||
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
|
||||
| [rocminfo](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocminfo/License.txt) |
|
||||
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
|
||||
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [MIT](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
|
||||
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
|
||||
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
|
||||
| [ROCm-Core](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-core/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-core/LICENSE.md) |
|
||||
| [ROCm Compute Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-compute/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute/LICENSE.md) |
|
||||
| [ROCm Data Center (RDC)](https://github.com/ROCm/rocm-systems/tree/develop/projects/rdc/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rdc/LICENSE.md) |
|
||||
| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
|
||||
| [ROCm Compute Profiler](https://github.com/ROCm/rocprofiler-compute) | [MIT](https://github.com/ROCm/rocprofiler-compute/blob/amd-staging/LICENSE) |
|
||||
| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE.md) |
|
||||
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
|
||||
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/opencl/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/opencl/LICENSE.md) |
|
||||
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/amd-staging/opencl) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/opencl/LICENSE.txt) |
|
||||
| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
|
||||
| [ROCm SMI Lib](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-smi-lib/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-smi-lib/LICENSE.md) |
|
||||
| [ROCm Systems Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-systems/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-systems/LICENSE.md) |
|
||||
| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/LICENSE.md) |
|
||||
| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE.md) |
|
||||
| [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
|
||||
| [rocPRIM](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocprim/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocprim/LICENSE.md) |
|
||||
| [ROCProfiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler/LICENSE.md) |
|
||||
| [ROCprofiler-SDK](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-sdk/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-sdk/LICENSE.md) |
|
||||
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
|
||||
| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE.md) |
|
||||
| [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
|
||||
| [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
|
||||
| [rocRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocrand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocrand/LICENSE.md) |
|
||||
| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
|
||||
| [ROCr Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
|
||||
| [ROCR-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocr-runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocr-runtime/LICENSE.txt) |
|
||||
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/amd-staging/LICENSE.txt) |
|
||||
| [rocSHMEM](https://github.com/ROCm/rocSHMEM/) | [MIT](https://github.com/ROCm/rocSHMEM/blob/develop/LICENSE.md) |
|
||||
| [rocSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsolver/) | [BSD-2-Clause](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsolver/LICENSE.md) |
|
||||
| [rocSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsparse/LICENSE.md) |
|
||||
| [rocThrust](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust/) | [Apache 2.0](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocthrust/LICENSE) |
|
||||
| [ROCTracer](https://github.com/ROCm/rocm-systems/tree/develop/projects/roctracer/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/roctracer/LICENSE.md) |
|
||||
| [rocWMMA](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocwmma/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocwmma/LICENSE.md) |
|
||||
| [Tensile](https://github.com/ROCm/rocm-libraries/tree/develop/shared/tensile/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/shared/tensile/LICENSE.md) |
|
||||
| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
|
||||
| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
|
||||
| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
|
||||
| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
|
||||
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
|
||||
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
|
||||
| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |
|
||||
|
||||
Open sourced ROCm components are released via public GitHub
|
||||
|
||||
@@ -1,137 +1,136 @@
|
||||
ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility-past-60]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
|
||||
,"Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
|
||||
,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
|
||||
,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
|
||||
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility-past-60]_,gfx950,gfx950,gfx950,gfx950,,,,,,,,,,,,,,,,,,
|
||||
,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,,,,,,,,,,,,,,,
|
||||
,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,,,,,,,,,,,,,,,
|
||||
,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,,,,,,,,,,,,,,,
|
||||
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942, gfx942, gfx942, gfx942, gfx942, gfx942, gfx942
|
||||
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
Thrust,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,2.1.0,2.1.0,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025444,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
ROCm Version,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 10.0 [#rhel-10-702-past-60]_, 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 10.0 [#rhel-10-702-past-60]_, 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,SLES 15 SP7 [#sles-710-past-60]_,SLES 15 SP7 [#sles-db-700-past-60]_,SLES 15 SP7 [#sles-db-700-past-60]_,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 10, 9, 8 [#ol-710-mi300x-past-60]_","Oracle Linux 10, 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
|
||||
,"Debian 13 [#db-710-mi300x-past-60]_, 12 [#db12-710-past-60]_","Debian 13 [#db-mi300x-past-60]_, 12 [#sles-db-700-past-60]_",Debian 12 [#sles-db-700-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
|
||||
,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,,,,,,,,,,,,
|
||||
,Rocky Linux 9 [#rl-700-past-60]_,Rocky Linux 9 [#rl-700-past-60]_,Rocky Linux 9 [#rl-700-past-60]_,,,,,,,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
|
||||
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os-710-past-60]_,gfx950 [#mi350x-os-700-past-60]_,gfx950 [#mi350x-os-700-past-60]_,,,,,,,,,,,,,,,,,,
|
||||
,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942 [#mi325x-os-710past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
|
||||
,gfx90a [#mi200x-os-past-60]_,gfx90a [#mi200x-os-past-60]_,gfx90a [#mi200x-os-past-60]_,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908 [#mi100-710-os-past-60]_,gfx908 [#mi100-os-past-60]_,gfx908 [#mi100-os-past-60]_,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
Thrust,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocAL <rocal:index>`,2.4.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,2.1.0,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.1.0,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
|
||||
|
@@ -22,18 +22,18 @@ compatibility and system requirements.
|
||||
.. container:: format-big-table
|
||||
|
||||
.. csv-table::
|
||||
:header: "ROCm Version", "7.1.1", "7.1.0", "6.4.0"
|
||||
:header: "ROCm Version", "7.1.0", "7.0.2", "6.4.0"
|
||||
:stub-columns: 1
|
||||
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
|
||||
,"RHEL 10.1, 10.0, 9.7, |br| 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.5, 9.4"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP6
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8"
|
||||
,"Debian 13, 12","Debian 13, 12",Debian 12
|
||||
,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0
|
||||
,Rocky Linux 9,Rocky Linux 9,
|
||||
,"RHEL 10.0 [#rhel-10-702]_, 9.6 [#rhel-10-702]_, 9.4 [#rhel-94-702]_","RHEL 10.0 [#rhel-10-702]_, 9.6 [#rhel-10-702]_, 9.4 [#rhel-94-702]_","RHEL 9.5, 9.4"
|
||||
,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_,RHEL 8.10
|
||||
,SLES 15 SP7 [#sles-710]_,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP6
|
||||
,"Oracle Linux 10, 9, 8 [#ol-710-mi300x]_","Oracle Linux 10, 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-mi300x]_"
|
||||
,"Debian 13 [#db-710-mi300x]_, 12 [#db12-710]_","Debian 13 [#db-mi300x]_, 12 [#sles-db-700]_",Debian 12 [#single-node]_
|
||||
,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_
|
||||
,Rocky Linux 9 [#rl-700]_,Rocky Linux 9 [#rl-700]_,
|
||||
,.. _architecture-support-compatibility-matrix:,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
|
||||
,CDNA3,CDNA3,CDNA3
|
||||
@@ -43,20 +43,20 @@ compatibility and system requirements.
|
||||
,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix:,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility]_,gfx950,gfx950,
|
||||
,gfx1201,gfx1201,
|
||||
,gfx1200,gfx1200,
|
||||
,gfx1101,gfx1101,
|
||||
,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942
|
||||
,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os-710]_,gfx950 [#mi350x-os-700]_,
|
||||
,gfx1201 [#RDNA-OS-700]_,gfx1201 [#RDNA-OS-700]_,
|
||||
,gfx1200 [#RDNA-OS-700]_,gfx1200 [#RDNA-OS-700]_,
|
||||
,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,
|
||||
,gfx1100 [#RDNA-OS-700]_,gfx1100 [#RDNA-OS-700]_,gfx1100
|
||||
,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030
|
||||
,gfx942 [#mi325x-os-710]_ [#mi300x-os]_ [#mi300A-os]_,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942
|
||||
,gfx90a [#mi200x-os]_,gfx90a [#mi200x-os]_,gfx90a
|
||||
,gfx908 [#mi100-710-os]_,gfx908 [#mi100-os]_,gfx908
|
||||
,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.6, 2.5, 2.4, 2.3"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.4.35
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.6, 2.5, 2.4, 2.3"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.18.1, 2.17.1, 2.16.2"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.6.0,0.4.35
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,b5997
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0
|
||||
@@ -66,71 +66,71 @@ compatibility and system requirements.
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0
|
||||
,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
|
||||
Thrust,2.8.5,2.8.5,2.5.0
|
||||
CUB,2.8.5,2.8.5,2.5.0
|
||||
Thrust,2.8.5,2.6.0,2.5.0
|
||||
CUB,2.8.5,2.6.0,2.5.0
|
||||
,,,
|
||||
DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","30.20.0 [#mi325x_KVM]_, 30.10.2, |br| 30.10.1 [#driver_patch]_, 30.10, 6.4.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.0 [#mi325x_KVM]_, 30.10.2, |br| 30.10.1 [#driver_patch]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.12.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.4.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.2.0
|
||||
:doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.2.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,0.10.0
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,0.8.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.3.1
|
||||
:doc:`RPP <rpp:index>`,2.1.0,2.1.0,1.9.10
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.13.0,2.12.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.0,3.4.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.3.0,3.2.0
|
||||
:doc:`rocAL <rocal:index>`,2.4.0,2.3.0,2.2.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.0.0,0.10.0
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.1.0,0.8.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.6.0,0.3.1
|
||||
:doc:`RPP <rpp:index>`,2.1.0,2.0.0,1.9.10
|
||||
,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.22.3
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.26.6,2.22.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.0
|
||||
,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,2.4.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,0.12.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.18
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.6.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,2.12.0
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,2.4.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,3.2.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.3
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,3.2.2
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,4.4.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.32
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,3.3.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.28.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.0.2,2.4.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.0.0,0.12.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.20,1.0.18
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.0,0.6.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.0.0,2.12.0
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.0.0,2.4.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.0.1,3.2.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.4,0.2.3
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.0,3.2.2
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.0,5.0.2,4.4.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.34,1.0.32
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.0.0,3.3.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.30.1,3.28.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.0.2,3.4.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
|
||||
,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`hipCUB <hipcub:index>`,4.1.0,4.0.0,3.4.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,3.3.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.0.1,3.4.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.0.0,3.3.0
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,6.4.0
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.25424,7.0.51831,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.0.2,6.4.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
|
||||
,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,25.3.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,0.3.0
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.1.0,26.0.2,25.3.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.1.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0
|
||||
,,,
|
||||
PERFORMANCE TOOLS,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.60400
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.2.3,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.1.1,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70100,2.0.70002,2.0.60400
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.60400
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70100,4.1.70002,4.1.60400
|
||||
,,,
|
||||
DEVELOPMENT TOOLS,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0
|
||||
@@ -143,20 +143,46 @@ compatibility and system requirements.
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix:,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.025444,20.0.025425,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.025444,20.0.025425,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025444,20.0.025425,19.0.0.25133
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.025425,20.0.0.25385,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.025425,20.0.0.25385,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.025425,20.0.0.25385,19.0.0.25133
|
||||
,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.1.52802,7.1.25424,6.4.43482
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.25424,7.0.51831,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.1.25424,7.0.51831,6.4.43482
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
|
||||
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#os-compatibility] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`_, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`_, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`_.
|
||||
.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`_, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`_, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`_.
|
||||
.. [#rhel-10-702] RHEL 10.0 and RHEL 9.6 are supported on all listed :ref:`supported_GPUs` except AMD Radeon PRO V620 GPU.
|
||||
.. [#rhel-94-702] RHEL 9.4 is supported on all AMD Instinct GPUs listed under :ref:`supported_GPUs`.
|
||||
.. [#rhel-700] RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#sles-710] **For ROCm 7.1.x** - SLES 15 SP7 is supported only on AMD Instinct MI325X, MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#sles-db-700] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#ol-710-mi300x] **For ROCm 7.1.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#ol-700-mi300x] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#ol-mi300x] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#db-710-mi300x] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI325X and MI300X GPUs.
|
||||
.. [#db12-710] **For ROCm 7.1.x** - Debian 12 is supported only on AMD Instinct MI325X, MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#db-mi300x] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#az-mi300x] Starting ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
|
||||
.. [#rl-700] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
|
||||
.. [#single-node] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
|
||||
.. [#mi350x-os-710] AMD Instinct MI355X (gfx950) and MI350X (gfx950) GPUs supports all listed :ref:`supported_distributions` except RHEL 8.10, SLES 15 SP7, Debian 12, Debian 13, Rocky 9, Azure Linux 3.0, and Oracle Linux 8.
|
||||
.. [#mi350x-os-700] AMD Instinct MI355X (gfx950) and MI350X (gfx950) GPUs only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, Oracle Linux 10, and Oracle Linux 9.
|
||||
.. [#RDNA-OS-700] **For ROCm 7.0.x** - AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 9060 (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, and RHEL 9.6.
|
||||
.. [#rd-v710] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) GPUs only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and Azure Linux 3.0.
|
||||
.. [#rd-v620] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) GPUs only supports Ubuntu 24.04.3 and Ubuntu 22.04.5.
|
||||
.. [#mi325x-os-710] **For ROCm 7.1.x** - AMD Instinct MI325X GPUs (gfx942) supports all listed :ref:`supported_distributions` except RHEL 8.10, Rocky 9, Azure Linux 3.0, and Oracle Linux 8.
|
||||
.. [#mi325x-os] **For ROCm 7.0.x** - AMD Instinct MI325X GPUs (gfx942) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#mi300x-os] **Starting ROCm 7.0.x** - AMD Instinct MI300X GPUs (gfx942) supports all listed :ref:`supported_distributions`.
|
||||
.. [#mi300A-os] **Starting ROCm 7.0.x** - AMD Instinct MI300A GPUs (gfx942) supports all listed :ref:`supported_distributions` except on Debian 13, Azure Linux 3.0, Oracle Linux 10, Oracle Linux 9, and Oracle Linux 8.
|
||||
.. [#mi200x-os] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
|
||||
.. [#mi100-710-os] **For ROCM 7.1.x** - AMD Instinct MI100 GPUs (gfx908) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, and SLES 15 SP7.
|
||||
.. [#mi100-os] **For ROCm 7.0.x** - AMD Instinct MI100 GPUs (gfx908) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
|
||||
.. [#tf-mi350] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#dgl_compat] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
.. [#mi325x_KVM] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
@@ -164,6 +190,7 @@ compatibility and system requirements.
|
||||
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
|
||||
.. _OS-kernel-versions:
|
||||
|
||||
Operating systems, kernel and Glibc versions
|
||||
@@ -182,11 +209,9 @@ Use this lookup table to confirm which operating system and kernel versions are
|
||||
,,
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.1, 6.12.0-124, 2.39
|
||||
,10.0, 6.12.0-55, 2.39
|
||||
`Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.0, 6.12.0-55, 2.39
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.7, 5.14.0-611, 2.34
|
||||
,9.6, 5.14.0-570, 2.34
|
||||
`Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14.0-570, 2.34
|
||||
,9.5, 5.14+, 2.34
|
||||
,9.4, 5.14.0-427, 2.34
|
||||
,,
|
||||
@@ -238,14 +263,49 @@ Expand for full historical view of:
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see :ref:`supported_distributions` and select the required ROCm version for version specific support.
|
||||
.. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see :ref:`supported_GPUs` and select the required ROCm version for version specific support.
|
||||
.. [#rhel-10-702-past-60] RHEL 10.0 and RHEL 9.6 are supported on all listed :ref:`supported_GPUs` except AMD Radeon PRO V620 GPU.
|
||||
.. [#rhel-94-702-past-60] RHEL 9.4 is supported on all AMD Instinct GPUs listed under :ref:`supported_GPUs`.
|
||||
.. [#rhel-700-past-60] **For ROCm 7.0.x** - RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#sles-710-past-60] **For ROCm 7.1.x** - SLES 15 SP7 is supported only on AMD Instinct MI325X, MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#sles-db-700-past-60] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#ol-710-mi300x-past-60] **For ROCm 7.1.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#ol-700-mi300x-past-60] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#mi300x-past-60] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#db-710-mi300x-past-60] **For ROCm 7.1.x** - Debian 13 is supported only on AMD Instinct MI325X and MI300X GPUs.
|
||||
.. [#db12-710-past-60] **For ROCm 7.1.x** - Debian 12 is supported only on AMD Instinct MI325X, MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#db-mi300x-past-60] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
|
||||
.. [#az-mi300x-past-60] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
|
||||
.. [#az-mi300x-630-past-60] **Prior ROCm 6.4.0**- Azure Linux 3.0 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#rl-700-past-60] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
|
||||
.. [#mi350x-os-710-past-60] **For ROCm 7.1.x** - AMD Instinct MI355X (gfx950) and MI350X (gfx950) GPUs supports all listed :ref:`supported_distributions` except RHEL 8.10, SLES 15 SP7, Debian 12, Rocky 9, Azure Linux 3.0, and Oracle Linux 8.
|
||||
.. [#mi350x-os-700-past-60] **For ROCm 7.0.x** - AMD Instinct MI355X (gfx950) and MI350X (gfx950) GPUs only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and Oracle Linux 9.
|
||||
.. [#RDNA-OS-700-past-60] **Starting ROCm 7.0.x** AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 9060 (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#RDNA-OS-past-60] **Prior ROCm 7.0.0** - Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) only supports Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#rd-v710-past-60] **Starting ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and Azure Linux 3.0.
|
||||
.. [#rd-v620-past-60] **Starting ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) only supports Ubuntu 24.04.3 and Ubuntu 22.04.5.
|
||||
.. [#mi325x-os-710past-60] **For ROCm 7.1.x** - AMD Instinct MI325X GPU (gfx942) supports all listed :ref:`supported_distributions` except RHEL 8.10, Rocky 9, Azure Linux 3.0, and Oracle Linux 8.
|
||||
.. [#mi325x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI325X GPU (gfx942) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#mi300x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300X GPU (gfx942) supports all listed :ref:`supported_distributions`.
|
||||
.. [#mi300A-os-past-60] **Starting ROCm 7.0.x** - AMD Instinct MI300A GPUs (gfx942) supports all listed :ref:`supported_distributions` except on Debian 13, Azure Linux 3.0, Oracle Linux 10, Oracle Linux 9, and Oracle Linux 8.
|
||||
.. [#mi200x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
|
||||
.. [#mi100-710-os-past-60] **For ROCM 7.1.x** - AMD Instinct MI100 GPUs (gfx908) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, and SLES 15 SP7.
|
||||
.. [#mi100-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI100 GPU (gfx908) only supports Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
|
||||
.. [#7700XT-OS-past-60] **Prior to ROCm 7.0.0** - Radeon RX 7700 XT (gfx1101) only supports Ubuntu 24.04.2 and RHEL 9.6.
|
||||
.. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_622-past-60] **For ROCm 6.2.2** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_621-past-60] **For ROCm 6.2.1** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_620-past-60] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4.
|
||||
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
|
||||
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
|
||||
.. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#verl_compat-past-60] verl is supported only on ROCm 7.0.0 and 6.2.0.
|
||||
.. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
|
||||
.. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
|
||||
.. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
|
||||
|
||||
@@ -58,7 +58,7 @@ between JAX Plugin–PJRT and JAX/JAXLIB.
|
||||
- ROCm
|
||||
* - 0.7.1
|
||||
- 0.7.1
|
||||
- 7.1.1, 7.1.0
|
||||
- 7.1.0
|
||||
* - 0.6.0
|
||||
- 0.6.2, 0.6.0
|
||||
- 7.0.2, 7.0.1, 7.0.0
|
||||
@@ -269,6 +269,33 @@ For a complete and up-to-date list of JAX public modules (for example, ``jax.num
|
||||
JAX API modules are maintained by the JAX project and is subject to change.
|
||||
Refer to the official Jax documentation for the most up-to-date information.
|
||||
|
||||
Key features and enhancements for ROCm 7.1
|
||||
===============================================================================
|
||||
|
||||
- Enabled compilation of multihost HLO runner Python bindings.
|
||||
|
||||
- Backported multihost HLO runner bindings and some related changes to
|
||||
:code:`FunctionalHloRunner`.
|
||||
|
||||
- Added :code:`requirements_lock_3_12` to enable building for Python 3.12.
|
||||
|
||||
- Removed hardcoded NHWC convolution layout for ``fp16`` precision to address the performance drops for ``fp16`` precision on gfx12xx GPUs.
|
||||
|
||||
|
||||
- ROCprofiler-SDK integration:
|
||||
|
||||
- Integrated ROCprofiler-SDK (v3) to XLA to improve profiling of GPU events,
|
||||
support both time-based and step-based profiling.
|
||||
|
||||
- Added unit tests for :code:`rocm_collector` and :code:`rocm_tracer`.
|
||||
|
||||
- Added Triton unsupported conversion from ``f8E4M3FNUZ`` to ``fp16`` with
|
||||
rounding mode.
|
||||
|
||||
- Introduced :code:`CudnnFusedConvDecomposer` to revert fused convolutions
|
||||
when :code:`ConvAlgorithmPicker` fails to find a fused algorithm, and removed
|
||||
unfused fallback paths from :code:`RocmFusedConvRunner`.
|
||||
|
||||
Key features and enhancements for ROCm 7.0
|
||||
===============================================================================
|
||||
|
||||
|
||||
@@ -349,7 +349,7 @@ with ROCm.
|
||||
you need to explicitly move audio data (waveform tensor) to GPU using
|
||||
``.to('cuda')``.
|
||||
|
||||
* - `torchtune <https://meta-pytorch.org/torchtune/stable/index.html>`_
|
||||
* - `torchtune <https://docs.pytorch.org/torchtune/stable/index.html>`_
|
||||
- PyTorch-native library designed for fine-tuning large language models
|
||||
(LLMs). Provides supports the full fine-tuning workflow and offers
|
||||
compatibility with popular production inference systems.
|
||||
@@ -366,7 +366,7 @@ with ROCm.
|
||||
constructing flexible and performant data pipelines, with features still
|
||||
in prototype stage.
|
||||
|
||||
* - `torchrec <https://meta-pytorch.org/torchrec/>`_
|
||||
* - `torchrec <https://docs.pytorch.org/torchrec/>`_
|
||||
- PyTorch domain library for common sparsity and parallelism primitives
|
||||
needed for large-scale recommender systems, enabling authors to train
|
||||
models with large embedding tables shared across many GPUs.
|
||||
@@ -399,28 +399,6 @@ with ROCm.
|
||||
|
||||
**Note:** Only official release exists.
|
||||
|
||||
Key features and enhancements for PyTorch 2.9 with ROCm 7.1.1
|
||||
================================================================================
|
||||
- Scaled Dot Product Attention (SDPA) upgraded to use AOTriton version 0.11b.
|
||||
|
||||
- Default hipBLASLt support enabled for gfx908 architecture on ROCm 6.3 and later.
|
||||
|
||||
- MIOpen now supports channels last memory format for 3D convolutions and batch normalization.
|
||||
|
||||
- NHWC convolution operations in MIOpen optimized by eliminating unnecessary transpose operations.
|
||||
|
||||
- Improved tensor.item() performance by removing redundant synchronization.
|
||||
|
||||
- Enhanced performance for element-wise operations and reduction kernels.
|
||||
|
||||
- Added support for grouped GEMM operations through fbgemm_gpu generative AI components.
|
||||
|
||||
- Resolved device error in Inductor when using CUDA graph trees with HIP.
|
||||
|
||||
- Corrected logsumexp scaling in AOTriton-based SDPA implementation.
|
||||
|
||||
- Added stream graph capture status validation in memory copy synchronization functions.
|
||||
|
||||
Key features and enhancements for PyTorch 2.8 with ROCm 7.1
|
||||
================================================================================
|
||||
|
||||
|
||||
@@ -1,99 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Taichi compatibility
|
||||
:keywords: GPU, Taichi, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
Taichi compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`Taichi <https://www.taichi-lang.org/>`_ is an open-source, imperative, and parallel
|
||||
programming language designed for high-performance numerical computation.
|
||||
Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate
|
||||
compute-intensive Python code by compiling it to native GPU or CPU instructions.
|
||||
|
||||
Taichi is widely used across various domains, including real-time physical simulation,
|
||||
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
|
||||
visual effects in film and gaming, and general-purpose computing.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of Taichi is maintained in the official `https://github.com/ROCm/taichi
|
||||
<https://github.com/ROCm/taichi>`__ repository, which differs from the
|
||||
`https://github.com/taichi-dev/taichi <https://github.com/taichi-dev/taichi>`__ upstream repository.
|
||||
|
||||
- To get started and install Taichi on ROCm, use the prebuilt :ref:`Docker image <taichi-docker-compat>`,
|
||||
which includes ROCm, Taichi, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/taichi-dev/taichi>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Taichi is supported on `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI250X, MI210X (with the exception of Taichi’s GPU rendering system, CGUI)
|
||||
- **Upcoming Support**: AMD Instinct™ MI300X
|
||||
|
||||
.. _taichi-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Accelerating Parallel Programming in Python with Taichi Lang on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/taichi/README.html>`__
|
||||
blog highlights Taichi as an open-source programming language designed for high-performance
|
||||
numerical computation, particularly in domains like real-time physical simulation,
|
||||
artificial intelligence, computer vision, robotics, and visual effects. Taichi
|
||||
is embedded in Python and uses just-in-time (JIT) compilation frameworks like
|
||||
LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility
|
||||
of Taichi in enabling complex simulations and numerical algorithms, making
|
||||
it ideal for developers working on compute-intensive tasks. Developers are
|
||||
encouraged to follow recommended coding patterns and utilize Taichi decorators
|
||||
for performance optimization, with examples available in the `https://github.com/ROCm/taichi_examples
|
||||
<https://github.com/ROCm/taichi_examples>`_ repository. Prebuilt Docker images
|
||||
integrating ROCm, PyTorch, and Taichi are provided for simplified installation
|
||||
and deployment, making it easier to leverage Taichi for advanced computational workloads.
|
||||
|
||||
.. _taichi-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest Taichi version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Taichi
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b1_rocm6.3.2_ubuntu22.04_py3.10.12/images/sha256-e016964a751e6a92199032d23e70fa3a564fff8555afe85cd718f8aa63f11fc6"><i class="fab fa-docker fa-lg"></i> rocm/taichi</a>
|
||||
- `6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_
|
||||
- `1.8.0b1 <https://github.com/taichi-dev/taichi>`_
|
||||
- 22.04
|
||||
- `3.10.12 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
@@ -136,7 +136,7 @@ The following section maps supported data types and GPU-accelerated TensorFlow
|
||||
features to their minimum supported ROCm and TensorFlow versions.
|
||||
|
||||
Data types
|
||||
---------------
|
||||
-----------------
|
||||
|
||||
The data type of a tensor is specified using the ``dtype`` attribute or
|
||||
argument, and TensorFlow supports a wide range of data types for different use
|
||||
@@ -254,7 +254,7 @@ are as follows:
|
||||
- 1.7
|
||||
|
||||
Features
|
||||
---------------
|
||||
-----------------
|
||||
|
||||
This table provides an overview of key features in TensorFlow and their
|
||||
availability in ROCm.
|
||||
@@ -346,7 +346,7 @@ availability in ROCm.
|
||||
- 1.9.2
|
||||
|
||||
Distributed library features
|
||||
-----------------------------------
|
||||
-------------------------------------
|
||||
|
||||
Enables developers to scale computations across multiple devices on a single machine or
|
||||
across multiple machines.
|
||||
|
||||
@@ -31,7 +31,7 @@ Support overview
|
||||
- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`,
|
||||
which includes ROCm, verl, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__
|
||||
@@ -40,8 +40,7 @@ Support overview
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
verl is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and
|
||||
`ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
|
||||
verl is supported on `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
@@ -58,7 +57,7 @@ Use cases and recommendations
|
||||
GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog. The blog post outlines how the Volcano Engine Reinforcement Learning
|
||||
(verl) framework integrates with the AMD ROCm platform to optimize training on
|
||||
AMD Instinct™ GPUs. The guide details the process of building a Docker image,
|
||||
Instinct™ MI300X GPUs. The guide details the process of building a Docker image,
|
||||
setting up single-node and multi-node training environments, and highlights
|
||||
performance benchmarks demonstrating improved throughput and convergence accuracy.
|
||||
This resource serves as a comprehensive starting point for deploying verl on AMD GPUs,
|
||||
@@ -80,20 +79,12 @@ The following table shows verl on ROCm support for GPU-accelerated modules.
|
||||
- ROCm version
|
||||
* - ``FSDP``
|
||||
- Training engine
|
||||
-
|
||||
* 0.6.0
|
||||
* 0.3.0.post0
|
||||
-
|
||||
* 7.0.0
|
||||
* 6.2.0
|
||||
- 0.3.0.post0
|
||||
- 6.2.0
|
||||
* - ``vllm``
|
||||
- Inference engine
|
||||
-
|
||||
* 0.6.0
|
||||
* 0.3.0.post0
|
||||
-
|
||||
* 7.0.0
|
||||
* 6.2.0
|
||||
- 0.3.0.post0
|
||||
- 6.2.0
|
||||
|
||||
.. _verl-docker-compat:
|
||||
|
||||
@@ -104,45 +95,28 @@ Docker image compatibility
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
AMD validates and publishes ready-made `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest verl version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- verl
|
||||
- Ubuntu
|
||||
- PyTorch
|
||||
- Python
|
||||
- vllm
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- verl
|
||||
- Ubuntu
|
||||
- Pytorch
|
||||
- Python
|
||||
- vllm
|
||||
|
||||
* - .. raw:: html
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.6.0.amd0_rocm7.0_vllm0.11.0.dev/images/sha256-f70a3ebc94c1f66de42a2fcc3f8a6a8d6d0881eb0e65b6958d7d6d24b3eecb0d"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `0.6.0 <https://github.com/volcengine/verl/releases/tag/v0.6.0>`__
|
||||
- 22.04
|
||||
- `2.9.0 <https://github.com/ROCm/pytorch/tree/release/2.9-rocm7.x-gfx115x>`__
|
||||
- `3.12.11 <https://www.python.org/downloads/release/python-31211/>`__
|
||||
- `0.11.0 <https://github.com/vllm-project/vllm/releases/tag/v0.11.0>`__
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__
|
||||
- `0.3.0.post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`__
|
||||
- 20.04
|
||||
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
|
||||
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`__
|
||||
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`__
|
||||
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/verl-history` to find documentation for previous releases
|
||||
of the ``ROCm/verl`` Docker image.
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`_
|
||||
- `0.3.0post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`_
|
||||
- 20.04
|
||||
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
|
||||
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`_
|
||||
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`_
|
||||
|
||||
13
docs/conf.py
13
docs/conf.py
@@ -81,7 +81,7 @@ latex_elements = {
|
||||
}
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "rocm.docs.amd.com")
|
||||
html_context = {"docs_header_version": "7.1.1"}
|
||||
html_context = {"docs_header_version": "7.1.0"}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
@@ -93,15 +93,15 @@ project = "ROCm Documentation"
|
||||
project_path = os.path.abspath(".").replace("\\", "/")
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "7.1.1"
|
||||
release = "7.1.1"
|
||||
version = "7.1.0"
|
||||
release = "7.1.0"
|
||||
setting_all_article_info = True
|
||||
all_article_info_os = ["linux", "windows"]
|
||||
all_article_info_author = ""
|
||||
|
||||
# pages with specific settings
|
||||
article_pages = [
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-11-26"},
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-10-30"},
|
||||
{"file": "release/changelog", "os": ["linux"],},
|
||||
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
|
||||
@@ -111,7 +111,6 @@ article_pages = [
|
||||
{"file": "compatibility/ml-compatibility/stanford-megatron-lm-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/flashinfer-compatibility", "os": ["linux"]},
|
||||
@@ -145,7 +144,6 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
@@ -250,6 +248,3 @@ html_context = {
|
||||
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
|
||||
"scope_type" : [('Device', 'device'), ('System', 'system')]
|
||||
}
|
||||
|
||||
# Disable figure and table numbering
|
||||
numfig = False
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
|
||||
ROCm: 7.9.0
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
@@ -1,109 +0,0 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
- version: v25-11
|
||||
pull_tag: rocm/pytorch-xdit:v25.11
|
||||
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
|
||||
ROCm: 7.10.0
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
models:
|
||||
- Hunyuan Video
|
||||
- group: Wan-AI
|
||||
models:
|
||||
- Wan2.1
|
||||
- Wan2.2
|
||||
- group: FLUX
|
||||
models:
|
||||
- FLUX.1
|
||||
whats_new:
|
||||
- "Minor bug fixes and clarifications to READMEs."
|
||||
- "Bumps TheRock, AITER, Diffusers, xDiT versions."
|
||||
- "Changes Aiter rounding mode for faster gfx942 FWD Attention."
|
||||
components:
|
||||
TheRock: 3e3f834
|
||||
rccl: d23d18f
|
||||
composable_kernel: 2570462
|
||||
rocm-libraries: 0588f07
|
||||
rocm-systems: 473025a
|
||||
torch: 73adac
|
||||
torchvision: f5c6c2e
|
||||
triton: 7416ffc
|
||||
accelerate: 34c1779
|
||||
aiter: de14bec
|
||||
diffusers: 40528e9
|
||||
xfuser: 83978b5
|
||||
yunchang: 2c9b712
|
||||
|
||||
- version: v25-10
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
|
||||
ROCm: 7.9.0
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
models:
|
||||
- Hunyuan Video
|
||||
- group: Wan-AI
|
||||
models:
|
||||
- Wan2.1
|
||||
- Wan2.2
|
||||
- group: FLUX
|
||||
models:
|
||||
- FLUX.1
|
||||
whats_new:
|
||||
- "First official xDiT Docker Release for Diffusion Inference."
|
||||
- "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
|
||||
- "Support Wan 2.1, Wan 2.2, HunyuanVideo and Flux workloads."
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
page_tag: hunyuan_tag
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
page_tag: wan_21_tag
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
page_tag: wan_22_tag
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
page_tag: flux_1_tag
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
@@ -1,17 +1,21 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||
@@ -1,65 +0,0 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
config_name: llama3.3_70B-pretrain.yaml
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
config_name: llama3.1_70B-pretrain.yaml
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
config_name: llama3.1_8B-pretrain.yaml
|
||||
- model: Llama 2 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
|
||||
config_name: llama2_7B-pretrain.yaml
|
||||
- model: Llama 2 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
|
||||
config_name: llama2_70B-pretrain.yaml
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
config_name: deepseek_v3-pretrain.yaml
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
config_name: deepseek_v2_lite-pretrain.yaml
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
config_name: mixtral_8x7B_v0.1-pretrain.yaml
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
config_name: mixtral_8x22B_v0.1-pretrain.yaml
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||
config_name: primus_qwen2.5_7B-pretrain.yaml
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
config_name: qwen2.5_72B-pretrain.yaml
|
||||
@@ -1,39 +0,0 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_8b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_8b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_70b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_70b_fsdp_fp8.toml"
|
||||
@@ -1,186 +0,0 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 4 Scout 17B-16E
|
||||
mad_tag: pyt_train_llama-4-scout-17b-16e
|
||||
model_repo: Llama-4-17B_16E
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 3.2 1B
|
||||
mad_tag: pyt_train_llama-3.2-1b
|
||||
model_repo: Llama-3.2-1B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 3B
|
||||
mad_tag: pyt_train_llama-3.2-3b
|
||||
model_repo: Llama-3.2-3B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-3B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 Vision 11B
|
||||
mad_tag: pyt_train_llama-3.2-vision-11b
|
||||
model_repo: Llama-3.2-Vision-11B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.2 Vision 90B
|
||||
mad_tag: pyt_train_llama-3.2-vision-90b
|
||||
model_repo: Llama-3.2-Vision-90B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora]
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_train_llama-3.1-405b
|
||||
model_repo: Llama-3.1-405B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B
|
||||
precision: BF16
|
||||
training_modes: [finetune_qlora]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: pyt_train_llama-3-8b
|
||||
model_repo: Llama-3-8B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: pyt_train_llama-3-70b
|
||||
model_repo: Llama-3-70B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 2 13B
|
||||
mad_tag: pyt_train_llama-2-13b
|
||||
model_repo: Llama-2-13B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora, finetune_qlora]
|
||||
- group: OpenAI
|
||||
tag: openai
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_train_gpt_oss_20b
|
||||
model_repo: GPT-OSS-20B
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_train_gpt_oss_120b
|
||||
model_repo: GPT-OSS-120B
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 3 8B
|
||||
mad_tag: pyt_train_qwen3-8b
|
||||
model_repo: Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 3 32B
|
||||
mad_tag: pyt_train_qwen3-32b
|
||||
model_repo: Qwen3-32
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 32B
|
||||
mad_tag: pyt_train_qwen2.5-32b
|
||||
model_repo: Qwen2.5-32B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_train_qwen2.5-72b
|
||||
model_repo: Qwen2.5-72B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-72B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2 1.5B
|
||||
mad_tag: pyt_train_qwen2-1.5b
|
||||
model_repo: Qwen2-1.5B
|
||||
url: https://huggingface.co/Qwen/Qwen2-1.5B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 2 7B
|
||||
mad_tag: pyt_train_qwen2-7b
|
||||
model_repo: Qwen2-7B
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- group: Stable Diffusion
|
||||
tag: sd
|
||||
models:
|
||||
- model: Stable Diffusion XL
|
||||
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1-dev
|
||||
mad_tag: pyt_train_flux
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
|
||||
precision: FP32
|
||||
@@ -1,15 +1,22 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -1,32 +1,39 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V2 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v2
|
||||
model_repo: DeepSeek-V2
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
|
||||
precision: BF16
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_8b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_8b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_70b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_70b_fsdp_fp8.toml"
|
||||
|
||||
@@ -1,15 +1,21 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
@@ -113,15 +119,6 @@ model_groups:
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V2 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v2
|
||||
model_repo: DeepSeek-V2
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
|
||||
precision: BF16
|
||||
training_modes: [pretrain]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
@@ -169,7 +166,7 @@ model_groups:
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain]
|
||||
training_modes: [posttrain-p]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
@@ -178,20 +175,12 @@ model_groups:
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain]
|
||||
training_modes: [posttrain-p]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/ROCm/FluxBenchmark
|
||||
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
|
||||
precision: FP32
|
||||
- group: DLRM
|
||||
tag: dlrm
|
||||
models:
|
||||
- model: DLRM v2
|
||||
mad_tag: pyt_train_dlrm
|
||||
model_repo: DLRM
|
||||
url: https://github.com/AMD-AGI/DLRMBenchmark
|
||||
training_modes: [pretrain]
|
||||
|
||||
@@ -32,7 +32,7 @@ library_groups:
|
||||
|
||||
- name: "MIGraphX"
|
||||
tag: "migraphx"
|
||||
doc_link: "amdmigraphx:reference/cpp"
|
||||
doc_link: "amdmigraphx:reference/MIGraphX-cpp"
|
||||
data_types:
|
||||
- type: "int8"
|
||||
support: "⚠️"
|
||||
@@ -290,7 +290,7 @@ library_groups:
|
||||
|
||||
- name: "Tensile"
|
||||
tag: "tensile"
|
||||
doc_link: "tensile:reference/precision-support"
|
||||
doc_link: "tensile:src/reference/precision-support"
|
||||
data_types:
|
||||
- type: "int8"
|
||||
support: "✅"
|
||||
|
||||
@@ -100,18 +100,6 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
||||
|
||||
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ After loading the model in this way, the model is fully ready to use the resourc
|
||||
torchtune for fine-tuning and inference
|
||||
=============================================
|
||||
|
||||
`torchtune <https://meta-pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
|
||||
`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
|
||||
model fine-tuning and inference with LLMs.
|
||||
|
||||
#. Install torchtune using pip.
|
||||
|
||||
@@ -1,396 +0,0 @@
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
|
||||
prebuilt and optimized docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
|
||||
xDiT diffusion inference
|
||||
************************
|
||||
|
||||
.. _xdit-video-diffusion:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
|
||||
a prebuilt, optimized inference environment based on `xDiT
|
||||
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion-based
|
||||
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
|
||||
and MI300X (gfx942) GPUs.
|
||||
This image is based on ROCm {{docker.ROCm}} preview release via `TheRock <https://github.com/ROCm/TheRock>`_
|
||||
and includes the following software components:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
- Initial ROCm-enabled xDiT Docker release for diffusion inference.
|
||||
- Supported architectures: gfx942 and gfx950 (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X).
|
||||
- Supported workloads: Wan 2.1, Wan 2.2, HunyuanVideo, and Flux models.
|
||||
|
||||
.. _xdit-video-diffusion-supported-models:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length == 1 %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
|
||||
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
|
||||
external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA
|
||||
auto-balancing, you can skip this step. Otherwise, complete the procedures in
|
||||
the `System validation and optimization
|
||||
<https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.html>`__
|
||||
guide to properly configure your system settings before starting.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
|
||||
======================
|
||||
|
||||
Once the image has been downloaded you can follow these steps to
|
||||
run benchmarks and generate outputs.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups %}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-setup:
|
||||
|
||||
Prepare the model
|
||||
-----------------
|
||||
|
||||
.. note::
|
||||
|
||||
If you're using ROCm MAD to :ref:`run your model
|
||||
<xdit-video-diffusion-run>`, you can skip this section. MAD will handle
|
||||
starting the container and downloading required models inside the container.
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
If you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-run:
|
||||
|
||||
Run inference
|
||||
=============
|
||||
|
||||
You can benchmark models through `MAD <https://github.com/ROCm/MAD>`__-integrated automation or standalone
|
||||
torchrun commands.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model tencent/HunyuanVideo \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd Wan2.1
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
|
||||
--image "/app/Wan2.1/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd Wan2.2
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-A14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
|
||||
--image "/app/Wan2.2/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model black-forest-labs/FLUX.1-dev \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 1 \
|
||||
--benchmark_output_directory results
|
||||
|
||||
{% endif %}
|
||||
|
||||
The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see `AMD
|
||||
Infinity Hub
|
||||
<https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`__.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`xdit-history` to find documentation for previous releases
|
||||
of xDiT diffusion inference performance testing.
|
||||
@@ -1,56 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
************************************************************
|
||||
xDiT diffusion inference performance testing version history
|
||||
************************************************************
|
||||
|
||||
This table lists previous versions of the ROCm xDiT diffusion inference performance
|
||||
testing environment. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image tag
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.11`` (latest)
|
||||
-
|
||||
* ROCm 7.10.0 preview
|
||||
* TheRock 3e3f834
|
||||
* rccl d23d18f
|
||||
* composable_kernel 2570462
|
||||
* rocm-libraries 0588f07
|
||||
* rocm-systems 473025a
|
||||
* torch 73adac
|
||||
* torchvision f5c6c2e
|
||||
* triton 7416ffc
|
||||
* accelerate 34c1779
|
||||
* aiter de14bec
|
||||
* diffusers 40528e9
|
||||
* xfuser 83978b5
|
||||
* yunchang 2c9b712
|
||||
-
|
||||
* :doc:`Documentation <../../xdit-diffusion-inference>`
|
||||
* `Docker Hub <https://hub.docker.com/r/rocm/pytorch-xdit>`__
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.10``
|
||||
-
|
||||
* ROCm 7.9.0 preview
|
||||
* TheRock 7afbe45
|
||||
* rccl 9b04b2a
|
||||
* composable_kernel b7a806f
|
||||
* rocm-libraries f104555
|
||||
* rocm-systems 25922d0
|
||||
* torch 2.10.0a0+gite9c9017
|
||||
* torchvision 0.22.0a0+966da7e
|
||||
* triton 3.5.0+git52e49c12
|
||||
* accelerate 1.11.0.dev0
|
||||
* aiter 0.1.5.post4.dev20+ga25e55e79
|
||||
* diffusers 0.36.0.dev0
|
||||
* xfuser 0.4.4
|
||||
* yunchang 0.6.3.post1
|
||||
-
|
||||
* :doc:`Documentation <xdit-25.10>`
|
||||
* `Docker Hub <https://hub.docker.com/r/rocm/pytorch-xdit>`__
|
||||
@@ -27,6 +27,3 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
|
||||
- :doc:`SGLang inference performance testing <benchmark-docker/sglang>`
|
||||
|
||||
- :doc:`Deploying your model <deploy-your-model>`
|
||||
|
||||
- :doc:`xDiT diffusion inference <xdit-diffusion-inference>`
|
||||
|
||||
|
||||
@@ -1,388 +0,0 @@
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
|
||||
prebuilt and optimized docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
|
||||
xDiT diffusion inference
|
||||
************************
|
||||
|
||||
.. _xdit-video-diffusion:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
|
||||
benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
|
||||
The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
|
||||
and includes the following components:
|
||||
|
||||
.. dropdown:: Software components
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
|
||||
==========
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for item in docker.whats_new %}
|
||||
* {{ item }}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-supported-models:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups %}
|
||||
|
||||
{# Create a lookup for supported models #}
|
||||
{% set supported_lookup = {} %}
|
||||
{% for supported in docker.supported_models %}
|
||||
{% set _ = supported_lookup.update({supported.group: supported.models}) %}
|
||||
{% endfor %}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% if model_group.group in supported_lookup %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% if model_group.group in supported_lookup %}
|
||||
{% set supported_models = supported_lookup[model_group.group] %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.model in supported_models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.page_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
|
||||
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
|
||||
external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
|
||||
======================
|
||||
|
||||
Once the image has been downloaded you can follow these steps to
|
||||
run benchmarks and generate outputs.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.page_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Choose your setup method
|
||||
------------------------
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{model.page_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
If you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Run inference
|
||||
=============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.page_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model tencent/HunyuanVideo \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd Wan2.1
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
|
||||
--image "/app/Wan2.1/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd Wan2.2
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-A14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
|
||||
--image "/app/Wan2.2/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model black-forest-labs/FLUX.1-dev \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 1 \
|
||||
--benchmark_output_directory results
|
||||
|
||||
{% endif %}
|
||||
|
||||
The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases
|
||||
of xDiT diffusion inference performance testing.
|
||||
@@ -254,7 +254,7 @@ PyTorch training
|
||||
The ROCm PyTorch Training Docker image now focuses on :doc:`Training a model
|
||||
with Primus and PyTorch <../training/benchmark-docker/primus-pytorch>`. The
|
||||
following example refers to the legacy workflow :ref:`Training a
|
||||
model with PyTorch <amd-pytorch-training-multinode-examples-v259>`.
|
||||
model with PyTorch <amd-pytorch-training-multinode-examples>`.
|
||||
|
||||
1. Download the ``run_multinode_train.sh`` benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/pytorch_train>`__.
|
||||
|
||||
@@ -277,7 +277,7 @@ PyTorch training
|
||||
|
||||
.. seealso::
|
||||
|
||||
See :ref:`Training a model with PyTorch <amd-pytorch-training-multinode-examples-v259>` for more examples and information.
|
||||
See :ref:`Training a model with PyTorch <amd-pytorch-training-multinode-examples>` for more examples and information.
|
||||
|
||||
Megatron-LM
|
||||
-----------
|
||||
|
||||
@@ -36,10 +36,12 @@ accelerate training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -47,12 +49,12 @@ accelerate training workloads:
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-megatron-lm-model-support-v2510:
|
||||
{% endfor %}
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -97,7 +99,7 @@ accelerate training workloads:
|
||||
Some models, such as Llama, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
.. _amd-megatron-lm-performance-measurements-v2510:
|
||||
.. _amd-megatron-lm-performance-measurements:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -129,7 +131,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training-v2510:
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
@@ -138,38 +140,52 @@ Use the following instructions to set up the environment, configure the script t
|
||||
reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements-v2510:
|
||||
.. _amd-megatron-lm-requirements:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
{% set dockers = data.dockers %}
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 128G \
|
||||
--name megatron_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 128G \
|
||||
--name megatron_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||
|
||||
@@ -190,7 +206,7 @@ Download the Docker image
|
||||
The Docker container hosts a verified commit of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.
|
||||
|
||||
.. _amd-megatron-lm-environment-setup-v2510:
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
|
||||
Configuration
|
||||
=============
|
||||
@@ -200,39 +216,39 @@ Configuration
|
||||
Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
|
||||
Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
|
||||
Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
|
||||
.. note::
|
||||
|
||||
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v2510>` for more information on configuration options.
|
||||
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars>` for more information on configuration options.
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
@@ -240,7 +256,7 @@ Multi-node configuration
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.
|
||||
|
||||
.. _amd-megatron-lm-tokenizer-v2510:
|
||||
.. _amd-megatron-lm-tokenizer:
|
||||
|
||||
Tokenizer
|
||||
---------
|
||||
@@ -377,7 +393,7 @@ Download the dataset
|
||||
|
||||
``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
|
||||
Remember to either pre-download the tokenizer or setup Hugging Face access
|
||||
otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v2510>` section.
|
||||
otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer>` section.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -479,38 +495,15 @@ Download the dataset
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. _amd-megatron-lm-run-training-v2510:
|
||||
.. _amd-megatron-lm-run-training:
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Use the following example commands to set up the environment, configure
|
||||
:ref:`key options <amd-megatron-lm-benchmark-test-vars-v2510>`, and run training on
|
||||
:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
|
||||
MI300X Series GPUs with the AMD Megatron-LM environment.
|
||||
|
||||
Before starting training, export the following environment variables.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
|
||||
.. tab-item:: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
|
||||
# Set this on MI325X/MI300X only
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
Single node training
|
||||
--------------------
|
||||
|
||||
@@ -920,7 +913,7 @@ Single node training
|
||||
RECOMPUTE_ACTIVATIONS=full \
|
||||
CKPT_FORMAT=torch_dist
|
||||
|
||||
.. _amd-megatron-lm-multi-node-examples-v2510:
|
||||
.. _amd-megatron-lm-multi-node-examples:
|
||||
|
||||
Multi-node training examples
|
||||
----------------------------
|
||||
@@ -971,7 +964,7 @@ training on 16 nodes, try the following command:
|
||||
|
||||
sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars-v2510:
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
|
||||
Key options
|
||||
-----------
|
||||
@@ -1036,6 +1029,11 @@ The benchmark tests support the following sets of variables.
|
||||
``RECOMPUTE_NUM_LAYERS``
|
||||
Number of layers used for checkpointing recompute.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
|
||||
@@ -16,23 +16,14 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - v25.10 (latest)
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <../primus-megatron>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.9
|
||||
* - v25.9 (latest)
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* Primus 0.3.0
|
||||
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <primus-megatron-v25.9>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.9>`
|
||||
* :doc:`Primus Megatron documentation <../primus-megatron>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
|
||||
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
|
||||
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,574 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
****************************************
|
||||
Training a model with Primus and PyTorch
|
||||
****************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm Primus PyTorch training
|
||||
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
|
||||
|
||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
||||
LLM training framework designed to streamline training. It streamlines LLM
|
||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
||||
Primus now supports the PyTorch torchtitan backend.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
|
||||
:doc:`../pytorch-training` to see steps to run workloads without Primus.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
|
||||
MI300X GPUs containing essential components for Primus and PyTorch training
|
||||
with Primus Turbo optimizations.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. seealso::
|
||||
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the documentation :doc:`../pytorch-training` (without Primus)
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v259:
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between the following two workflows to start benchmarking training.
|
||||
For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
|
||||
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
|
||||
tweak some configurations (such as batch sizes).
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, Primus torchtitan models are run with Primus Turbo
|
||||
enabled for enhanced performance. To disable Primus Turbo,
|
||||
modify respective configuration file
|
||||
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Primus benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To get started, navigate to the ``Primus`` directory in your container.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd /workspace/Primus
|
||||
|
||||
Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
|
||||
included with Primus with the appropriate options.
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-8b
|
||||
|
||||
Use the following command to run train Llama 3.1 8B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
Use the following command to run train Llama 3.1 70B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
To train Llama 3.1 70B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 3
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone torchtitan benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. Navigate to the ``torchtitan`` workspace directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/torchtitan
|
||||
|
||||
.. rubric:: Download the tokenizer
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Download the tokenizer for your model.
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 scripts/download_tokenizer.py \
|
||||
--repo_id {{ model.model_repo }} \
|
||||
--tokenizer_path "original" \
|
||||
--hf_token=${HF_TOKEN}
|
||||
|
||||
.. rubric:: Pretraining examples
|
||||
|
||||
Run the training script with the appropriate configuration file.
|
||||
|
||||
For train with BF16 precicion, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.bf16 }} \
|
||||
.run_train.sh
|
||||
|
||||
For train with BF16 precicion, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.fp8 }} \
|
||||
.run_train.sh
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
|
||||
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -16,23 +16,14 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - v25.10 (latest)
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
|
||||
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.9
|
||||
* - v25.9 (latest)
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* Primus 0.3.0
|
||||
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.9>`
|
||||
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.9>`
|
||||
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
|
||||
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
|
||||
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
|
||||
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
|
||||
|
||||
|
||||
@@ -240,7 +240,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html>`_
|
||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||
|
||||
@@ -1,667 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch on ROCm
|
||||
**************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm PyTorch training
|
||||
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
||||
|
||||
See :doc:`../primus-pytorch` for details.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
The PyTorch for ROCm training Docker image provides a prebuilt optimized
|
||||
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
|
||||
and MI300X GPUs. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct
|
||||
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
|
||||
training recommendations in this documentation might vary by model -- select
|
||||
one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes-v259:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. dropdown:: Supported training modes
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- Supported training modes
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.training_modes %}
|
||||
* - {{ model.model }}
|
||||
- ``{{ model.training_modes | join('``, ``') }}``
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
Some model and fine-tuning combinations are not listed. This is
|
||||
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
|
||||
doesn't provide default YAML configurations for them.
|
||||
For advanced usage, you can create a custom configuration to enable
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements-v259:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. note::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`__
|
||||
|
||||
* - ``tiktoken``
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`__
|
||||
|
||||
* - ``blobfile``
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`__
|
||||
|
||||
* - ``tabulate``
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`__
|
||||
|
||||
* - ``wandb``
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`__
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
.. container:: model-doc pyt_train_flux
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||
|
||||
* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"pretrain": "Benchmark pre-training.",
|
||||
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Pre-training
|
||||
|
||||
To start the pre-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
{% if model.mad_tag == "pyt_train_flux" %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, FLUX models are not supported out-of-the-box on this Docker.
|
||||
To use FLUX, refer to ``rocm/pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`
|
||||
|
||||
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||
error, manually download it from Hugging Face at
|
||||
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
|
||||
the required dataset.
|
||||
{% endif %}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Sequence length for the language model.
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
{% endif %}
|
||||
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"posttrain": "Benchmark post-training.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Post-training
|
||||
|
||||
To start the post-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Sequence length for the language model.
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
{% endif %}
|
||||
|
||||
{% set training_mode_descs = {
|
||||
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
|
||||
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
|
||||
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
|
||||
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
|
||||
{% if available_modes %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Fine-tuning
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
|
||||
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 16384.
|
||||
- Sequence length for the language model.
|
||||
|
||||
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
|
||||
.. note::
|
||||
|
||||
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
|
||||
use the following torchtune commit for compatibility:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
|
||||
|
||||
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
|
||||
.. note::
|
||||
|
||||
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
|
||||
input tensor should be smaller than max_seq_len (4096)``.
|
||||
This error indicates that an input sequence is longer than the model's maximum context window.
|
||||
|
||||
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
|
||||
tokens in this case). You can resolve this by truncating the input or splitting
|
||||
it into smaller chunks before passing it to the model.
|
||||
|
||||
Note on reproducibility: The results in this guide are based on
|
||||
commit ``b4c98ac`` from the upstream
|
||||
`<https://github.com/pytorch/torchtune>`__ repository. For the
|
||||
latest updates, you can use the main branch.
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples-v259:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
|
||||
|
||||
Pre-training
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch run_slurm_train.sh
|
||||
|
||||
Fine-tuning
|
||||
~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli login # Get access to HF Llama model space
|
||||
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch Torchtune_Multinode.sh
|
||||
|
||||
.. note::
|
||||
|
||||
Information regarding benchmark setup:
|
||||
|
||||
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
|
||||
* You can adjust the torchtune `YAML configuration file
|
||||
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
|
||||
if you're using a different model.
|
||||
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
|
||||
* Set the ``mounting_paths`` inside the SLURM script.
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -31,10 +31,12 @@ Megatron-LM.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -42,12 +44,13 @@ Megatron-LM.
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-megatron-lm-model-support-v2510:
|
||||
.. _amd-primus-megatron-lm-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -108,7 +111,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-primus-megatron-lm-training-v2510:
|
||||
.. _mi300x-amd-primus-megatron-lm-training-v259:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
@@ -118,49 +121,63 @@ Environment setup
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on AMD Instinct GPUs.
|
||||
|
||||
.. _amd-primus-megatron-lm-requirements-v2510:
|
||||
.. _amd-primus-megatron-lm-requirements-v259:
|
||||
|
||||
Pull the Docker image
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
{% set dockers = data.dockers %}
|
||||
|
||||
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
--shm-size 128G \
|
||||
--name primus_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||
.. code-block:: shell
|
||||
|
||||
.. code-block:: shell
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
--shm-size 128G \
|
||||
--name primus_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
docker start primus_training_env
|
||||
docker exec -it primus_training_env bash
|
||||
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||
|
||||
The Docker container hosts verified branch ``release/v25.10`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/release/v25.10>`__ repository.
|
||||
.. code-block:: shell
|
||||
|
||||
.. _amd-primus-megatron-lm-environment-setup-v2510:
|
||||
docker start primus_training_env
|
||||
docker exec -it primus_training_env bash
|
||||
|
||||
The Docker container hosts verified commit ``e16b27b`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/e16b27b>`__ repository.
|
||||
|
||||
.. _amd-primus-megatron-lm-environment-setup-v259:
|
||||
|
||||
Configuration
|
||||
=============
|
||||
@@ -207,7 +224,7 @@ You can use either mock data or real data for training.
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. _amd-primus-megatron-lm-tokenizer-v2510:
|
||||
.. _amd-primus-megatron-lm-tokenizer-v259:
|
||||
|
||||
Tokenizer
|
||||
---------
|
||||
@@ -228,7 +245,7 @@ right permissions to access the tokenizer for each model.
|
||||
<https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
|
||||
definition.
|
||||
|
||||
.. _amd-primus-megatron-lm-run-training-v2510:
|
||||
.. _amd-primus-megatron-lm-run-training-v259:
|
||||
|
||||
Run training
|
||||
============
|
||||
@@ -252,7 +269,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.3 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.3 70B BF16, run:
|
||||
|
||||
@@ -263,7 +280,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/llama3.3_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 6 \
|
||||
@@ -274,12 +291,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.3_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 2 \
|
||||
@@ -289,7 +301,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 8B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.1 8B FP8, run:
|
||||
|
||||
@@ -300,7 +312,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
@@ -312,12 +324,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
@@ -331,7 +338,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
@@ -342,12 +349,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
@@ -355,7 +357,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.1 70B BF16, run:
|
||||
|
||||
@@ -366,7 +368,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
@@ -377,12 +379,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
@@ -401,7 +398,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
@@ -414,12 +411,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 40 \
|
||||
@@ -430,7 +422,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 2 7B FP8, run:
|
||||
|
||||
@@ -441,7 +433,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
@@ -453,12 +445,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
@@ -472,7 +459,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 10 \
|
||||
@@ -483,12 +470,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
@@ -496,7 +478,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 2 70B BF16, run:
|
||||
|
||||
@@ -507,7 +489,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/llama2_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 17 \
|
||||
@@ -518,12 +500,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
@@ -531,7 +508,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to DeepSeek-V3.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
|
||||
use the following command:
|
||||
@@ -543,7 +520,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/deepseek_v3-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--num_layers 3 \
|
||||
--moe_layer_freq 1 \
|
||||
@@ -556,24 +533,17 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/deepseek_v3-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--num_layers 3 \
|
||||
--moe_layer_freq 1 \
|
||||
--micro_batch_size 3 \
|
||||
--global_batch_size 192 \
|
||||
--train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to DeepSeek-V2-Lite.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
|
||||
use the following command:
|
||||
@@ -585,7 +555,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 12 \
|
||||
@@ -596,12 +566,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--global_batch_size 256
|
||||
@@ -610,7 +575,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||
use the following command:
|
||||
@@ -622,7 +587,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
@@ -633,12 +598,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
@@ -646,7 +606,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x22B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
|
||||
use the following command:
|
||||
@@ -658,7 +618,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 4 \
|
||||
@@ -671,12 +631,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 4 \
|
||||
@@ -688,7 +643,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run training on a single node for Qwen 2.5 7B BF16, use the following
|
||||
command:
|
||||
@@ -700,7 +655,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 16 \
|
||||
@@ -711,12 +666,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
@@ -729,7 +679,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
@@ -741,12 +691,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
@@ -755,7 +700,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 72B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
|
||||
|
||||
@@ -766,7 +711,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 16 \
|
||||
@@ -777,16 +722,11 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_72B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
.. _amd-primus-megatron-multi-node-examples-v2510:
|
||||
.. _amd-primus-megatron-multi-node-examples-v259:
|
||||
|
||||
Multi-node training examples
|
||||
----------------------------
|
||||
@@ -800,27 +740,28 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
.. code-block:: shell
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
|
||||
cd Primus
|
||||
git checkout release/v25.10
|
||||
git submodule update --init --recursive
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||
export HF_TOKEN=<your_HF_token>
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
|
||||
.. code-block:: shell
|
||||
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
|
||||
cd Primus
|
||||
git checkout e16b27b
|
||||
|
||||
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||
export HF_TOKEN=<your_HF_token>
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -828,13 +769,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
* If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect. However, since NICs can vary accross different cluster, it is encouraged to explicitly export your NCCL parameters for the cluster.
|
||||
* To find your network interface, you can use ``ip a``.
|
||||
* To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
|
||||
* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v2510`) as appropriate.
|
||||
* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v259`) as appropriate.
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 8B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To train Llama 3.1 8B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -852,7 +793,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To train Llama 2 7B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -870,7 +811,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To train Llama 3.1 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -902,7 +843,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To train Llama 2 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -934,7 +875,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.3 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To train Llama 3.3 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -966,7 +907,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To train Mixtral 8x7B BF16 on 8 nodes, run:
|
||||
|
||||
@@ -984,7 +925,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
|
||||
To train Qwen2.5 72B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -1001,7 +942,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
.. _amd-primus-megatron-lm-benchmark-test-vars-v2510:
|
||||
.. _amd-primus-megatron-lm-benchmark-test-vars-v259:
|
||||
|
||||
Key options
|
||||
-----------
|
||||
@@ -1046,10 +987,7 @@ num_layers
|
||||
Known issues
|
||||
============
|
||||
|
||||
DeepSeekV3 proxy model and Mixtral 8x22B proxy model may exit with an error
|
||||
due to a memory free issue. However, this does not impacts training runs. All
|
||||
iterations, in this case 50, should have been completed before the exit and
|
||||
the results should be available in the end.
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
@@ -29,10 +29,12 @@ with Primus Turbo optimizations.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -40,12 +42,13 @@ with Primus Turbo optimizations.
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v2510:
|
||||
.. _amd-primus-pytorch-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -64,7 +67,7 @@ vary by model -- select one to get started.
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
@@ -91,7 +94,7 @@ vary by model -- select one to get started.
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the documentation :doc:`pytorch-training` (without Primus)
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v2510:
|
||||
.. _amd-primus-pytorch-performance-measurements-v259:
|
||||
|
||||
System validation
|
||||
=================
|
||||
@@ -117,11 +120,20 @@ Pull the Docker image
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker pull {{ data.docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Run training
|
||||
============
|
||||
@@ -133,7 +145,7 @@ tweak some configurations (such as batch sizes).
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. tab-set::
|
||||
@@ -146,7 +158,7 @@ tweak some configurations (such as batch sizes).
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -173,6 +185,13 @@ tweak some configurations (such as batch sizes).
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, Primus torchtitan models are run with Primus Turbo
|
||||
enabled for enhanced performance. To disable Primus Turbo,
|
||||
modify respective configuration file
|
||||
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
@@ -184,34 +203,48 @@ tweak some configurations (such as batch sizes).
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
@@ -250,28 +283,37 @@ tweak some configurations (such as batch sizes).
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 4
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
@@ -279,28 +321,37 @@ tweak some configurations (such as batch sizes).
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 7
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 5
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
@@ -313,57 +364,36 @@ tweak some configurations (such as batch sizes).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 4
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
To train Llama 3.1 70B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 3
|
||||
|
||||
.. container:: model-doc primus_pyt_train_deepseek-v2
|
||||
|
||||
Use the following command to run train DeepSeek V2 16B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
@@ -371,55 +401,151 @@ tweak some configurations (such as batch sizes).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 16
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 10
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 3
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
To train DeepSeek V2 16B with FP8 precision, use the following command.
|
||||
.. tab-item:: Standalone torchtitan benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 16
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. Navigate to the ``torchtitan`` workspace directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/torchtitan
|
||||
|
||||
.. rubric:: Download the tokenizer
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Download the tokenizer for your model.
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 scripts/download_tokenizer.py \
|
||||
--repo_id {{ model.model_repo }} \
|
||||
--tokenizer_path "original" \
|
||||
--hf_token=${HF_TOKEN}
|
||||
|
||||
.. rubric:: Pretraining examples
|
||||
|
||||
Run the training script with the appropriate configuration file.
|
||||
|
||||
For train with BF16 precicion, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.bf16 }} \
|
||||
.run_train.sh
|
||||
|
||||
For train with BF16 precicion, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.fp8 }} \
|
||||
.run_train.sh
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -27,10 +27,12 @@ training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -38,12 +40,13 @@ training workloads:
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support-v2510:
|
||||
.. _amd-pytorch-training-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -85,7 +88,7 @@ one to get started.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes-v2510:
|
||||
.. _amd-pytorch-training-supported-training-modes-v259:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
@@ -120,7 +123,7 @@ The following table lists supported training modes per model.
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements-v2510:
|
||||
.. _amd-pytorch-training-performance-measurements-v259:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -161,7 +164,7 @@ Run training
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
@@ -176,7 +179,7 @@ Run training
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -214,7 +217,7 @@ Run training
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
@@ -223,28 +226,42 @@ Run training
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
@@ -402,34 +419,11 @@ Run training
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Pretraining
|
||||
.. rubric:: Pre-training
|
||||
|
||||
To start the pre-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
{% if model.mad_tag == "pyt_train_dlrm" %}
|
||||
|
||||
1. Go to the DLRM directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/DLRMBenchmark
|
||||
|
||||
2. To run the single node training benchmark for DLRM-v2 with TF32 precision,
|
||||
run the following script.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./launch_training_single_node.sh
|
||||
|
||||
To run with MAD within the Docker container, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -m DLRM
|
||||
|
||||
{% else %}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
@@ -472,7 +466,6 @@ Run training
|
||||
* - ``$sequence_length``
|
||||
- Sequence length for the language model.
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{% set training_modes = model.training_modes %}
|
||||
@@ -532,7 +525,7 @@ Run training
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v2510>`.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
@@ -597,7 +590,7 @@ Run training
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples-v2510:
|
||||
.. _amd-pytorch-training-multinode-examples-v259:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
@@ -646,6 +639,11 @@ To launch the training job on a SLURM cluster for Llama 3.3 70B, run the followi
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -93,7 +93,7 @@ The following table shows whether a ROCm library is graph-safe.
|
||||
- ⚠️ (experimental)
|
||||
*
|
||||
- `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- ❌
|
||||
- ❌ (see :doc:`details <rocthrust:reference/rocThrust-hipgraph-support>`)
|
||||
*
|
||||
- `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- ❌
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
|
||||
| Version | Release date |
|
||||
| ------- | ------------ |
|
||||
| [7.1.1](https://rocm.docs.amd.com/en/docs-7.1.1/) | November 26, 2025 |
|
||||
| [7.1.0](https://rocm.docs.amd.com/en/docs-7.1.0/) | October 30, 2025 |
|
||||
| [7.0.2](https://rocm.docs.amd.com/en/docs-7.0.2/) | October 10, 2025 |
|
||||
| [7.0.1](https://rocm.docs.amd.com/en/docs-7.0.1/) | September 17, 2025 |
|
||||
|
||||
@@ -43,8 +43,6 @@ subtrees:
|
||||
title: DGL compatibility
|
||||
- file: compatibility/ml-compatibility/megablocks-compatibility.rst
|
||||
title: Megablocks compatibility
|
||||
- file: compatibility/ml-compatibility/taichi-compatibility.rst
|
||||
title: Taichi compatibility
|
||||
- file: compatibility/ml-compatibility/ray-compatibility.rst
|
||||
title: Ray compatibility
|
||||
- file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
|
||||
@@ -117,8 +115,6 @@ subtrees:
|
||||
title: SGLang inference performance testing
|
||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
|
||||
title: SGLang distributed inference with Mooncake
|
||||
- file: how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
|
||||
title: xDiT diffusion inference
|
||||
- file: how-to/rocm-for-ai/inference/deploy-your-model.rst
|
||||
title: Deploy your model
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
rocm-docs-core==1.31.0
|
||||
rocm-docs-core==1.30.0
|
||||
sphinx-reredirects
|
||||
sphinx-sitemap
|
||||
sphinxcontrib.datatemplates==0.11.0
|
||||
|
||||
@@ -132,7 +132,6 @@ nest-asyncio==1.6.0
|
||||
packaging==25.0
|
||||
# via
|
||||
# ipykernel
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
parso==0.8.5
|
||||
# via jedi
|
||||
@@ -150,7 +149,7 @@ pure-eval==0.2.3
|
||||
# via stack-data
|
||||
pycparser==2.23
|
||||
# via cffi
|
||||
pydata-sphinx-theme==0.15.4
|
||||
pydata-sphinx-theme==0.16.1
|
||||
# via
|
||||
# rocm-docs-core
|
||||
# sphinx-book-theme
|
||||
@@ -188,7 +187,7 @@ requests==2.32.5
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.31.0
|
||||
rocm-docs-core==1.30.0
|
||||
# via -r requirements.in
|
||||
rpds-py==0.29.0
|
||||
# via
|
||||
@@ -218,7 +217,7 @@ sphinx==8.1.3
|
||||
# sphinx-reredirects
|
||||
# sphinxcontrib-datatemplates
|
||||
# sphinxcontrib-runcmd
|
||||
sphinx-book-theme==1.1.4
|
||||
sphinx-book-theme==1.1.3
|
||||
# via rocm-docs-core
|
||||
sphinx-copybutton==0.5.2
|
||||
# via rocm-docs-core
|
||||
|
||||
@@ -124,26 +124,3 @@
|
||||
#rocm-rn-components:has(tbody.rocm-components-runtimes td:hover) tr:hover > td {
|
||||
background-color: var(--pst-color-table-row-hover-bg);
|
||||
}
|
||||
|
||||
/* Left-align text + vertically center content for any table using this class */
|
||||
.table--middle-left {
|
||||
border-collapse: collapse; /* optional but typical for docs tables */
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.table--middle-left th,
|
||||
.table--middle-left td {
|
||||
text-align: left;
|
||||
vertical-align: middle !important; /* override Bootstrap/Sphinx defaults */
|
||||
padding: 0.5rem; /* optional: adjust to your spacing scale */
|
||||
}
|
||||
|
||||
/* Normalize paragraphs inside cells so margins don't disrupt centering */
|
||||
.table--middle-left th p,
|
||||
.table--middle-left td p {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
div.sd-row ul {
|
||||
padding-left: 2rem;
|
||||
}
|
||||
|
||||
@@ -1,60 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<manifest>
|
||||
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
|
||||
<default revision="refs/tags/rocm-7.1.1"
|
||||
remote="rocm-org"
|
||||
sync-c="true"
|
||||
sync-j="4" />
|
||||
<!--list of projects for ROCm-->
|
||||
<project name="ROCK-Kernel-Driver" />
|
||||
<project name="amdsmi" />
|
||||
<project name="rocm_bandwidth_test" />
|
||||
<project name="rocm-examples" />
|
||||
<!--HIP Projects-->
|
||||
<project name="HIPIFY" />
|
||||
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
|
||||
<project name="half" />
|
||||
<project name="llvm-project" />
|
||||
<project name="spirv-llvm-translator" />
|
||||
<!-- gdb projects -->
|
||||
<project name="ROCdbgapi" />
|
||||
<project name="ROCgdb" />
|
||||
<project name="rocr_debug_agent" />
|
||||
<!-- ROCm Libraries -->
|
||||
<project groups="mathlibs" name="AMDMIGraphX" />
|
||||
<project groups="mathlibs" name="MIVisionX" />
|
||||
<project groups="mathlibs" name="ROCmValidationSuite" />
|
||||
<project groups="mathlibs" name="composable_kernel" />
|
||||
<project groups="mathlibs" name="hipSOLVER" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipTensor" />
|
||||
<project groups="mathlibs" name="hipfort" />
|
||||
<project groups="mathlibs" name="rccl" />
|
||||
<project groups="mathlibs" name="rocAL" />
|
||||
<project groups="mathlibs" name="rocALUTION" />
|
||||
<project groups="mathlibs" name="rocDecode" />
|
||||
<project groups="mathlibs" name="rocJPEG" />
|
||||
<!-- The following components have been migrated to rocm-libraries:
|
||||
hipBLAS-common hipBLAS hipBLASLt hipCUB
|
||||
hipFFT hipRAND hipSPARSE hipSPARSELt
|
||||
MIOpen rocBLAS rocFFT rocPRIM rocRAND
|
||||
rocSPARSE rocThrust Tensile -->
|
||||
<project groups="mathlibs" name="rocm-libraries" />
|
||||
<!-- The following components have been migrated to rocm-systems:
|
||||
aqlprofile clr hip hip-tests hipother
|
||||
rdc rocm-core rocm_smi_lib rocminfo rocprofiler-compute
|
||||
rocprofiler-register rocprofiler-sdk rocprofiler-systems
|
||||
rocprofiler rocr-runtime roctracer -->
|
||||
<project groups="mathlibs" name="rocm-systems" />
|
||||
<project groups="mathlibs" name="rocPyDecode" />
|
||||
<project groups="mathlibs" name="rocSHMEM" />
|
||||
<project groups="mathlibs" name="rocSOLVER" />
|
||||
<project groups="mathlibs" name="rocWMMA" />
|
||||
<project groups="mathlibs" name="rocm-cmake" />
|
||||
<project groups="mathlibs" name="rpp" />
|
||||
<project groups="mathlibs" name="TransferBench" />
|
||||
<!-- Projects for OpenMP-Extras -->
|
||||
<project name="aomp" path="openmp-extras/aomp" />
|
||||
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
|
||||
<project name="flang" path="openmp-extras/flang" />
|
||||
</manifest>
|
||||
Reference in New Issue
Block a user