Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-09 22:58:17 -05:00)

Compare commits: target-ext...radeon_ref (77 commits)
Commits: 75f714c038, 0ea5216ace, 2e1b4dd5ee, 2d79b3c4bd, fd59b5fbac, 0a643f4686, d9e5744f7a, ccb849ec02, 42d4867964, 375359a5dd, e92745f1ff, 0fa72358d3, 6fec268a4e, ff14cd1ff5, 8f65688653, 33d1493adb, 4b6c7776a2, af811daa1b, d6c045e482, 78b24cad39, 753a94c0bb, 6ecad57c62, 977554809a, 7b00f4493b, 95c439a272, 94e04fbdc0, 7ab59de8af, 175c817563, 25516d312e, 30c345629a, 210dc94bbb, a54023ccb8, 17e3362dc7, 0f9c0d884d, c890de4b16, 4ea77ab515, c0512612f4, 1c81ac3747, 4bafa42e52, 493801e670, 1a5152b7b3, 874c881012, bdcaeea74c, b02669acf7, 844f10b2b1, d6c14920b4, 4affe10a7c, 81341ef435, abacd328f9, 80b2fb6e26, b53e8decfc, 5fcc2eafde, 2eb0d77bc6, d84b41908f, 986f8284d1, d92d9268dc, 1629d3f0ea, 6cf6b34b2e, c35a0a121a, 412e383654, 39f6fc187d, 05b480fb28, 4fa44d90db, c9ef13d823, f02172050b, 154dbe297a, 993a0a4fd4, c03662f410, 442d7e4750, a09a8f517e, 0bbaab645d, 4b80405e2e, d92e5b6c12, 91fce2e134, 27d53cf082, bc084246be, 9827ba7ff2
@@ -79,7 +79,7 @@ jobs:
           aptPackages: ${{ parameters.aptPackages }}
           pipModules: ${{ parameters.pipModules }}
           packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - task: Bash@3
         displayName: Add lit to PATH
         inputs:
@@ -131,7 +131,7 @@ jobs:
         parameters:
           aptPackages: ${{ parameters.aptPackages }}
           pipModules: ${{ parameters.pipModules }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
@@ -212,7 +212,7 @@ jobs:
         parameters:
           aptPackages: ${{ parameters.aptPackages }}
           pipModules: ${{ parameters.pipModules }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
174  .azuredevops/components/aqlprofile.yml  Normal file
@@ -0,0 +1,174 @@
parameters:
- name: componentName
  type: string
  default: aqlprofile
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''
# monorepo related parameters
- name: sparseCheckoutDir
  type: string
  default: ''
- name: triggerDownstreamJobs
  type: boolean
  default: false
- name: downstreamAggregateNames
  type: string
  default: ''
- name: buildDependsOn
  type: object
  default: null
- name: unifiedBuild
  type: boolean
  default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
  type: boolean
  default: false
- name: aptPackages
  type: object
  default:
    - cmake
    - git
    - ninja-build
    - python3-pip
- name: rocmDependencies
  type: object
  default:
    - clr
    - llvm-project
    - ROCR-Runtime
- name: rocmTestDependencies
  type: object
  default:
    - clr
    - llvm-project
    - ROCR-Runtime
    - rocprofiler-register

- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
      - ${{ each build in parameters.buildDependsOn }}:
        - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
        - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/aqlprofile/cmake_modules
          -DAQLPROFILE_BUILD_TESTS=ON
          -DGPU_TARGETS=${{ job.target }}
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - ${{ if eq(job.os, 'ubuntu2204') }}:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
      - group: common
      - template: /.azuredevops/variables-global.yml
      pool: ${{ job.target }}_test_pool
      workspace:
        clean: all
      steps:
      - checkout: none
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          packageManager: ${{ job.packageManager }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
          preTargetFilter: ${{ parameters.componentName }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
        parameters:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmTestDependencies }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: $(Agent.BuildDirectory)/rocm/share/hsa-amd-aqlprofile/
          testExecutable: ./run_tests.sh
          testParameters: ''
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
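For orientation, a minimal sketch of how a pipeline could consume this new component template. The parameter names and defaults come from the file above; the caller itself, and the single-target matrix it passes, are hypothetical:

# Hypothetical caller of the new aqlprofile component template; anything
# not passed here falls back to the defaults declared in the file above.
jobs:
- template: /.azuredevops/components/aqlprofile.yml
  parameters:
    checkoutRepo: 'self'
    jobMatrix:
      buildJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }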
@@ -77,6 +77,7 @@ parameters:
     - clr
+    - hipBLAS-common
     - llvm-project
     - rocm-cmake
     - rocminfo
     - rocm_smi_lib
     - rocprofiler-register
@@ -144,7 +145,7 @@ jobs:
           aptPackages: ${{ parameters.aptPackages }}
           pipModules: ${{ parameters.pipModules }}
           packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
@@ -54,6 +54,7 @@ parameters:
    - hipSPARSE
    - llvm-project
    - rocBLAS
    - rocm-cmake
    - rocm_smi_lib
    - rocminfo
    - rocprofiler-register
@@ -67,6 +68,7 @@ parameters:
     - llvm-project
+    - hipBLAS-common
     - hipBLASLt
     - rocm-cmake
     - rocBLAS
     - rocminfo
     - rocprofiler-register
@@ -110,7 +112,7 @@ jobs:
           aptPackages: ${{ parameters.aptPackages }}
           pipModules: ${{ parameters.pipModules }}
           packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
@@ -71,7 +71,7 @@ jobs:
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
         parameters:
           aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
@@ -39,6 +39,9 @@ parameters:
     - python3
     - python3-dev
     - python3-pip
+    - libgtest-dev
+    - libboost-filesystem-dev
+    - libboost-program-options-dev
 - name: pipModules
   type: object
   default:
@@ -107,8 +110,12 @@ jobs:
           aptPackages: ${{ parameters.aptPackages }}
           pipModules: ${{ parameters.pipModules }}
           packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+        parameters:
+          dependencyList:
+          - gtest
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
           checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -125,7 +132,7 @@ jobs:
         parameters:
           os: ${{ job.os }}
           extraBuildFlags: >-
-            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
             -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
             -DORIGAMI_BUILD_SHARED_LIBS=ON
             -DORIGAMI_ENABLE_PYTHON=ON
@@ -206,7 +213,15 @@ jobs:
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          os: ${{ job.os }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin'
          testExecutable: './origami-tests'
          testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
      - script: |
          set -e
          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH

          echo "--- Running origami_test.py ---"
@@ -83,7 +83,7 @@ jobs:
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
         parameters:
           aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rdc
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -33,6 +52,7 @@ parameters:
     - clr
+    - hipBLAS-common
     - hipBLASLt
     - hipRAND
     - llvm-project
     - rocBLAS
     - rocm-cmake
@@ -43,6 +63,7 @@ parameters:
    - rocprofiler
    - rocprofiler-register
    - rocprofiler-sdk
    - rocRAND
    - ROCR-Runtime
- name: rocmTestDependencies
  type: object
@@ -74,7 +95,11 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rdc_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+      - ${{ each build in parameters.buildDependsOn }}:
+        - ${{ build }}_${{ job.target }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -85,16 +110,22 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
       parameters:
         aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+      parameters:
+        cmakeVersion: '3.25.0'
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         gpuTarget: ${{ job.target }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     # Build grpc
     - task: Bash@3
       displayName: 'git clone grpc'
@@ -104,6 +135,7 @@ jobs:
           workingDirectory: $(Build.SourcesDirectory)
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
         cmakeSourceDir: $(Build.SourcesDirectory)/grpc
         installDir: $(Build.SourcesDirectory)/bin
@@ -117,6 +149,7 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         extraBuildFlags: >-
           -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
           -DGRPC_ROOT="$(Build.SourcesDirectory)/bin"
@@ -126,9 +159,12 @@ jobs:
           -DAMDGPU_TARGETS=${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -136,60 +172,64 @@ jobs:
           aptPackages: ${{ parameters.aptPackages }}
           gpuTarget: ${{ job.target }}

-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rdc_test_${{ job.target }}
-    dependsOn: rdc_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    - name: ROCM_PATH
-      value: $(Agent.BuildDirectory)/rocm
-    - name: ROCM_DIR
-      value: $(Agent.BuildDirectory)/rocm
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - task: Bash@3
-      displayName: Setup test environment
-      inputs:
-        targetType: inline
-        script: |
-          sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
-          echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
-          sudo ldconfig -v
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - task: Bash@3
-      displayName: Test rdc
-      inputs:
-        targetType: inline
-        script: >-
-          $(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
-          --batch_mode
-          --start_rdcd
-          --unauth_comm
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-        extraPaths: /home/user/workspace/rocm/bin
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      - name: ROCM_PATH
+        value: $(Agent.BuildDirectory)/rocm
+      - name: ROCM_DIR
+        value: $(Agent.BuildDirectory)/rocm
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: Bash@3
+        displayName: Setup test environment
+        inputs:
+          targetType: inline
+          script: |
+            sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
+            echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
+            sudo ldconfig -v
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - task: Bash@3
+        displayName: Test rdc
+        inputs:
+          targetType: inline
+          script: >-
+            $(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
+            --batch_mode
+            --start_rdcd
+            --unauth_comm
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+          extraPaths: /home/user/workspace/rocm/bin
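The effect of the new unifiedBuild guard is easiest to see from the caller's side. A sketch, assuming rdc.yml is consumed like the other component templates in this compare (the parameter values shown are illustrative, not from the diff):

# Hypothetical monorepo-style caller: with unifiedBuild set to true,
# the guarded test jobs above are skipped and only build jobs are emitted.
jobs:
- template: /.azuredevops/components/rdc.yml
  parameters:
    unifiedBuild: true
    triggerDownstreamJobs: true
    downstreamAggregateNames: 'rocm'   # illustrative value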
@@ -70,6 +70,7 @@ parameters:
+    - hipBLAS-common
     - hipBLASLt
     - llvm-project
     - rocm-cmake
     - rocminfo
     - rocprofiler-register
     - rocm_smi_lib
@@ -154,7 +155,7 @@ jobs:
           aptPackages: ${{ parameters.aptPackages }}
           pipModules: ${{ parameters.pipModules }}
           packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
@@ -33,6 +33,7 @@ parameters:
     - hipRAND
     - hipSOLVER
     - hipSPARSE
+    - hipTensor
     - llvm-project
     - rocBLAS
     - rocFFT
@@ -43,6 +44,7 @@ parameters:
     - rocSOLVER
     - rocSPARSE
     - rocThrust
+    - rocWMMA
 - name: rocmTestDependencies
   type: object
   default:
@@ -57,6 +59,7 @@ parameters:
     - hipRAND
     - hipSOLVER
     - hipSPARSE
+    - hipTensor
     - llvm-project
     - rocBLAS
     - rocFFT
@@ -69,6 +72,7 @@ parameters:
     - rocSPARSE
     - rocThrust
     - roctracer
+    - rocWMMA

 - name: jobMatrix
   type: object
@@ -97,6 +101,9 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
       parameters:
         aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+      parameters:
+        cmakeVersion: '3.25.0'
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
@@ -158,6 +165,9 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
       parameters:
         aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+      parameters:
+        cmakeVersion: '3.25.0'
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
@@ -102,7 +102,7 @@ jobs:
     workspace:
       clean: all
     steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
       parameters:
         aptPackages: ${{ parameters.aptPackages }}
@@ -213,6 +213,7 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
       parameters:
         componentName: ${{ parameters.componentName }}
+        testDir: $(Agent.BuildDirectory)/s/build
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
       parameters:
         aptPackages: ${{ parameters.aptPackages }}
@@ -1,10 +1,15 @@
+parameters:
+- name: cmakeVersion
+  type: string
+  default: '3.31.0'
+
 steps:
 - task: Bash@3
-  displayName: Install CMake 3.31
+  displayName: Install CMake ${{ parameters.cmakeVersion }}
   inputs:
     targetType: inline
     script: |
-      CMAKE_VERSION=3.31.0
+      CMAKE_VERSION=${{ parameters.cmakeVersion }}
       CMAKE_ROOT="$(Pipeline.Workspace)/cmake"

       echo "Downloading CMake $CMAKE_VERSION..."
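With the hard-coded version removed, callers keep the 3.31.0 default or pin another release through the new parameter, which is exactly what the rdc and example hunks elsewhere in this compare do:

# Default: installs CMake 3.31.0.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml

# Pinned: installs CMake 3.25.0, as the rdc jobs above request.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
  parameters:
    cmakeVersion: '3.25.0'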
@@ -46,6 +46,10 @@ parameters:
     pipelineId: 115
     developBranch: aomp-dev
     hasGpuTarget: false
+  aqlprofile:
+    pipelineId: 365
+    developBranch: develop
+    hasGpuTarget: false
   clr:
     pipelineId: 335
     developBranch: develop
@@ -126,13 +130,17 @@ parameters:
     pipelineId: 80
     developBranch: develop
     hasGpuTarget: true
+  origami:
+    pipelineId: 364
+    developBranch: develop
+    hasGpuTarget: true
   rccl:
     pipelineId: 107
     developBranch: develop
     hasGpuTarget: true
   rdc:
-    pipelineId: 100
-    developBranch: amd-staging
+    pipelineId: 360
+    developBranch: develop
     hasGpuTarget: false
   rocAL:
     pipelineId: 151
@@ -43,6 +43,7 @@ Blit
Blockwise
Bluefield
Bootloader
Broadcom
CAS
CCD
CDNA
28  RELEASE.md
@@ -152,7 +152,7 @@ The release notes provide a summary of notable changes since the previous ROCm r
 - [ROCm upcoming changes](#rocm-upcoming-changes)

 ```{note}
-If you’re using AMD Radeon™ PRO or Radeon GPUs in a workstation setting with a display connected, see the [Use ROCm on Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility/native_linux/native_linux_compatibility.html)
+If you’re using AMD Radeon GPUs or Ryzen APUs in a workstation setting with a display connected, see the [Use ROCm on Radeon and Ryzen](https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/index.html)
 documentation to verify compatibility and system requirements.
 ```
@@ -249,7 +249,7 @@ AMD ROCm has officially added support for the following Deep learning and AI fra

 #### AMD GPU Driver/ROCm packaging separation

-The AMD GPU Driver (amdgpu) is now distributed separately from the ROCm software stack and is stored under in its own location ``/amdgpu/`` in the package repository at [repo.radeon.com](https://repo.radeon.com/amdgpu/). The first release is designated as AMD GPU Driver (amdgpu) version 30.10. See the [User and kernel-space support matrix](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html) for more information.
+The AMD GPU Driver (amdgpu) is now distributed separately from the ROCm software stack and is stored under in its own location ``/amdgpu/`` in the package repository at [repo.radeon.com](https://repo.radeon.com/amdgpu/). The first release is designated as [AMD GPU Driver (amdgpu) version 30.10](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/documentation/change-logs/30.10.1.html#amd-gpu-driver-amdgpu-30-10-release-notes). See the [User and kernel-space support matrix](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html) for more information.

 [AMD SMI](https://github.com/ROCm/amdsmi) continues to stay with the ROCm software stack under the ROCm organization repository.
@@ -347,7 +347,7 @@ For more information about hipBLASLt changes, see the [hipBLASLt changelog](#hip

 For more information about MIGraphX changes, see the [MIGraphX changelog](migraphx-2-13-0) below.

-##### rocSHMEM Reverse Offload conduit inter-node support
+##### rocSHMEM supports Reverse Offload inter-node communication backend

 The rocSHMEM communications library has added the RO (Reverse Offload) inter-node communication backend which enables communication between GPUs on different nodes through a NIC, using a host-based CPU proxy to forward communication orders to and from the GPU. Inter-node communication requires MPI, and is tested with Open MPI and CX7 IB NICs. For more information, see [available network backends](https://rocm.docs.amd.com/projects/rocSHMEM/en/docs-7.0.0/install.html#available-network-backends) for installing rocSHMEM.
@@ -405,7 +405,7 @@ See the [ROCm Validation Suite changelog](#rocm-validation-suite-1-2-0) for more

 ##### ROCprofiler-SDK

-###### Core SDK enhancements
+###### SDK enhancements

 * ROCprofiler-SDK is now compatible with the HIP 7.0.0 API.
 * ROCprofiler-SDK adds support for AMD Instinct MI350X and MI355X GPUs.
@@ -417,9 +417,7 @@ which facilitates profiling wavefronts at the instruction timing level.

 ###### rocpd

 The ROCm Profiling Data (``rocpd``) is now the default output format for ``rocprofv3``.
-A subproject of the ROCprofiler-SDK, ``rocpd`` enables saving profiling results to a SQLite3 database, providing a structured and
-efficient foundation for analysis and post-processing.
+As a subcomponent of the ROCprofiler-SDK, ``rocpd`` enables storing the profiling results in a SQLite3 database, providing a structured and efficient foundation for analysis and post-processing. For details, see [Using rocpd Output Format](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/docs-7.0.1/how-to/using-rocpd-output-format.html#using-rocpd-output-format).

 ###### rocprofv3 CLI tool enhancements
@@ -582,7 +580,7 @@ from ROCm. See [AMD GPU Driver/ROCm packaging separation](#amd-gpu-driver-rocm-p
       <td rowspan="9" style="vertical-align: middle;">ROCm 7.0.0</td>
       <td>MI355X</td>
       <td>
-        01.25.13.04 (or later)<br>
+        01.25.13.09 (or later)<br>
         01.25.11.02
       </td>
       <td>30.10</td>
@@ -591,7 +589,7 @@ from ROCm. See [AMD GPU Driver/ROCm packaging separation](#amd-gpu-driver-rocm-p
     <tr>
       <td>MI350X</td>
       <td>
-        01.25.13.04 (or later)<br>
+        01.25.13.09 (or later)<br>
         01.25.11.02
       </td>
       <td>30.10</td>
@@ -599,7 +597,7 @@ from ROCm. See [AMD GPU Driver/ROCm packaging separation](#amd-gpu-driver-rocm-p
     <tr>
       <td>MI325X</td>
       <td>
-        01.25.04.00 (or later)<br>
+        01.25.04.02 (or later)<br>
         01.25.03.03
       </td>
       <td>
@@ -651,11 +649,11 @@ from ROCm. See [AMD GPU Driver/ROCm packaging separation](#amd-gpu-driver-rocm-p

 New APIs introduced in AMD SMI for ROCm 7.0.0 provide additional data for the AMD Instinct products. To support these features, the following firmware for each GPUs are required:

-* AMD Instinct MI355X - PLDM bundle 01.25.13.04
+* AMD Instinct MI355X - PLDM bundle 01.25.13.09

-* AMD Instinct MI350X - PLDM bundle 01.25.13.04
+* AMD Instinct MI350X - PLDM bundle 01.25.13.09

-* AMD Instinct MI325X - PLDM bundle 01.25.04.00
+* AMD Instinct MI325X - PLDM bundle 01.25.04.02

 * AMD Instinct MI300X - PLDM bundle 01.25.03.12
@@ -663,7 +661,7 @@ If ROCm 7.0.0 is applied on system with prior version of PLDM bundles (firmware)

 ##### Enhanced temperature telemetry introduced in AMD SMI for MI355X and MI350X GPUs

-AMD SMI in ROCm 7.0.0 provides support for enhanced temperature metrics and temperature anomaly detection for AMD Instinct MI350X and MI355X GPUs when paired with: PLDM bundle 01.25.13.04.
+AMD SMI in ROCm 7.0.0 provides support for enhanced temperature metrics and temperature anomaly detection for AMD Instinct MI350X and MI355X GPUs when paired with: PLDM bundle 01.25.13.09.

 For more information on these features, see [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.0/CHANGELOG.md).
@@ -673,7 +671,7 @@ KVM SR-IOV support for all Instinct GPUs require the open source AMD GPU Virtual

 ##### GPU partitioning support for AMD Instinct MI355X and MI350X GPUs

-NPS2 and DPX partitioning on bare metal is enabled on AMD Instinct MI355X and MI350X GPUs on ROCm 7.0.0 when paired with: PLDM bundle 01.25.13.04.
+NPS2 and DPX partitioning on bare metal is enabled on AMD Instinct MI355X and MI350X GPUs on ROCm 7.0.0 when paired with: PLDM bundle 01.25.13.09.

 ### ROCm components
@@ -96,7 +96,7 @@ ROCm Version,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6
 ,,,,,,,,,,,,,,,,,,,
 SUPPORT LIBS,,,,,,,,,,,,,,,,,,,
 `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
+`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
 `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
 ,,,,,,,,,,,,,,,,,,,
 SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
@@ -11,9 +11,8 @@ Use this matrix to view the ROCm compatibility and system requirements across su
 You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-rocm-compatibility-matrix>`.

 Accelerators and GPUs listed in the following table support compute workloads (no display
-information or graphics). If you’re using ROCm with AMD Radeon or Radeon Pro GPUs for graphics
-workloads, see the `Use ROCm on Radeon GPU documentation
-<https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility.html>`_ to verify
+information or graphics). If you’re using ROCm with AMD Radeon GPUs or Ryzen APUs for graphics
+workloads, see the :docs:`Use ROCm on Radeon and Ryzen <radeon:index.html>` to verify
 compatibility and system requirements.

 .. |br| raw:: html
@@ -115,7 +114,7 @@ compatibility and system requirements.
 ,,,
 SUPPORT LIBS,,,
 `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,6.4.43483,6.3.42131
-`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.0,6.4.3,6.3.0
+`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.1/7.0.0,6.4.3,6.3.0
 `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
 ,,,
 SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
@@ -90,75 +90,15 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
 Docker image compatibility
 ================================================================================

-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
-with ROCm backends on Docker Hub. The following Docker image tags and
-associated inventories represent the latest JAX version from the official Docker Hub and are validated for
-`ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`_. Click the |docker-icon|
-icon to view the image on Docker Hub.
-
-.. list-table:: JAX Docker image components
-   :header-rows: 1
-
-   * - Docker image
-     - JAX
-     - Linux
-     - Python
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.12/images/sha256-8918fa806a172c1a10eb2f57131eb31b5d7c8fa1656b8729fe7d3d736112de83"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
-
-     - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
-     - Ubuntu 24.04
-     - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.10/images/sha256-a394be13c67b7fc602216abee51233afd4b6cb7adaa57ca97e688fba82f9ad79"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
-
-     - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
-     - Ubuntu 22.04
-     - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-
-AMD publishes `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
-with ROCm backends on Docker Hub. The following Docker image tags and
-associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_.
-
-.. list-table:: JAX community Docker image components
-   :header-rows: 1
-
-   * - Docker image
-     - JAX
-     - Linux
-     - Python
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.12.8/images/sha256-25dfaa0183e274bd0a3554a309af3249c6f16a1793226cb5373f418e39d3146a"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
-
-     - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
-     - Ubuntu 22.04
-     - `3.12.8 <https://www.python.org/downloads/release/python-3128/>`_
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.11.11/images/sha256-ff9baeca9067d13e6c279c911e5a9e5beed0817d24fafd424367cc3d5bd381d7"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
-
-     - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
-     - Ubuntu 22.04
-     - `3.11.11 <https://www.python.org/downloads/release/python-31111/>`_
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.10.16/images/sha256-8bab484be1713655f74da51a191ed824bb9d03db1104fd63530a1ac3c37cf7b1"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
-
-     - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
-     - Ubuntu 22.04
-     - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+AMD provides preconfigured Docker images with JAX and the ROCm backend.
+These images are published on `Docker Hub <https://hub.docker.com/r/rocm/jax>`__ and are the
+recommended way to get started with deep learning with JAX on ROCm.
+For ``jax-community`` images, see `rocm/jax-community
+<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
+To find the right image tag, see the :ref:`JAX on ROCm installation
+documentation <rocm-install-on-linux:jax-docker-support>` for a list of
+available ``rocm/jax`` images.

 .. _key_rocm_libraries:
@@ -89,141 +89,13 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
 Docker image compatibility
 ================================================================================

-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`__
-with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories were tested on `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__.
-Click |docker-icon| to view the image on Docker Hub.
-
-.. list-table:: PyTorch Docker image components
-   :header-rows: 1
-   :class: docker-image-compatibility
-
-   * - Docker
-     - PyTorch
-     - Ubuntu
-     - Python
-     - Apex
-     - torchvision
-     - TensorBoard
-     - MAGMA
-     - UCX
-     - OMPI
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-6a287591500b4048a9556c1ecc92bc411fd3d552f6c8233bc399f18eb803e8d6"><i class="fab fa-docker fa-lg"></i></a>
-
-     - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
-     - 24.04
-     - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
-     - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
-     - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
-     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
-     - `master <https://bitbucket.org/icl/magma/src/master/>`__
-     - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
-     - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-06b967629ba6657709f04169832cd769a11e6b491e8b1394c361d42d7a0c8b43"><i class="fab fa-docker fa-lg"></i></a>
-
-     - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
-     - 22.04
-     - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
-     - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
-     - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
-     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
-     - `master <https://bitbucket.org/icl/magma/src/master/>`__
-     - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
-     - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-62022414217ef6de33ac5b1341e57db8a48e8573fa2ace12d48aa5edd4b99ef0"><i class="fab fa-docker fa-lg"></i></a>
-
-     - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
-     - 24.04
-     - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
-     - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
-     - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
-     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
-     - `master <https://bitbucket.org/icl/magma/src/master/>`__
-     - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.10.0>`__
-     - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.11_pytorch_release_2.5.1/images/sha256-469a7f74fc149aff31797e011ee41978f6a190adc69fa423b3c6a718a77bd985"><i class="fab fa-docker fa-lg"></i></a>
-
-     - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
-     - 22.04
-     - `3.11 <https://www.python.org/downloads/release/python-31113/>`__
-     - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
-     - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
-     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
-     - `master <https://bitbucket.org/icl/magma/src/master/>`__
-     - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
-     - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-37f41a1cd94019688669a1b20d33ea74156e0c129ef6b8270076ef214a6a1a2c"><i class="fab fa-docker fa-lg"></i></a>
-
-     - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
-     - 22.04
-     - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
-     - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
-     - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
-     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
-     - `master <https://bitbucket.org/icl/magma/src/master/>`__
-     - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
-     - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-60824ba83dc1b9d94164925af1f81c0235c105dd555091ec04c57e05177ead1b"><i class="fab fa-docker fa-lg"></i></a>
-
-     - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
-     - 24.04
-     - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
-     - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
-     - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
-     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
-     - `master <https://bitbucket.org/icl/magma/src/master/>`__
-     - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
-     - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-fe944fe083312f901be6891ab4d3ffebf2eaf2cf4f5f0f435ef0b76ec714fabd"><i class="fab fa-docker fa-lg"></i></a>
-
-     - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
-     - 22.04
-     - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
-     - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
-     - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
-     - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
-     - `master <https://bitbucket.org/icl/magma/src/master/>`__
-     - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
-     - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
-
-   * - .. raw:: html
-
-          <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-1d59251c47170c5b8960d1172a4dbe52f5793d8966edd778f168eaf32d56661a"><i class="fab fa-docker fa-lg"></i></a>
-
-     - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
-     - 24.04
-     - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
-     - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`__
-     - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`__
-     - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`__
-     - `master <https://bitbucket.org/icl/magma/src/master/>`__
-     - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
-     - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+AMD provides preconfigured Docker images with PyTorch and the ROCm backend.
+These images are published on `Docker Hub <https://hub.docker.com/r/rocm/pytorch>`__ and are the
+recommended way to get started with deep learning with PyTorch on ROCm.
+To find the right image tag, see the :ref:`PyTorch on ROCm installation
+documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
+available ``rocm/pytorch`` images.

 Key ROCm libraries for PyTorch
 ================================================================================
@@ -466,7 +338,7 @@ with ROCm.
    * - Library
      - Description

-   * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
+   * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
      - Audio and signal processing library for PyTorch. Provides utilities for
        audio I/O, signal and data processing functions, datasets, model
        implementations, and application components for audio and speech
@@ -493,11 +365,11 @@ with ROCm.
        and popular datasets for natural language processing, including
        tokenization, vocabulary management, and text embeddings.

-       **Note:** ``torchtext`` does not implement ROCm-specific kernels.
+       **Note:** ``torchtext`` does not implement ROCm-specific kernels.
        ROCm acceleration is provided through the underlying PyTorch framework
        and ROCm library integration. Only official release exists.

-   * - `torchdata <https://docs.pytorch.org/data/beta/index.html>`_
+   * - `torchdata <https://meta-pytorch.org/data/beta/index.html#torchdata>`_
      - Beta library of common modular data loading primitives for easily
        constructing flexible and performant data pipelines, with features still
        in prototype stage.
@@ -599,7 +471,7 @@ Known issues and notes for PyTorch 2.7 with ROCm 7.0
 ================================================================================

 - The ``matmul.allow_fp16_reduced_precision_reduction`` and
-  ``matmul.allow_bf16_reduced_precision_reduction`` options under
-  ``torch.backends.cuda`` are not supported. As a result,
+  ``matmul.allow_bf16_reduced_precision_reduction`` options under
+  ``torch.backends.cuda`` are not supported. As a result,
   reduced-precision reductions using FP16 or BF16 accumulation types are not
   available.
@@ -47,80 +47,15 @@ fixes, updates, and support for the latest ROCM versions.
|
||||
.. _tensorflow-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
===============================================================================
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
AMD provides preconfigured Docker images with TensorFlow and the ROCm backend.
|
||||
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/tensorflow>`__ and are the
|
||||
recommended way to get started with deep learning with TensorFlow on ROCm.
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `TensorFlow images
|
||||
<https://hub.docker.com/r/rocm/tensorflow>`__ with ROCm backends on
|
||||
Docker Hub. The following Docker image tags and associated inventories are
|
||||
validated for `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__. Click
|
||||
the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: TensorFlow Docker image components
   :header-rows: 1

   * - Docker image
     - TensorFlow
     - Ubuntu
     - Python
     - TensorBoard

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.18-dev/images/sha256-96754ce2d30f729e19b497279915b5212ba33d5e408e7e5dd3f2304d87e3441e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

     - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/>`__
     - 24.04
     - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
     - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.18-dev/images/sha256-fa741508d383858e86985a9efac85174529127408102558ae2e3a4ac894eea1e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

     - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/>`__
     - 22.04
     - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
     - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.17-dev/images/sha256-3a0aef09f2a8833c2b64b85874dd9449ffc2ad257351857338ff5b706c03a418"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

     - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/>`__
     - 24.04
     - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
     - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.17-dev/images/sha256-bc7341a41ebe7ab261aa100732874507c452421ef733e408ac4f05ed453b0bc5"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

     - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/>`__
     - 22.04
     - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
     - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.16-dev/images/sha256-4841a8df7c340dab79bf9362dad687797649a00d594e0832eb83ea6880a40d3b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

     - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/>`__
     - 24.04
     - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
     - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.16-dev/images/sha256-883fa95aba960c58a3e46fceaa18f03ede2c7df89b8e9fd603ab2d47e0852897"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>

     - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/>`__
     - 22.04
     - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
     - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__

To find the right image tag, see the :ref:`TensorFlow on ROCm installation
documentation <rocm-install-on-linux:tensorflow-docker-support>` for a list of
available ``rocm/tensorflow`` images.
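
For example, to pull one of the validated images listed above and start an
interactive container (a minimal sketch -- the mount path is a placeholder,
and the device and group flags are the usual ROCm Docker settings, which may
need adjusting for your system):

.. code-block:: shell

   # Pull a validated TensorFlow image (tag taken from the table above)
   docker pull rocm/tensorflow:rocm6.4.2-py3.12-tf2.18-dev

   # Run it with access to the GPU device nodes
   docker run -it --rm \
       --device=/dev/kfd --device=/dev/dri \
       --group-add video --ipc=host \
       -v $HOME/workspace:/workspace \
       rocm/tensorflow:rocm6.4.2-py3.12-tf2.18-dev
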
Critical ROCm libraries for TensorFlow

@@ -114,7 +114,10 @@ article_pages = [

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/system-setup/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/system-setup/multi-node-setup", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/system-setup/prerequisite-system-validation", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/system-setup/system-health-check", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
@@ -127,7 +130,9 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},

@@ -1,12 +1,4 @@
dockers:
  - pull_tag: rocm/jax-training:maxtext-v25.7
    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
    components:
      ROCm: 6.4.1
      JAX: 0.5.0
      Python: 3.10.12
      Transformer Engine: 2.1.0+90d703dd
      hipBLASLt: 1.x.x
  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
    components:
@@ -15,6 +7,14 @@ dockers:
      Python: 3.10.12
      Transformer Engine: 2.1.0+90d703dd
      hipBLASLt: 1.1.0-499ece1c21
  - pull_tag: rocm/jax-training:maxtext-v25.7
    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
    components:
      ROCm: 6.4.1
      JAX: 0.5.0
      Python: 3.10.12
      Transformer Engine: 2.1.0+90d703dd
      hipBLASLt: 1.x.x
model_groups:
  - group: Meta Llama
    tag: llama

@@ -1,13 +1,12 @@
dockers:
  - pull_tag: rocm/megatron-lm:v25.7_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
  - pull_tag: rocm/megatron-lm:v25.8_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
    components:
      ROCm: 6.4.2
      Primus: v0.1.0-rc1
      ROCm: 6.4.3
      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
      Transformer Engine: 2.1.0.dev0+ba586519
      hipBLASLt: 37ba1d36
      Transformer Engine: 2.2.0.dev0+54dd2bdc
      hipBLASLt: d1b517fc7a
      Triton: 3.3.0
      RCCL: 2.22.3
model_groups:

@@ -0,0 +1,49 @@
dockers:
  - pull_tag: rocm/megatron-lm:v25.7_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
    components:
      ROCm: 6.4.2
      Primus: v0.1.0-rc1
      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
      Transformer Engine: 2.1.0.dev0+ba586519
      hipBLASLt: 37ba1d36
      Triton: 3.3.0
      RCCL: 2.22.3
model_groups:
  - group: Meta Llama
    tag: llama
    models:
      - model: Llama 3.3 70B
        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
      - model: Llama 3.1 8B
        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
      - model: Llama 3.1 70B
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
      - model: Llama 3.1 70B (proxy)
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
      - model: Llama 2 7B
        mad_tag: pyt_megatron_lm_train_llama-2-7b
      - model: Llama 2 70B
        mad_tag: pyt_megatron_lm_train_llama-2-70b
  - group: DeepSeek
    tag: deepseek
    models:
      - model: DeepSeek-V3 (proxy)
        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
      - model: DeepSeek-V2-Lite
        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
  - group: Mistral AI
    tag: mistral
    models:
      - model: Mixtral 8x7B
        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
      - model: Mixtral 8x22B (proxy)
        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
  - group: Qwen
    tag: qwen
    models:
      - model: Qwen 2.5 7B
        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
      - model: Qwen 2.5 72B
        mad_tag: pyt_megatron_lm_train_qwen2.5-72b

@@ -0,0 +1,58 @@
dockers:
  - pull_tag: rocm/megatron-lm:v25.7_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
    components:
      ROCm: 6.4.2
      Primus: v0.1.0-rc1
      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
      Transformer Engine: 2.1.0.dev0+ba586519
      hipBLASLt: 37ba1d36
      Triton: 3.3.0
      RCCL: 2.22.3
model_groups:
  - group: Meta Llama
    tag: llama
    models:
      - model: Llama 3.3 70B
        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
        config_name: llama3.3_70B-pretrain.yaml
      - model: Llama 3.1 70B
        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
        config_name: llama3.1_70B-pretrain.yaml
      - model: Llama 3.1 8B
        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
        config_name: llama3.1_8B-pretrain.yaml
      - model: Llama 2 7B
        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
        config_name: llama2_7B-pretrain.yaml
      - model: Llama 2 70B
        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
        config_name: llama2_70B-pretrain.yaml
  - group: DeepSeek
    tag: deepseek
    models:
      - model: DeepSeek-V3 (proxy)
        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
        config_name: deepseek_v3-pretrain.yaml
      - model: DeepSeek-V2-Lite
        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
        config_name: deepseek_v2_lite-pretrain.yaml
  - group: Mistral AI
    tag: mistral
    models:
      - model: Mixtral 8x7B
        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
        config_name: mixtral_8x7B_v0.1-pretrain.yaml
      - model: Mixtral 8x22B (proxy)
        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
        config_name: mixtral_8x22B_v0.1-pretrain.yaml
  - group: Qwen
    tag: qwen
    models:
      - model: Qwen 2.5 7B
        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
        config_name: primus_qwen2.5_7B-pretrain.yaml
      - model: Qwen 2.5 72B
        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
        config_name: qwen2.5_72B-pretrain.yaml

@@ -1,13 +1,13 @@
dockers:
  - pull_tag: rocm/megatron-lm:v25.7_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
  - pull_tag: rocm/megatron-lm:v25.8_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
    components:
      ROCm: 6.4.2
      Primus: v0.1.0-rc1
      ROCm: 6.4.3
      Primus: 927a717
      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
      Transformer Engine: 2.1.0.dev0+ba586519
      hipBLASLt: 37ba1d36
      Transformer Engine: 2.2.0.dev0+54dd2bdc
      hipBLASLt: d1b517fc7a
      Triton: 3.3.0
      RCCL: 2.22.3
model_groups:

@@ -120,7 +120,7 @@ vLLM inference performance testing
==================================

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
see the developer's guide at `<https://github.com/ROCm/vllm/blob/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker/README.md>`__.

System validation
=================

@@ -16,7 +16,7 @@ PyTorch inference performance testing

The `ROCm PyTorch Docker <https://hub.docker.com/r/rocm/pytorch/tags>`_ image offers a prebuilt,
optimized environment for testing model inference performance on AMD Instinct™ MI300X series
accelerators. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD)
GPUs. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD)
tool with the ROCm PyTorch container to test inference performance on various models efficiently.

.. _pytorch-inference-benchmark-available-models:

@@ -175,7 +175,7 @@ Further reading

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../../inference-optimization/workload`.

@@ -23,7 +23,7 @@ improved efficiency and throughput.

serving engine for large language models (LLMs) and vision models. The
ROCm-enabled `SGLang base Docker image <{{ docker.docker_hub_url }}>`__
bundles SGLang with PyTorch, which is optimized for AMD Instinct MI300X series
accelerators. It includes the following software components:
GPUs. It includes the following software components:

.. list-table::
   :header-rows: 1

@@ -37,7 +37,7 @@ improved efficiency and throughput.
{% endfor %}

The following guide covers setting up and running SGLang and Mooncake for disaggregated
distributed inference on a Slurm cluster using AMD Instinct MI300X series accelerators backed by
distributed inference on a Slurm cluster using AMD Instinct MI300X series GPUs backed by
Mellanox CX-7 NICs.

Prerequisites

@@ -111,7 +111,7 @@ Build the Docker image
----------------------

Get the Dockerfile located in
`<https://github.com/ROCm/MAD/blob/develop/docker/sglang_dissag_inference.ubuntu.amd.Dockerfile>`__.
`<https://github.com/ROCm/MAD/blob/develop/docker/sglang_disagg_inference.ubuntu.amd.Dockerfile>`__.
It uses `lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
<https://hub.docker.com/layers/lmsysorg/sglang/v0.4.9.post1-rocm630/images/sha256-2f6b1748e4bcc70717875a7da76c87795fd8aa46a9646e08d38aa7232fc78538>`__
as the base Docker image and installs the necessary components for Mooncake, etcd, and Mellanox network

@@ -128,7 +128,7 @@ drivers.

Benchmarking
============

The `<https://github.com/ROCm/MAD/tree/develop/scripts/sglang_dissag>`__
The `<https://github.com/ROCm/MAD/tree/develop/scripts/sglang_disagg>`__
repository contains scripts to launch SGLang inference with prefill/decode
disaggregation via Mooncake for supported models.

@@ -236,7 +236,7 @@ Further reading

- See the base upstream Docker image on `Docker Hub <https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729>`__.

- To learn more about system settings and management practices to configure your system for
  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.
  MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.

- For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

@@ -14,9 +14,9 @@ vLLM inference performance testing

The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
inference performance on AMD Instinct™ MI300X series GPUs. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
GPUs and includes the following components:

.. list-table::
   :header-rows: 1

@@ -31,7 +31,7 @@ vLLM inference performance testing

With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-909>` for
MI300X series accelerators.
MI300X series GPUs.

What's new
==========

@@ -101,7 +101,7 @@ Supported models

See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
{% endif %}

{% endfor %}

@@ -121,7 +121,7 @@ page provides reference throughput and serving measurements for inferencing popu

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.

System validation
=================

@@ -230,7 +230,7 @@ system's configuration.

.. seealso::

   For more information on configuration, see the `config files
   <https://github.com/ROCm/MAD-private/tree/develop/scripts/vllm/configs>`__
   <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
   in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
   for descriptions of available configuration options
   and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for

@@ -352,6 +352,9 @@ system's configuration.

.. note::

   For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
   try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.

   If you encounter the following error, pass your access-authorized Hugging
   Face token to the gated models.
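
   One way to do this (a sketch -- ``HF_TOKEN`` is the environment variable
   that Hugging Face libraries and vLLM read by default; your workflow may
   pass the token differently):

   .. code-block:: shell

      # Make your access-authorized token available to the benchmark run
      export HF_TOKEN=<your Hugging Face token>
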
@@ -420,7 +423,7 @@ Further reading

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
  a brief introduction to vLLM and optimization strategies.

@@ -47,7 +47,7 @@ Deep learning frameworks
========================

ROCm supports deep learning frameworks and libraries including `PyTorch
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
<https://pytorch.org>`_, `TensorFlow
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.

Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease-of-use, it's recommended to use official ROCm prebuilt Docker

@@ -57,4 +57,4 @@ Next steps
==========

After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks
to verify the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started.
to verify the optimal performance of your AMD hardware. See :doc:`system-setup/index` to get started.

40
docs/how-to/rocm-for-ai/system-setup/index.rst
Normal file
@@ -0,0 +1,40 @@
.. meta::
   :description: System setup and validation steps for AI training and inference on ROCm
   :keywords: AMD Instinct, ROCm, GPU, AI, training, inference, benchmarking, performance, validation

*************************************
System setup for AI workloads on ROCm
*************************************

Before you begin training or inference on AMD Instinct™ GPUs, complete
the following system setup and validation steps to ensure optimal performance.

Prerequisite system validation
==============================

First, confirm that your system meets all software and hardware prerequisites.
See :doc:`prerequisite-system-validation`.

Docker images for AMD Instinct GPUs
===================================

AMD provides prebuilt Docker images for AMD Instinct™ MI300X and MI325X
GPUs. These images include ROCm-enabled deep learning frameworks and
essential software components. They support single-node and multi-node configurations
and are ready for training and inference workloads out of the box.

Multi-node training
-------------------

For instructions on enabling multi-node training, see :doc:`multi-node-setup`.

System optimization and validation
==================================

Before running workloads, verify that the system is configured correctly and
operating at peak efficiency. Recommended steps include:

- Disabling NUMA auto-balancing (see the sketch after this list)
- Running system benchmarks to validate hardware performance
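
For example, NUMA auto-balancing can typically be disabled at runtime with the
following command (a minimal sketch; see the AMD Instinct system optimization
guide for the authoritative procedure):

.. code-block:: shell

   # Disable automatic NUMA balancing until the next reboot
   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
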
For details on running system health checks, see :doc:`system-health-check`.
320
docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst
Normal file
@@ -0,0 +1,320 @@
.. meta::
   :description: Multi-node setup for AI training
   :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training

.. _rocm-for-ai-multi-node-setup:

*********************************
Multi-node setup for AI workloads
*********************************

AMD provides ready-to-use Docker images for AMD Instinct™ MI300X and MI325X
GPUs containing ROCm-capable deep learning frameworks and essential
software components. These Docker images can run and leverage multiple nodes if
they are available. This page describes how to enable the multi-node training
of AI workloads on AMD Instinct GPUs.

Prerequisites
=============

Before starting, ensure your environment meets the following requirements:

* Multi-node networking: your cluster should have a configured multi-node network. For setup
  instructions, see the `Multi-node network configuration for AMD Instinct
  accelerators
  <https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html>`__
  guide in the Instinct documentation.

* ROCm Docker container to simplify environment setup for AI workloads. See the following resources to get started:

  * :doc:`Training a model with Megatron-LM and ROCm <../training/benchmark-docker/megatron-lm>`

  * :doc:`Training a model with PyTorch and ROCm <../training/benchmark-docker/pytorch-training>`

  * :doc:`Training a model with JAX MaxText and ROCm <../training/benchmark-docker/jax-maxtext>`

* Slurm workload manager to run the :ref:`provided examples <multi-node-setup-training-examples>`.

Install required packages
=========================

To run multi-node workloads, ensure you have all the required packages installed based on your
network device. For example, on Ubuntu systems:

.. code-block:: shell

   apt install -y iproute2

   apt install -y linux-headers-"$(uname -r)" libelf-dev

   apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev

Compile and install the RoCE library
------------------------------------

If you're using Broadcom NICs, you need to compile and install the RoCE (RDMA
over Converged Ethernet) library. See the `RoCE cluster network configuration guide
for AMD Instinct accelerators
<https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/roce-network-config.html#roce-cluster-network-configuration-guide-for-amd-instinct-accelerators>`__
for more information.

See the `Ethernet networking guide for AMD
Instinct MI300X GPU clusters: Compiling Broadcom NIC software from source
<https://docs.broadcom.com/doc/957608-AN2XX#page=81>`_ for more details.

.. important::

   It is crucial to install the exact same version of the RoCE library that
   is installed on your host system. Also, ensure that the path to these
   libraries on the host is correctly mounted into your Docker container.
   Failure to do so can lead to compatibility issues and communication
   failures.

1. Set ``BUILD_DIR`` to the path on the host system where the Broadcom drivers and ``bnxt_rocelib`` source are located.
   Then, navigate to the ``bnxt_rocelib`` directory.

   .. code-block:: shell

      export BUILD_DIR=/path/to/your/broadcom_drivers_on_host
      cd $BUILD_DIR/drivers_linux/bnxt_rocelib/

2. The ``bnxt_rocelib`` directory contains a version of ``libbnxt_re`` in a zipped ``.tar.gz`` file.
   Extract it and change into the extracted directory.

   .. code-block:: shell

      tar -xf libbnxt_re-a.b.c.d.tar.gz
      cd libbnxt_re-a.b.c.d

3. Compile and install the RoCE library.

   .. code-block:: shell

      sh autogen.sh
      ./configure
      make
      find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \;
      make install all
      sh -c "echo /usr/local/lib >> /etc/ld.so.conf"
      ldconfig
      cp -f bnxt_re.driver /etc/libibverbs.d/
      find . -name "*.so" -exec md5sum {} \;
      BUILT_MD5SUM=$(find . -name "libbnxt_re-rdmav*.so" -exec md5sum {} \; | cut -d " " -f 1)
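
To confirm the freshly built library is the one the system actually loads, you
can compare the recorded checksum against the installed copy (a hypothetical
check -- the install path assumes the ``/usr/local/lib`` prefix used above):

.. code-block:: shell

   # Compare the built library's md5sum with the installed one
   INSTALLED_MD5SUM=$(md5sum /usr/local/lib/libbnxt_re-rdmav*.so | cut -d " " -f 1)
   [ "$BUILT_MD5SUM" = "$INSTALLED_MD5SUM" ] && echo "RoCE library install verified" || echo "Checksum mismatch"
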
Environment setup
=================

Before running multi-node workloads, set these essential environment variables:

Master address
--------------

By default, ``localhost`` is used for single-node configurations. Change
``localhost`` to the master node's resolvable hostname or IP address:

.. code-block:: bash

   export MASTER_ADDR="${MASTER_ADDR:-localhost}"

Number of nodes
---------------

Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):

.. code-block:: bash

   export NNODES="${NNODES:-<num_nodes>}"

Node ranks
----------

Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
Node ranks should be unique across all nodes in the cluster.

.. code-block:: bash

   export NODE_RANK="${NODE_RANK:-<node_rank>}"

Network interface
-----------------

Update the network interface in the script to match your system's network interface. To
find your network interface, run the following (outside of any Docker container):

.. code-block:: bash

   ip a

Look for an active interface (status "UP") with an IP address in the same subnet as
your other nodes. Then, update the following variable in the script, for
example:

.. code-block:: bash

   export NCCL_SOCKET_IFNAME=ens50f0np0

This variable specifies which network interface to use for inter-node communication.
Setting this variable to the incorrect interface can result in communication failures
or significantly reduced performance.

.. tip::

   This command sets ``NCCL_SOCKET_IFNAME``'s value to the last RDMA interface.

   .. code-block:: bash

      export NCCL_SOCKET_IFNAME=$(rdma link show | awk '{print $NF}' | sort | tail -n1)

RDMA/IB interface
-----------------

Set the RDMA interfaces to be used for communication. NICs can come from
different vendors, so the names of the RDMA interfaces can differ. To get the
list of all the RDMA/IB devices, run:

.. code-block:: bash

   ibv_devices

If, for example, ``rdma0`` through ``rdma7`` are your RDMA interfaces, then set
them in a comma-separated format:

.. code-block:: bash

   # If using Broadcom NIC
   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
   # If using Mellanox NIC
   # export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9

.. tip::

   Alternatively, if you want to choose the RDMA interfaces automatically, you
   can use the following. This command will sort the RDMA interfaces and then
   select the first eight.

   .. code-block:: bash

      export NCCL_IB_HCA=$(ibv_devices | awk 'NR>2 {print $1}' | sort | head -n 8 | paste -sd,)

Global ID index
---------------

Update the global ID index if you're using RoCE.

.. code-block:: bash

   export NCCL_IB_GID_INDEX=3
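
Putting these together, a complete environment for the first worker node of a
hypothetical two-node Broadcom-based cluster might look like the following
sketch (the hostname and interface names are placeholders for your own values):

.. code-block:: bash

   export MASTER_ADDR=node-0                 # resolvable hostname of the master node
   export NNODES=2                           # total number of nodes in the job
   export NODE_RANK=1                        # this node's rank (0 on the master)
   export NCCL_SOCKET_IFNAME=ens50f0np0      # interface for inter-node communication
   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
   export NCCL_IB_GID_INDEX=3                # GID index for RoCE
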
.. _multi-node-setup-training-examples:

Multi-node training examples
============================

The following examples use the Slurm workload manager to launch jobs on
multiple nodes. To run these scripts as-is, you must have a Slurm environment
configured. The scripts are designed to work with both Broadcom Thor 2 and
Mellanox NICs by automatically installing the required libraries and setting
the necessary environment variables. For systems with Broadcom NICs, the
scripts assume the host's RoCE library is located in the ``/opt`` directory.

The following benchmarking examples demonstrate the training of a Llama 3 8B model
across multiple 8-GPU nodes, using FSDP for intra-node parallelism and DP for
inter-node parallelism.

.. _rocm-for-ai-multi-node-setup-jax-train-example:

JAX MaxText
-----------

1. Download the desired multi-node benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.

   .. code-block:: shell

      wget https://raw.githubusercontent.com/ROCm/MAD/refs/heads/develop/scripts/jax-maxtext/gpu-rocm/llama3_8b_multinode.sh

   Or clone the `<https://github.com/ROCm/MAD>`__ repository.

   .. code-block:: shell

      git clone https://github.com/ROCm/MAD
      cd MAD/scripts/jax-maxtext/gpu-rocm

2. Run the benchmark for multi-node training.

   .. code-block:: shell

      sbatch -N <num_nodes> llama3_8b_multinode.sh

.. _rocm-for-ai-multi-node-setup-pyt-train-example:

PyTorch training
----------------

.. note::

   The ROCm PyTorch Training Docker image now focuses on :doc:`Training a model
   with Primus and PyTorch <../training/benchmark-docker/primus-pytorch>`. The
   following example refers to the legacy workflow :ref:`Training a
   model with PyTorch <amd-pytorch-training-multinode-examples>`.

1. Download the ``run_multinode_train.sh`` benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/pytorch_train>`__.

   .. code-block:: shell

      wget https://raw.githubusercontent.com/ROCm/MAD/refs/heads/develop/scripts/pytorch_train/run_multinode_train.sh

   Or clone the `<https://github.com/ROCm/MAD>`__ repository.

   .. code-block:: shell

      git clone https://github.com/ROCm/MAD
      cd MAD/scripts/pytorch_train

2. Run the benchmark for multi-node training.

   .. code-block:: shell

      sbatch -N <num_nodes> run_multinode_train.sh

.. seealso::

   See :ref:`Training a model with PyTorch <amd-pytorch-multinode-examples>` for more examples and information.

Megatron-LM
-----------

.. note::

   The Megatron-LM Docker image now focuses on :ref:`Training a model with
   Primus and Megatron <amd-primus-megatron-multi-node-examples>`. The
   following example refers to the legacy Megatron-LM :ref:`Training a model
   with Megatron-LM <amd-megatron-lm-multi-node-examples>` and might have
   limited support.

1. Download the ``train_llama_slurm.sh`` benchmarking script from
   `<https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama_slurm.sh>`__.

2. Set the network interface parameters as per the above guidelines and run the script.

   .. code-block:: shell

      cd </path/to/your/Megatron-LM>
      export NETWORK_INTERFACE=$NCCL_SOCKET_IFNAME
      export NCCL_IB_HCA=$NCCL_IB_HCA
      export IMAGE=docker.io/rocm/megatron-lm:latest  # or your preferred image
      export DATA_CACHE_PATH=/nfs/mounted/repo

      sbatch -N <num_nodes> examples/llama/train_llama_slurm.sh <MODEL_SIZE> <MBS> <GBS> <SEQ_LENGTH> <FSDP> <RECOMPUTE>

3. For example, to run a Llama 3 8B workload in BF16 precision, use the following command.

   .. code-block:: shell

      MODEL_NAME=llama3 sbatch -N 8 examples/llama/train_llama_slurm.sh 8 2 128 8192 0 0
      # Other parameters, such as TP, FP8 datatype, can be adjusted in the script.

Further reading
===============

* `Multi-node network configuration for AMD Instinct accelerators <https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html>`__

* `Ethernet networking guide for AMD Instinct MI300X GPU clusters: Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#page=81>`__

@@ -1,5 +1,3 @@
:orphan:

.. meta::
   :description: Prerequisite system validation before using ROCm for AI.
   :keywords: ROCm, AI, LLM, train, megatron, Llama, tutorial, docker, torch, pytorch, jax

@@ -1,12 +1,14 @@
:orphan:

.. meta::
   :description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
   :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference

.. _rocm-for-ai-system-health-bench:

************************
System health benchmarks
************************
*****************************************
System health benchmarks for AI workloads
*****************************************

Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).

@@ -31,7 +33,7 @@ installed, run the following command:

   sudo apt install rocm-validation-suite

See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html>`_
in the Instinct documentation for more detailed instructions.

Benchmark, stress, and qualification tests
@@ -41,7 +43,7 @@ The GPU stress test runs various GEMM computations as workloads to stress the GP
meets the configured target GFLOPS.

Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#benchmark-stress-qualification>`_
section of the Instinct documentation for usage instructions.
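
For instance, once RVS is installed, a stress run can typically be launched by
pointing the ``rvs`` binary at one of its bundled configuration files (the
binary and configuration paths below are the usual package-install locations
and may differ on your system):

.. code-block:: shell

   # Run the GPU stress (GST) test with a bundled RVS configuration
   sudo /opt/rocm/bin/rvs -c /opt/rocm/share/rocm-validation-suite/conf/gst_single.conf
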
BabelStream test
@@ -53,7 +55,7 @@ BabelStream tests are included with the RVS package as part of the `BABEL module
<https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.

For more information, see `Performance benchmarking
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#babelstream>`_
in the Instinct documentation.

RCCL tests
@@ -62,7 +64,7 @@ RCCL tests

The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
the performance and verifies the correctness of these collective operations.
This helps ensure optimal scaling for multi-accelerator tasks.
This helps ensure optimal scaling for multi-GPU tasks.

1. To get started, build RCCL-tests using the official instructions in the README at
   `<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
@@ -75,8 +77,8 @@ This helps ensure optimal scaling for multi-accelerator tasks.
      make

2. Run the suggested RCCL tests -- see `RCCL benchmarking
   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
   in the Instinct performance benchmarking documentation for instructions.
   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/rdma-benchmarking.html#rccl-benchmarking-results>`_
   in the AMD Instinct customer acceptance guide.
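
As an illustration, a single-node all-reduce sweep across eight GPUs might look
like this (a sketch -- ``all_reduce_perf`` is one of the standard rccl-tests
binaries, and the flag values are example choices):

.. code-block:: shell

   # Sweep all-reduce message sizes from 8 bytes to 8 GB on 8 GPUs
   ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 8
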
TransferBench test
==================
@@ -10,10 +10,10 @@ MaxText is a high-performance, open-source framework built on the Google JAX
machine learning library to train LLMs at scale. The MaxText framework for
ROCm is an optimized fork of the upstream
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
on AMD MI300X series accelerators.
on AMD MI300X series GPUs.

The MaxText for ROCm training Docker image
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X GPUs,
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
It includes the following software components:

|
||||
``shardy=False`` during the training run. You can also follow the `migration
|
||||
guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
|
||||
it.
|
||||
|
||||
The provided multi-node training scripts in this documentation are
|
||||
not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
|
||||
Docker image.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
@@ -73,7 +69,7 @@ Supported models
================

The following models are pre-optimized for performance on AMD Instinct MI300
series accelerators. Some instructions, commands, and available training
series GPUs. Some instructions, commands, and available training
configurations in this documentation might vary by model -- select one to get
started.

@@ -138,85 +134,11 @@ doesn’t validate configurations and run conditions outside those described.

.. _amd-maxtext-multi-node-setup-v257:

Multi-node setup
----------------
Multi-node configuration
------------------------

For multi-node environments, ensure you have all the necessary packages for
your network device, such as RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.

1. Install the following packages to build and install the RDMA driver.

   .. code-block:: shell

      sudo apt install iproute2 -y
      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev

   Refer to your NIC manufacturer's documentation for further steps on
   compiling and installing the RoCE driver. For example, for Broadcom,
   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.

2. Set the following environment variables.

   a. Master address

      Change ``localhost`` to the master node's resolvable hostname or IP address:

      .. code-block:: bash

         export MASTER_ADDR="${MASTER_ADDR:-localhost}"

   b. Number of nodes

      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):

      .. code-block:: bash

         export NNODES="${NNODES:-1}"

   c. Node ranks

      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
      Node ranks should be unique across all nodes in the cluster.

      .. code-block:: bash

         export NODE_RANK="${NODE_RANK:-0}"

   d. Network interface

      Update the network interface in the script to match your system's network interface. To
      find your network interface, run the following (outside of any Docker container):

      .. code-block:: bash

         ip a

      Look for an active interface with an IP address in the same subnet as
      your other nodes. Then, update the following variable in the script, for
      example:

      .. code-block:: bash

         export NCCL_SOCKET_IFNAME=ens50f0np0

      This variable specifies which network interface to use for inter-node communication.
      Setting this variable to the incorrect interface can result in communication failures
      or significantly reduced performance.

   e. RDMA interface

      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
      Then, set the RDMA interfaces to use for communication.

      .. code-block:: bash

         # If using Broadcom NIC
         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
         # If using Mellanox NIC
         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
environment for multi-node training.

.. _amd-maxtext-get-started-v257:

@@ -361,12 +283,6 @@ benchmark results:

      ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8

.. important::

   Quantized training is not supported with the JAX 0.6.0 Docker image; support
   will be added in a future release. For quantized training, use the JAX 0.5.0
   Docker image: ``rocm/jax-training:maxtext-v25.7``.

{% endif %}
{% if model.multinode_training_script and "multi-node" in model.doc_options %}
.. rubric:: Multi-node training

@@ -379,11 +295,11 @@ benchmark results:
benchmark. Run them outside of any Docker container.

1. Make sure ``$HF_HOME`` is set before running the test. See
   `ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
   `ROCm benchmarking <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/readme.md>`__
   for more details on downloading the Llama models before running the
   benchmark.

2. To run multi-node training for {{ model.model }},
   use the
   `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
   under the ``scripts/jax-maxtext/gpu-rocm/`` directory.

@@ -409,7 +325,7 @@ Further reading

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

@@ -10,20 +10,20 @@ Training a model with Megatron-LM on ROCm

.. caution::

   Primus with Megatron supersedes this ROCm Megatron-LM training workflow.
   Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
   To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
   see :doc:`previous-versions/megatron-lm-primus-migration-guide`.

The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
a specialized fork of the robust Megatron-LM, designed to enable efficient
training of large-scale language models on AMD GPUs. By leveraging AMD
Instinct™ MI300X series accelerators, Megatron-LM delivers enhanced
Instinct™ MI300X series GPUs, Megatron-LM delivers enhanced
scalability, performance, and resource utilization for AI workloads. It is
purpose-built to support models like Llama, DeepSeek, and Mixtral,
enabling developers to train next-generation AI models more
efficiently.

AMD provides ready-to-use Docker images for MI300X series accelerators containing
AMD provides ready-to-use Docker images for MI300X series GPUs containing
essential components, including PyTorch, ROCm libraries, and Megatron-LM
utilities. These images contain the following software components to accelerate training
workloads:

@@ -61,7 +61,7 @@ workloads:
================

The following models are supported for training performance benchmarking with Megatron-LM and ROCm
on AMD Instinct MI300X series accelerators.
on AMD Instinct MI300X series GPUs.
Some instructions, commands, and training recommendations in this documentation might
vary by model -- select one to get started.

@@ -115,7 +115,7 @@ popular AI models.

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`__
only reflects the latest version of this training benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.

System validation
=================

@@ -138,11 +138,11 @@ Environment setup
=================

Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
reproduce the benchmark results on MI300X series GPUs with the AMD Megatron-LM Docker
image.

.. _amd-megatron-lm-requirements:

Download the Docker image
-------------------------

@@ -152,7 +152,7 @@ Download the Docker image
1. Use the following command to pull the Docker image from Docker Hub.

   {% if dockers|length > 1 %}
   .. tab-set::

   {% for docker in data.dockers %}
      .. tab-item:: {{ docker.doc_name }}

@@ -281,25 +281,11 @@ Configuration

See :ref:`Key options <amd-megatron-lm-benchmark-test-vars>` for more information on configuration options.

Network interface
-----------------
Multi-node configuration
------------------------

Update the network interface in the script to match your system's network interface. To
find your network interface, run the following (outside of any Docker container):

.. code-block:: bash

   ip a

Look for an active interface that has an IP address in the same subnet as
your other nodes. Then, update the following variables in the script, for
example:

.. code-block:: bash

   export NCCL_SOCKET_IFNAME=ens50f0np0
   export GLOO_SOCKET_IFNAME=ens50f0np0
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.

.. _amd-megatron-lm-tokenizer:

@@ -540,46 +526,6 @@ Download the dataset

Ensure that the files are accessible inside the Docker container.

Multi-node configuration
------------------------

If you're running multi-node training, update the following environment variables. They can
also be passed as command line arguments. Refer to the following example configurations.

* Change ``localhost`` to the master node's hostname:

  .. code-block:: shell

     MASTER_ADDR="${MASTER_ADDR:-localhost}"

* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):

  .. code-block:: shell

     NNODES="${NNODES:-1}"

* Set the rank of each node (0 for master, 1 for the first worker node, and so on):

  .. code-block:: shell

     NODE_RANK="${NODE_RANK:-0}"

* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
  NFS directory) for multi-node runs:

  .. code-block:: shell

     DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs

* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
  inside a Docker container, either install the drivers inside the Docker container or pass the network
  drivers from the host while creating the Docker container.

  .. code-block:: shell

     # Specify which RDMA interfaces to use for communication
     export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7

.. _amd-megatron-lm-run-training:

Run training

@@ -587,7 +533,7 @@ Run training

Use the following example commands to set up the environment, configure
:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
MI300X series accelerators with the AMD Megatron-LM environment.
MI300X series GPUs with the AMD Megatron-LM environment.

Single node training
--------------------
@@ -612,7 +558,7 @@ Single node training
      FSDP=1 \
      MODEL_SIZE=70 \
      TOTAL_ITERS=50 \
      bash examples/llama/train_llama3.sh

.. note::

@@ -770,7 +716,7 @@ Single node training

.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
   navigate to the Megatron-LM folder and use the following command.

   .. code-block:: shell

@@ -925,6 +871,8 @@ Single node training
      RECOMPUTE_ACTIVATIONS=full \
      CKPT_FORMAT=torch_dist

.. _amd-megatron-lm-multi-node-examples:

Multi-node training examples
----------------------------

@@ -202,16 +202,14 @@ Getting started

The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
`<https://github.com/ROCm/maxtext/>`__.

.. important::

   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.

Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.
set correctly and points to your Hugging Face cache directory.
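
For example (``HF_HOME`` is the standard Hugging Face cache variable; the path
below is a placeholder):

.. code-block:: shell

   # Point Hugging Face downloads and caches at a persistent directory
   export HF_HOME=/path/to/huggingface-cache
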
Single node training benchmarking examples
------------------------------------------

@@ -213,16 +213,14 @@ Getting started

The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
`<https://github.com/ROCm/maxtext/>`__.

.. important::

   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.

Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.
set correctly and points to your Hugging Face cache directory.

Single node training benchmarking examples
------------------------------------------

@@ -16,12 +16,22 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
     - Components
     - Resources

   * - v25.7 (latest)
   * - v25.8 (latest)
     -
       * ROCm
       * PyTorch
       * ROCm 6.4.3
       * PyTorch 2.8.0a0+gitd06a406
     -
       * :doc:`Documentation <../megatron-lm>`
       * :doc:`Primus Megatron documentation <../primus-megatron>`
       * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
       * `Docker Hub (py310) <https://hub.docker.com/r/rocm/megatron-lm/tags>`__

   * - v25.7
     -
       * ROCm 6.4.2
       * PyTorch 2.8.0a0+gitd06a406
     -
       * :doc:`Primus Megatron documentation <primus-megatron-v25.7>`
       * :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.7>`
       * `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__

   * - v25.6

@@ -1,12 +1,12 @@
|
||||
:orphan:
|
||||
|
||||
**********************************************************************
|
||||
Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
|
||||
**********************************************************************
|
||||
*****************************************************************
|
||||
Migrating workloads to Primus (Megatron backend) from Megatron-LM
|
||||
*****************************************************************
|
||||
|
||||
Primus supports Megatron-Core as backend optimization library,
|
||||
replacing ROCm Megatron-LM. This document outlines the steps to migrate
|
||||
workload from ROCm Megatron-LM to Primus with the Megatron-Core backend.
|
||||
workload from ROCm Megatron-LM to Primus with the Megatron backend.
|
||||
|
||||
Model architecture
|
||||
==================
|
||||
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,604 @@
:orphan:

.. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch

********************************************
Training a model with Primus and Megatron-LM
********************************************

.. caution::

   This documentation does not reflect the latest version of the ROCm Megatron-LM
   training documentation. See :doc:`../primus-megatron` for the latest version.

`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
LLM training framework that streamlines LLM
training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
Primus is backend-agnostic and supports multiple training engines -- including Megatron.

.. note::

   Primus with the Megatron backend is intended to replace ROCm
   Megatron-LM in this Dockerized training environment. To learn how to migrate
   workloads from Megatron-LM to Primus with Megatron, see
   :doc:`megatron-lm-primus-migration-guide`.

For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
containing essential components for Primus and Megatron-LM.

.. note::

   This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
   Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <megatron-lm-v25.6>`.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}
   .. list-table::
      :header-rows: 1

      * - Software component
        - Version

      {% for component_name, component_version in docker.components.items() %}
      * - {{ component_name }}
        - {{ component_version }}
      {% endfor %}

.. _amd-primus-megatron-lm-model-support-v257:

Supported models
================

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
Some instructions, commands, and training examples in this documentation might
vary by model -- select one to get started.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml

   {% set model_groups = data.model_groups %}
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row gx-0">
          <div class="col-2 me-1 px-2 model-param-head">Model</div>
          <div class="row col-10 pe-0">
            {% for model_group in model_groups %}
            <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            {% endfor %}
          </div>
        </div>

        <div class="row gx-0 pt-1">
          <div class="col-2 me-1 px-2 model-param-head">Variant</div>
          <div class="row col-10 pe-0">
            {% for model_group in model_groups %}
            {% set models = model_group.models %}
            {% for model in models %}
            {% if models|length % 3 == 0 %}
            <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
            <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
            {% endfor %}
            {% endfor %}
          </div>
        </div>
      </div>

.. note::

   Some models, such as Llama, require an external license agreement through
   a third party (for example, Meta).

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

.. _mi300x-amd-primus-megatron-lm-training-v257:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}

   Environment setup
   =================

   Use the following instructions to set up the environment, configure the script to train models, and
   reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.

   .. _amd-primus-megatron-lm-requirements-v257:

   Download the Docker image
   -------------------------

   1. Use the following command to pull the Docker image from Docker Hub.

      .. code-block:: shell

         docker pull {{ docker.pull_tag }}

   2. Launch the Docker container.

      .. code-block:: shell

         docker run -it \
           --device /dev/dri \
           --device /dev/kfd \
           --device /dev/infiniband \
           --network host --ipc host \
           --group-add video \
           --cap-add SYS_PTRACE \
           --security-opt seccomp=unconfined \
           --privileged \
           -v $HOME:$HOME \
           --shm-size 128G \
           --name primus_training_env \
           {{ docker.pull_tag }}

   3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.

      .. code-block:: shell

         docker start primus_training_env
         docker exec -it primus_training_env bash

   The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus
   <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.

.. _amd-primus-megatron-lm-environment-setup-v257:

Configuration
=============

Primus defines a training configuration in YAML for each model in
`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml

   {% set model_groups = data.model_groups %}
   {% for model_group in model_groups %}
   {% for model in model_group.models %}
   .. container:: model-doc {{ model.mad_tag }}

      To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
      Note that training configuration YAML files for other models follow this naming convention.

   {% endfor %}
   {% endfor %}

.. note::

   See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.

Dataset options
---------------

You can use either mock data or real data for training.

* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
  value is ``true`` for enabled.

  .. code-block:: yaml

     mock_data: true

* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.

  .. code-block:: yaml

     mock_data: false
     train_data_path: /path/to/your/dataset

  Ensure that the files are accessible inside the Docker container.
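
  For example, if the dataset lives under your home directory, it is already
  visible through the ``-v $HOME:$HOME`` mount used above. A quick check from
  inside the container might look like this (the path is a placeholder):

  .. code-block:: shell

     # Verify that the path referenced by train_data_path resolves
     # inside the container (placeholder path).
     ls -lh /path/to/your/dataset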

.. _amd-primus-megatron-lm-tokenizer-v257:

Tokenizer
---------

In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
definition. As such, you need to set the ``HF_TOKEN`` environment variable with
the right permissions to access the tokenizer for each model.

.. code-block:: bash

   # Export your HF_TOKEN in the workspace
   export HF_TOKEN=<your_hftoken>
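
To confirm the token actually grants access before launching a run, one option
is a quick tokenizer download check. This is a sketch, assuming the
``transformers`` package is available in the container and that you have
accepted the model's license on Hugging Face:

.. code-block:: shell

   # Optional sanity check: fails fast if HF_TOKEN cannot access the
   # gated repository (the Hugging Face libraries read HF_TOKEN from
   # the environment).
   python -c "from transformers import AutoTokenizer; \
              AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B')"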

.. _amd-primus-megatron-lm-run-training-v257:

Run training
============

Use the following example commands to set up the environment, configure
:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
MI300X series accelerators with the AMD Megatron-LM environment.

Single node training
--------------------

To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command:

.. code-block:: shell

   pip install -r requirements.txt
   export HSA_NO_SCRATCH_RECLAIM=1
   export NVTE_CK_USES_BWD_V3=1

Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   To run pre-training for Llama 3.3 70B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 16 \
        --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   To run pre-training for Llama 3.1 8B FP8, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --train_iters 50 \
        --fp8 hybrid

   For Llama 3.1 8B BF16, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   To run pre-training for Llama 3.1 70B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --train_iters 50

   To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --train_iters 50 \
        --num_layers 40 \
        --fp8 hybrid \
        --no_fp8_weight_transpose_cache true

   .. note::

      Use two or more nodes to run the *full* Llama 70B model with FP8 precision.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   To run pre-training for Llama 2 7B FP8, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --train_iters 50 \
        --fp8 hybrid

   To run pre-training for Llama 2 7B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
      bash ./examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   To run pre-training for Llama 2 70B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --num_layers 3 \
        --moe_layer_freq 1 \
        --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b

   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --global_batch_size 256 \
        --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy

   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --num_layers 4 \
        --pipeline_model_parallel_size 1 \
        --micro_batch_size 1 \
        --global_batch_size 16 \
        --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

   To run training on a single node for Qwen 2.5 7B BF16, use the following
   command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

   For FP8, use the following command.

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --train_iters 50 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

Multi-node training examples
----------------------------

To run training on multiple nodes, you can use the
`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
script to launch the multi-node workload. Use the following steps to set up your environment:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}

   .. code-block:: shell

      cd /workspace/Primus/
      export DOCKER_IMAGE={{ docker.pull_tag }}
      export HF_TOKEN=<your_HF_token>
      export HSA_NO_SCRATCH_RECLAIM=1
      export NVTE_CK_USES_BWD_V3=1
      export NCCL_IB_HCA=<your_NCCL_IB_HCA>                # Specify which RDMA interfaces to use for communication
      export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME>  # Your network interface
      export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME>  # Your network interface
      export NCCL_IB_GID_INDEX=3                           # InfiniBand GID index for NCCL communication. Default is 3 for RoCE

.. note::

   * Make sure the correct network drivers are installed on the nodes. If you are running inside Docker, either install the drivers inside the Docker container or pass the network drivers through from the host when creating the container.
   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across clusters, it is encouraged to explicitly export your NCCL parameters for the cluster.
   * To find your network interface, you can use ``ip a``.
   * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices, as shown in the sketch below.
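
For illustration, discovering and exporting these values might look like the
following sketch. The device and interface names (``mlx5_0``, ``bond0``) are
placeholders; use the names reported on your own nodes.

.. code-block:: shell

   # List network interfaces and their state (names are examples).
   ip -br addr show

   # List RDMA devices visible to this node.
   ibv_devices

   # Export the values that match your cluster's fabric.
   export NCCL_IB_HCA=mlx5_0
   export NCCL_SOCKET_IFNAME=bond0
   export GLOO_SOCKET_IFNAME=bond0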

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   To train Llama 3.3 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.3 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   To train Llama 3.1 8B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --global_batch_size 1024 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   To train Llama 3.1 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.1 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   To train Llama 2 7B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   To train Llama 2 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 10 \
        --global_batch_size 640 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 2 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 1536 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To train Mixtral 8x7B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 256

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To train Qwen 2.5 72B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 8 \
        --global_batch_size 512 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

.. _amd-primus-megatron-lm-benchmark-test-vars-v257:

Key options
-----------

The following are key options to take note of:

fp8
   ``hybrid`` enables FP8 GEMMs.

use_torch_fsdp2
   ``use_torch_fsdp2: 1`` enables torch FSDP v2. If FSDP is enabled,
   set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``,
   as in the sketch below.
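
   For illustration only, the corresponding configuration overrides might look
   like this (key names as documented above; where you set them depends on
   whether you edit the model YAML or pass script flags):

   .. code-block:: yaml

      # Illustrative FSDP v2 settings per the note above.
      use_torch_fsdp2: 1
      use_distributed_optimizer: false
      overlap_param_gather: false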

profile
   To enable PyTorch profiling, set these parameters:

   .. code-block:: yaml

      profile: true
      use_pytorch_profiler: true
      profile_step_end: 7
      profile_step_start: 6

train_iters
   The total number of iterations (default: 50).

mock_data
   True by default.

micro_batch_size
   Micro batch size.

global_batch_size
   Global batch size.

recompute_granularity
   For activation checkpointing.

num_layers
   For using a reduced number of layers, as with proxy models.

Previous versions
=================

See :doc:`megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.
@@ -2,24 +2,25 @@
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch

**********************************************
Training a model with Primus and Megatron-Core
**********************************************
********************************************
Training a model with Primus and Megatron-LM
********************************************

`Primus <https://github.com/AMD-AIG-AIMA/Primus>`__ is a unified and flexible
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
LLM training framework that streamlines LLM
training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core.
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
Primus is backend-agnostic and supports multiple training engines -- including Megatron.

.. note::

   Primus with the Megatron-Core backend is intended to replace ROCm
   Megatron-LM in this Dockerized training environment. To learn how to migrate
   workloads from Megatron-LM to Primus with Megatron-Core, see
   :doc:`previous-versions/megatron-lm-primus-migration-guide`.
   Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM training <megatron-lm>` workflow.
   To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
   see :doc:`previous-versions/megatron-lm-primus-migration-guide`.

For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
containing essential components for Primus and Megatron-Core.
For ease of use, AMD provides a ready-to-use Docker image for MI300 series GPUs
containing essential components for Primus and Megatron-LM. This Docker image is powered by Primus
Turbo optimizations for performance; this release adds support for Primus Turbo
with optimized attention and grouped GEMM kernels.

.. note::

@@ -46,7 +47,7 @@ containing essential components for Primus and Megatron-Core.
Supported models
================

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
The following models are pre-optimized for performance on AMD Instinct MI300X series GPUs.
Some instructions, commands, and training examples in this documentation might
vary by model -- select one to get started.

@@ -113,7 +114,7 @@ system's configuration.
=================

Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.
reproduce the benchmark results on MI300X series GPUs with the ``{{ docker.pull_tag }}`` image.

.. _amd-primus-megatron-lm-requirements:

@@ -151,8 +152,8 @@ system's configuration.
      docker start primus_training_env
      docker exec -it primus_training_env bash

The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.
The Docker container hosts verified commit ``927a717`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/927a71702784347a311ca48fd45f0f308c6ef6dd>`__ repository.

.. _amd-primus-megatron-lm-environment-setup:

@@ -160,7 +161,7 @@ Configuration
=============

Primus defines a training configuration in YAML for each model in
`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/927a71702784347a311ca48fd45f0f308c6ef6dd/examples/megatron/configs>`__.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

@@ -205,11 +206,7 @@ You can use either mock data or real data for training.
Tokenizer
---------

In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
definition. As such, you need to set the ``HF_TOKEN`` environment variable with
Set the ``HF_TOKEN`` environment variable with
the right permissions to access the tokenizer for each model.

.. code-block:: bash
@@ -217,6 +214,14 @@ right permissions to access the tokenizer for each model.
   # Export your HF_TOKEN in the workspace
   export HF_TOKEN=<your_hftoken>

.. note::

   In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
   3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
   ``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
   <https://github.com/AMD-AGI/Primus/blob/927a71702784347a311ca48fd45f0f308c6ef6dd/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
   definition.

.. _amd-primus-megatron-lm-run-training:

Run training
@@ -224,7 +229,7 @@ Run training

Use the following example commands to set up the environment, configure
:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
MI300X series accelerators with the AMD Megatron-LM environment.
MI300X series GPUs with the AMD Megatron-LM environment.

Single node training
--------------------
@@ -237,10 +242,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
   export HSA_NO_SCRATCH_RECLAIM=1
   export NVTE_CK_USES_BWD_V3=1

Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.3 70B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run pre-training for Llama 3.3 70B BF16, run:

   .. code-block:: shell
@@ -253,6 +260,10 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.1 8B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run pre-training for Llama 3.1 8B FP8, run:

   .. code-block:: shell
@@ -271,6 +282,10 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.1 70B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run pre-training for Llama 3.1 70B BF16, run:

   .. code-block:: shell
@@ -287,8 +302,7 @@ Once setup is complete, run the appropriate training command.
      bash ./examples/run_pretrain.sh \
        --train_iters 50 \
        --num_layers 40 \
        --fp8 hybrid \
        --no_fp8_weight_transpose_cache true
        --fp8 hybrid

   .. note::

@@ -296,6 +310,10 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 2 7B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run pre-training for Llama 2 7B FP8, run:

   .. code-block:: shell
@@ -314,16 +332,24 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 2 70B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run pre-training for Llama 2 70B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh --train_iters 50
      bash ./examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to DeepSeek-V3.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
   use the following command:

   .. code-block:: shell
@@ -336,6 +362,10 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to DeepSeek-V2-Lite.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
   use the following command:

@@ -348,6 +378,10 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Mixtral 8x7B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
   use the following command:

@@ -358,7 +392,11 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel) with 4-layer proxy,
   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Mixtral 8x22B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
   use the following command:

   .. code-block:: shell
@@ -373,6 +411,10 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Qwen 2.5 7B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run training on a single node for Qwen 2.5 7B BF16, use the following
   command:

@@ -392,6 +434,10 @@ Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Qwen 2.5 72B.
   See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.

   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.

   .. code-block:: shell
@@ -399,11 +445,16 @@ Once setup is complete, run the appropriate training command.
      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

.. _amd-primus-megatron-multi-node-examples:

Multi-node training examples
----------------------------

Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
training.

To run training on multiple nodes, you can use the
`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
`run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/927a71702784347a311ca48fd45f0f308c6ef6dd/examples/run_slurm_pretrain.sh>`__
script to launch the multi-node workload. Use the following steps to set up your environment:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
@@ -438,10 +489,9 @@ to launch the multi-node workload. Use the following steps to setup your environ

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.3 70B BF16 on 8 nodes, run:
@@ -460,7 +510,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

   .. code-block:: shell

      # Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      # Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --global_batch_size 1024 \
@@ -474,10 +524,9 @@ to launch the multi-node workload. Use the following steps to setup your environ

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.1 70B BF16 on 8 nodes, run:
@@ -496,7 +545,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

   .. code-block:: shell

      # Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      # Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
@@ -507,10 +556,9 @@ to launch the multi-node workload. Use the following steps to setup your environ

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 10 \
        --global_batch_size 640 \
        --micro_batch_size 2 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 2 70B BF16 on 8 nodes, run:
@@ -542,10 +590,9 @@ to launch the multi-node workload. Use the following steps to setup your environ

      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 8 \
        --global_batch_size 512 \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

.. _amd-primus-megatron-lm-benchmark-test-vars:
@@ -590,6 +637,18 @@ recompute_granularity
num_layers
   For using a reduced number of layers, as with proxy models.

Further reading
===============

- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
  Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

@@ -598,5 +657,4 @@ of the ``ROCm/megatron-lm`` Docker image.

This training environment now uses Primus with Megatron as the primary
configuration. Limited support for the legacy ROCm Megatron-LM is still
available. For instructions on using ROCm Megatron-LM, see the
:doc:`megatron-lm` document.
available; see the :doc:`megatron-lm` documentation.

@@ -8,12 +8,12 @@ Training a model with Primus and PyTorch

`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
LLM training framework that streamlines LLM
training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
Primus now supports the PyTorch torchtitan backend.

.. note::

   Primus with the PyTorch torchtitan backend is intended to supersede the :doc:`ROCm PyTorch training <pytorch-training>` workflow.
   Primus with the PyTorch torchtitan backend is designed to replace the :doc:`ROCm PyTorch training <pytorch-training>` workflow.
   See :doc:`pytorch-training` for steps to run workloads without Primus.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
@@ -21,7 +21,7 @@ Primus now supports the PyTorch torchtitan backend.
   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}
   For ease of use, AMD provides a ready-to-use Docker image -- ``{{
   docker.pull_tag }}`` -- for MI300X series accelerators containing essential
   docker.pull_tag }}`` -- for MI300X series GPUs containing essential
   components for Primus and PyTorch training with
   Primus Turbo optimizations.

@@ -41,7 +41,7 @@ Primus now supports the PyTorch torchtitan backend.
Supported models
================

The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
Some instructions, commands, and training recommendations in this documentation might
vary by model -- select one to get started.

@@ -293,7 +293,7 @@ Further reading
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

@@ -10,7 +10,7 @@ Training a model with PyTorch on ROCm

.. note::

   Primus with the PyTorch torchtitan backend is intended to supersede the :doc:`ROCm PyTorch training <pytorch-training>` workflow.
   Primus with the PyTorch torchtitan backend is designed to replace the :doc:`ROCm PyTorch training <pytorch-training>` workflow.
   See :doc:`primus-pytorch` for details.

PyTorch is an open-source machine learning framework that is widely used for
@@ -22,7 +22,7 @@ model training with GPU-optimized components for transformer-based models.
   {% set docker = dockers[0] %}
   The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
   (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
   model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
   model on AMD Instinct MI325X and MI300X GPUs. It includes the following software components to accelerate
   training workloads:

   .. list-table::
@@ -41,7 +41,7 @@ model training with GPU-optimized components for transformer-based models.
Supported models
================

The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
Some instructions, commands, and training recommendations in this documentation might
vary by model -- select one to get started.

@@ -126,7 +126,7 @@ popular AI models.
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.
   Instinct MI325X and MI300X GPUs or ROCm software.

System validation
=================
@@ -299,28 +299,28 @@ Run training
        - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

      * - ``torchdata``
        - `TorchData <https://pytorch.org/data/beta/index.html>`_
        - `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__

      * - ``tomli``
        - `Tomli <https://pypi.org/project/tomli/>`_
        - `Tomli <https://pypi.org/project/tomli/>`__

      * - ``tiktoken``
        - `tiktoken <https://github.com/openai/tiktoken>`_
        - `tiktoken <https://github.com/openai/tiktoken>`__

      * - ``blobfile``
        - `blobfile <https://pypi.org/project/blobfile/>`_
        - `blobfile <https://pypi.org/project/blobfile/>`__

      * - ``tabulate``
        - `tabulate <https://pypi.org/project/tabulate/>`_
        - `tabulate <https://pypi.org/project/tabulate/>`__

      * - ``wandb``
        - `Weights & Biases <https://github.com/wandb/wandb>`_
        - `Weights & Biases <https://github.com/wandb/wandb>`__

      * - ``sentencepiece``
        - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
        - `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0

      * - ``tensorboard``
        - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
        - `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0

.. container:: model-doc pyt_train_flux

@@ -336,50 +336,50 @@ Run training
        - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

      * - ``datasets``
        - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
        - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0

      * - ``sentencepiece``
        - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
        - `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0

      * - ``tensorboard``
        - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
        - `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0

      * - ``csvkit``
        - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
        - `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1

      * - ``deepspeed``
        - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
        - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2

      * - ``diffusers``
        - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
        - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0

      * - ``GitPython``
        - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
        - `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44

      * - ``opencv-python-headless``
        - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
        - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84

      * - ``peft``
        - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
        - `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0

      * - ``protobuf``
        - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
        - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2

      * - ``pytest``
        - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
        - `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4

      * - ``python-dotenv``
        - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
        - `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1

      * - ``seaborn``
        - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
        - `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2

      * - ``transformers``
        - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
        - `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0

   ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

   * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
   * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`__

   {% for model_group in model_groups %}
   {% for model in model_group.models %}
@@ -521,9 +521,14 @@ Run training

   For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

.. _amd-pytorch-training-multinode-examples:

Multi-node training
-------------------

Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.

Pre-training
~~~~~~~~~~~~

@@ -571,7 +576,7 @@ Further reading
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
  AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

@@ -16,7 +16,7 @@ ROCm supports multiple programming languages and programming interfaces such as
{doc}`HIP (Heterogeneous-Compute Interface for Portability)<hip:index>`, OpenCL,
and OpenMP, as explained in the [Programming guide](./how-to/programming_guide.rst).

If you're using AMD Radeon™ PRO or Radeon GPUs in a workstation setting with a display connected, review {doc}`Radeon-specific ROCm documentation<radeon:index>`.
If you're using AMD Radeon GPUs or Ryzen APUs in a workstation setting with a display connected, review {doc}`ROCm on Radeon and Ryzen documentation<radeon:index>`.

ROCm documentation is organized into the following categories:

@@ -29,7 +29,7 @@ ROCm documentation is organized into the following categories:

* {doc}`ROCm on Linux <rocm-install-on-linux:reference/system-requirements>`
* {doc}`HIP SDK on Windows <rocm-install-on-windows:reference/system-requirements>`
* [ROCm on Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html)
* {doc}`ROCm on Radeon and Ryzen<radeon:index>`
* {doc}`Deep learning frameworks </how-to/deep-learning-rocm>`
* {doc}`Build from source </how-to/build-rocm>`
:::

@@ -23,8 +23,8 @@ subtrees:
        title: ROCm on Linux
      - url: https://rocm.docs.amd.com/projects/install-on-windows/en/latest/
        title: HIP SDK on Windows
      - url: https://rocm.docs.amd.com/projects/radeon/en/latest/index.html
        title: ROCm on Radeon GPUs
      - url: https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/index.html
        title: ROCm on Radeon and Ryzen
      - file: how-to/deep-learning-rocm.md
        title: Deep learning frameworks
        subtrees:
@@ -60,8 +60,15 @@ subtrees:
          - entries:
            - file: how-to/rocm-for-ai/install.rst
              title: Installation
            - file: how-to/rocm-for-ai/system-health-check.rst
              title: System health benchmarks
            - file: how-to/rocm-for-ai/system-setup/index.rst
              title: System setup
              entries:
              - file: how-to/rocm-for-ai/system-setup/prerequisite-system-validation.rst
                title: System validation
              - file: how-to/rocm-for-ai/system-setup/multi-node-setup.rst
                title: Multi-node setup
              - file: how-to/rocm-for-ai/system-setup/system-health-check.rst
                title: System health benchmarks
            - file: how-to/rocm-for-ai/training/index.rst
              title: Training
              subtrees: