mirror of https://github.com/ROCm/ROCm.git

Compare commits: add-catch2...20250912-4 (30 commits)

Commits (SHA1):
7dc612421c
d72c0c3cc2
f909bf3635
7cbd4b2fc6
9d63c045a1
d55450070c
f0dd80e23e
eb3cf0ec1c
6075acfd79
2b85816b32
964a7cd0b5
d3fe7439cf
56f566c1dc
88f1493b68
3ca9cb1fcc
0840c14b6d
daa0184d2e
3b5019e03f
68f505e375
05a66f75fe
3c37ae88f0
985786e98d
f25e27acf0
db43d18c37
4f53183696
94476f34ca
4bc1bf00c6
76fd6b2290
e5345a9cca
2f40189575
@@ -178,7 +178,7 @@ jobs:
       mkdir -p $(Agent.BuildDirectory)/temp-deps
       cd $(Agent.BuildDirectory)/temp-deps
       # position-independent LAPACK is required for almalinux8 builds
-      cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
+      cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/sparse/projects/hipblaslt/deps
       make -j
       sudo make install
   - script: |
@@ -197,6 +197,8 @@ jobs:
   - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
     parameters:
       os: ${{ job.os }}
+      cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt
+      cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt/build
       extraBuildFlags: >-
         -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
         -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
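The two hunks above repoint the hipBLASLt component at the monorepo sparse checkout: sources now live under $(Agent.BuildDirectory)/sparse/<sparseCheckoutDir> rather than the default $(Agent.BuildDirectory)/s, so the dependency and CMake paths must follow. A minimal sketch of the checkout parameters this layout assumes (template path and parameter names taken from the component files below; the exact destination directory name is inferred from the hunk):

    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: self
        # assumed monorepo subdirectory; yields .../sparse/projects/hipblaslt
        sparseCheckoutDir: projects/hipblaslt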
.azuredevops/components/origami.yml (new file, 236 lines)
@@ -0,0 +1,236 @@
parameters:
- name: componentName
  type: string
  default: origami
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''
# monorepo related parameters
- name: sparseCheckoutDir
  type: string
  default: ''
- name: triggerDownstreamJobs
  type: boolean
  default: false
- name: downstreamAggregateNames
  type: string
  default: ''
- name: buildDependsOn
  type: object
  default: null
- name: unifiedBuild
  type: boolean
  default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
  type: boolean
  default: false
- name: aptPackages
  type: object
  default:
    - cmake
    - git
    - ninja-build
    - wget
    - python3
    - python3-dev
    - python3-pip
- name: pipModules
  type: object
  default:
    - nanobind>=2.0.0
- name: rocmDependencies
  type: object
  default:
    - clr
    - llvm-project
    - rocm-cmake
    - rocminfo
    - ROCR-Runtime
    - rocprofiler-register
- name: rocmTestDependencies
  type: object
  default:
    - clr
    - llvm-project
    - rocm-cmake
    - rocminfo
    - ROCR-Runtime
    - rocprofiler-register

- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
  type: object
  default:
    - hipBLASLt:
        name: hipBLASLt
        sparseCheckoutDir: projects/hipblaslt
        skipUnifiedBuild: 'false'
        buildDependsOn:
          - origami_build

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: origami_build_${{ job.os }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
    pool:
      vmImage: ${{ variables.BASE_BUILD_POOL }}
      ${{ if eq(job.os, 'almalinux8') }}:
        container:
          image: rocmexternalcicd.azurecr.io/manylinux228:latest
          endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DORIGAMI_BUILD_SHARED_LIBS=ON
          -DORIGAMI_ENABLE_PYTHON=ON
          -DORIGAMI_BUILD_TESTING=ON
          -GNinja
    - ${{ if ne(job.os, 'almalinux8') }}:
      - task: PublishPipelineArtifact@1
        displayName: 'Publish Build Directory Artifact'
        inputs:
          targetPath: '$(Agent.BuildDirectory)/s/build'
          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
          publishLocation: 'pipeline'
      - task: PublishPipelineArtifact@1
        displayName: 'Publish Python Source Artifact'
        inputs:
          targetPath: '$(Agent.BuildDirectory)/s/python'
          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
          publishLocation: 'pipeline'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

- ${{ if eq(parameters.unifiedBuild, False) }}:
  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: origami_test_${{ job.os }}_${{ job.target }}
      timeoutInMinutes: 120
      dependsOn: origami_build_${{ job.os }}
      condition: >-
        and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
        eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
      - group: common
      - template: /.azuredevops/variables-global.yml
      pool: ${{ job.target }}_test_pool
      workspace:
        clean: all
      steps:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
        parameters:
          checkoutRepo: ${{ parameters.checkoutRepo }}
          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
          packageManager: ${{ job.packageManager }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
          preTargetFilter: ${{ parameters.componentName }}
          os: ${{ job.os }}
      - task: DownloadPipelineArtifact@2
        displayName: 'Download Build Directory Artifact'
        inputs:
          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
          path: '$(Agent.BuildDirectory)/s/build'
      - task: DownloadPipelineArtifact@2
        displayName: 'Download Python Source Artifact'
        inputs:
          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
          path: '$(Agent.BuildDirectory)/s/python'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
        parameters:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmTestDependencies }}
          os: ${{ job.os }}
          gpuTarget: ${{ job.target }}
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - script: |
          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH

          echo "--- Running origami_test.py ---"
          python3 $(Agent.BuildDirectory)/s/python/origami_test.py

          echo "--- Running origami_grid_test.py ---"
          python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
        displayName: 'Run Python Binding Tests'
        condition: succeeded()
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
          environment: test
          gpuTarget: ${{ job.target }}

- ${{ if parameters.triggerDownstreamJobs }}:
  - ${{ each component in parameters.downstreamComponentMatrix }}:
    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
        parameters:
          checkoutRepo: ${{ parameters.checkoutRepo }}
          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
          buildDependsOn: ${{ component.buildDependsOn }}
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
          triggerDownstreamJobs: true
          unifiedBuild: ${{ parameters.unifiedBuild }}
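origami.yml follows the monorepo component pattern used throughout these files: one build job per OS, GPU-gated test jobs, and an optional fan-out that includes each entry of downstreamComponentMatrix (here hipBLASLt) as a further template. A minimal sketch of how a pipeline might consume it (parameter values below are illustrative, not taken from the repo):

    jobs:
    - template: /.azuredevops/components/origami.yml@pipelines_repo
      parameters:
        checkoutRepo: self
        sparseCheckoutDir: projects/origami   # assumed monorepo location
        triggerDownstreamJobs: true           # also schedules hipBLASLt afterwards

With triggerDownstreamJobs set, each hop appends its component name to downstreamAggregateNames ("...+origami", then "...+origami+hipBLASLt"), which the dependencies-rocm step appears to use to locate artifacts produced earlier in the same pipeline.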
@@ -8,6 +8,25 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
+- name: rocPyDecodeRepo
+  type: string
+  default: rocpydecode_repo
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -56,10 +75,23 @@ parameters:
     testJobs:
       - { os: ubuntu2204, packageManager: apt, target: gfx942 }
       - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+  type: object
+  default:
+    - rocPyDecode:
+        name: rocPyDecode
+        sparseCheckoutDir: ''
+        skipUnifiedBuild: 'false'
+        buildDependsOn:
+          - rocDecode_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -83,12 +115,15 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         os: ${{ job.os }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
         os: ${{ job.os }}
@@ -169,3 +204,15 @@ jobs:
         registerROCmPackages: true
         environment: test
         gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.rocPyDecodeRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
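The downstream hop above checks out a different repository: rocPyDecode is built from the alias given by rocPyDecodeRepo rather than from 'self'. That alias must exist as a repository resource in whichever pipeline ultimately includes this template. A sketch of such a declaration (the endpoint and repository name are assumptions, following the pattern of the tag-build file further below):

    resources:
      repositories:
      - repository: rocpydecode_repo
        type: github
        endpoint: ROCm
        name: ROCm/rocPyDecode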
@@ -5,6 +5,22 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -47,19 +63,19 @@ parameters:
   type: object
   default:
     buildJobs:
-      - gfx942:
-          target: gfx942
-      - gfx90a:
-          target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
     testJobs:
-      - gfx942:
-          target: gfx942
-      - gfx90a:
-          target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: rocPyDecode_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -74,16 +90,20 @@ jobs:
       parameters:
         aptPackages: ${{ parameters.aptPackages }}
         pipModules: ${{ parameters.pipModules }}
         packageManager: ${{ job.packageManager }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         gpuTarget: ${{ job.target }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - task: Bash@3
       displayName: 'Save Python Package Paths'
       inputs:
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
   type: string
   default: rocm-core
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -27,6 +46,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: rocm_core_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     pool:
       ${{ if eq(job.os, 'ubuntu2404') }}:
         vmImage: 'ubuntu-24.04'
@@ -50,8 +73,10 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
         useAmdclang: false
         extraBuildFlags: >-
@@ -65,9 +90,12 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
         componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
         componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
   type: string
   default: rocm-smi-lib
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -32,6 +51,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: rocm_smi_lib_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     pool:
       ${{ if eq(job.os, 'ubuntu2404') }}:
         vmImage: 'ubuntu-24.04'
@@ -55,8 +78,10 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
         useAmdclang: false
         extraBuildFlags: >-
@@ -65,51 +90,56 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
         componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
         componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
     #   parameters:
     #     aptPackages: ${{ parameters.aptPackages }}

-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
-    dependsOn: rocm_smi_lib_build_${{ job.os }}
-    condition: >-
-      and(succeeded(),
-      eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-      not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-      eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocm_smi_lib
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
-        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
+      dependsOn: rocm_smi_lib_build_${{ job.os }}
+      condition: >-
+        and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+        eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
+          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
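Two details of the rewritten test jobs are easy to miss: the whole block is now skipped for unified builds, and the DISABLED_*_TESTS lookup keys on the component name instead of variables['Build.DefinitionName'], so one skip list works across monorepo pipelines whose definition names no longer match component names. Hypothetical variable-group values, only to illustrate how the condition reads:

    variables:
      ENABLE_GFX942_TESTS: 'true'
      # comma-separated component names; a match against
      # '${{ parameters.componentName }}' skips that component's test job
      DISABLED_GFX942_TESTS: 'rocm-smi-lib,rocminfo'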
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
   type: string
   default: rocminfo
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -40,7 +59,11 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocminfo_build_${{ job.os }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     pool:
       vmImage: 'ubuntu-22.04'
       ${{ if eq(job.os, 'almalinux8') }}:
@@ -62,14 +85,18 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
         os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
         useAmdclang: false
         extraBuildFlags: >-
@@ -78,65 +105,71 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
         componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
         componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocminfo_test_${{ job.target }}
-    dependsOn: rocminfo_build_${{ job.os }}
-    condition: >-
-      and(succeeded(),
-      eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-      not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-      eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocminfo
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/bin/rocminfo'
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocm_agent_enumerator
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/bin/rocm_agent_enumerator'
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        registerROCmPackages: true
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocminfo_test_${{ job.target }}
+      dependsOn: rocminfo_build_${{ job.os }}
+      condition: >-
+        and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+        eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: './rocm/bin/rocminfo'
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: rocm_agent_enumerator
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: './rocm/bin/rocm_agent_enumerator'
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          registerROCmPackages: true
+          environment: test
+          gpuTarget: ${{ job.target }}
@@ -55,6 +55,7 @@ parameters:
     - pymongo
     - pyyaml
     - setuptools
+    - sqlalchemy
     - tabulate
     - textual
     - textual_plotext
.azuredevops/dependencies/catch2.yml (new file, 63 lines)
@@ -0,0 +1,63 @@
parameters:
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''
- name: catch2Version
  type: string
  default: ''
- name: aptPackages
  type: object
  default:
    - cmake
    - git
    - ninja-build

- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: catch2_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool:
      vmImage: 'ubuntu-22.04'
      ${{ if eq(job.os, 'almalinux8') }}:
        container:
          image: rocmexternalcicd.azurecr.io/manylinux228:latest
          endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - checkout: none
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - task: Bash@3
      displayName: Clone catch2 ${{ parameters.catch2Version }}
      inputs:
        targetType: inline
        script: git clone https://github.com/catchorg/Catch2.git -b ${{ parameters.catch2Version }}
        workingDirectory: $(Agent.BuildDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        cmakeBuildDir: $(Agent.BuildDirectory)/Catch2/build
        cmakeSourceDir: $(Agent.BuildDirectory)/Catch2
        useAmdclang: false
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
.azuredevops/tag-builds/catch2.yml (new file, 23 lines)
@@ -0,0 +1,23 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml

parameters:
- name: catch2Version
  type: string
  default: "v3.7.0"

resources:
  repositories:
  - repository: pipelines_repo
    type: github
    endpoint: ROCm
    name: ROCm/ROCm

trigger: none
pr: none

jobs:
- template: ${{ variables.CI_DEPENDENCIES_PATH }}/catch2.yml
  parameters:
    catch2Version: ${{ parameters.catch2Version }}
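The dependency job never checks out this repository (checkout: none); it clones Catch2 directly at whatever ref catch2Version names, and the tag-build wrapper above exposes that ref as a manually queued parameter (trigger and pr are both none). A hypothetical invocation pinning a different release:

    jobs:
    - template: ${{ variables.CI_DEPENDENCIES_PATH }}/catch2.yml
      parameters:
        catch2Version: v3.8.0   # illustrative; any tag or branch accepted by `git clone -b`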
@@ -171,16 +171,16 @@ parameters:
     developBranch: develop
     hasGpuTarget: false
   rocm-core:
-    pipelineId: 103
-    developBranch: master
+    pipelineId: 349
+    developBranch: develop
     hasGpuTarget: false
   rocm-examples:
     pipelineId: 216
     developBranch: amd-staging
     hasGpuTarget: true
   rocminfo:
-    pipelineId: 91
-    developBranch: amd-staging
+    pipelineId: 356
+    developBranch: develop
     hasGpuTarget: false
   rocMLIR:
     pipelineId: 229
@@ -251,8 +251,8 @@ parameters:
     developBranch: develop
     hasGpuTarget: true
   roctracer:
-    pipelineId: 141
-    developBranch: amd-staging
+    pipelineId: 331
+    developBranch: develop
     hasGpuTarget: true
   rocWMMA:
     pipelineId: 109
@@ -8,11 +8,13 @@ parameters:
   type: object
   default:
     boost: 250
+    catch2: 343
     fmtlib: 341
     grpc: 72
     gtest: 73
     half560: 68
     lapack: 69
     libdivide: 342
     spdlog: 340

 steps:
@@ -31,7 +33,7 @@ steps:
       inputs:
         archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
         destinationFolder: $(Agent.BuildDirectory)/vendor
-        cleanDestinationFolder: true
+        cleanDestinationFolder: false
+        overwriteExistingFiles: true
     - task: DeleteFiles@1
       displayName: Clean up ${{ dependency }}
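Two coordinated changes here: catch2 joins the prebuilt vendor dependencies (pipeline ID 343, produced by the new .azuredevops/dependencies/catch2.yml above), and ExtractFiles no longer wipes the shared destination folder between dependencies, so several vendor archives can be unpacked into one $(Agent.BuildDirectory)/vendor prefix. A sketch of the consuming loop (the surrounding ${{ each }} and the vendorDependencies name are assumptions; the task fields mirror the hunk above):

    - ${{ each dependency in parameters.vendorDependencies }}:
      - task: ExtractFiles@1
        displayName: Extract ${{ dependency }}
        inputs:
          archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
          destinationFolder: $(Agent.BuildDirectory)/vendor
          cleanDestinationFolder: false     # keep files unpacked by earlier iterations
          overwriteExistingFiles: true
      - task: DeleteFiles@1
        displayName: Clean up ${{ dependency }}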
@@ -156,6 +156,7 @@ GEMMs
 GFLOPS
 GFortran
 GFXIP
+GGUF
 Gemma
 GiB
 GIM
@@ -293,6 +294,7 @@ Multicore
 Multithreaded
 MyEnvironment
 MyST
+NANOO
 NBIO
 NBIOs
 NCCL
@@ -500,6 +502,7 @@ Unhandled
 VALU
 VBIOS
 VCN
+verl's
 VGPR
 VGPRs
 VM
@@ -742,6 +745,7 @@ logits
 lossy
 macOS
 matchers
+maxtext
 megatron
 microarchitecture
 migraphx
@@ -918,6 +922,7 @@ toolchain
 toolchains
 toolset
 toolsets
+torchtitan
 torchvision
 tqdm
 tracebacks
@@ -57,9 +57,8 @@ ROCm documentation continues to be updated to provide clearer and more comprehensive
 For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).

-* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning frameworks:
+* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning framework:

   * Taichi is an open-source, imperative, and parallel programming language designed for high-performance numerical computation. Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate compute-intensive Python code by compiling it to native GPU or CPU instructions. It is currently supported on ROCm 6.3.2. For more information, see [Taichi compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/taichi-compatibility.html).
   * Megablocks is a light-weight library for mixture-of-experts (MoE) training. The core of the system is efficient "dropless-MoE" and standard MoE layers. Megablocks is integrated with Megatron-LM, where data and pipeline parallel training of MoEs is supported. It is currently supported on ROCm 6.3.0. For more information, see [Megablocks compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/megablocks-compatibility.html).

 * The [Data types and precision support](https://rocm.docs.amd.com/en/latest/reference/precision-support.html) topic now includes new hardware and library support information.
default.xml (37 lines changed)
@@ -1,12 +1,12 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
   <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-  <default revision="refs/tags/rocm-6.4.3"
+  <default revision="refs/tags/20250912-42"
            remote="rocm-org"
            sync-c="true"
            sync-j="4" />
   <!--list of projects for ROCm-->
   <project name="ROCK-Kernel-Driver" />
   <project name="aqlprofile" />
   <project name="ROCR-Runtime" />
   <project name="amdsmi" />
   <project name="rdc" />
@@ -37,36 +37,37 @@
   <project name="rocr_debug_agent" />
   <!-- ROCm Libraries -->
   <project groups="mathlibs" name="AMDMIGraphX" />
-  <project groups="mathlibs" name="MIOpen" />
   <project groups="mathlibs" name="MIVisionX" />
   <project groups="mathlibs" name="ROCmValidationSuite" />
   <project groups="mathlibs" name="Tensile" />
   <project groups="mathlibs" name="composable_kernel" />
-  <project groups="mathlibs" name="hipBLAS-common" />
-  <project groups="mathlibs" name="hipBLAS" />
-  <project groups="mathlibs" name="hipBLASLt" />
-  <project groups="mathlibs" name="hipCUB" />
-  <project groups="mathlibs" name="hipFFT" />
-  <project groups="mathlibs" name="hipRAND" />
   <project groups="mathlibs" name="hipSOLVER" />
-  <project groups="mathlibs" name="hipSPARSE" />
-  <project groups="mathlibs" name="hipSPARSELt" />
   <project groups="mathlibs" name="hipTensor" />
   <project groups="mathlibs" name="hipfort" />
   <project groups="mathlibs" name="rccl" />
   <project groups="mathlibs" name="rocAL" />
   <project groups="mathlibs" name="rocALUTION" />
-  <project groups="mathlibs" name="rocBLAS" />
   <project groups="mathlibs" name="rocDecode" />
   <project groups="mathlibs" name="rocJPEG" />
+  <project groups="mathlibs" name="rocm-libraries">
+    <linkfile src="projects/hipcub" dest="hipCUB"/>
+    <linkfile src="projects/rocprim" dest="rocPRIM"/>
+    <linkfile src="projects/hiprand" dest="hipRAND"/>
+    <linkfile src="projects/rocrand" dest="rocRAND"/>
+    <linkfile src="projects/rocthrust" dest="rocThrust"/>
+    <linkfile src="projects/hipblas-common" dest="hipBLAS-common"/>
+    <linkfile src="projects/hipblaslt" dest="hipBLASLt"/>
+    <linkfile src="projects/rocblas" dest="rocBLAS"/>
+    <linkfile src="projects/hipsparselt" dest="hipSPARSELt"/>
+    <linkfile src="projects/rocsparse" dest="rocSPARSE"/>
+    <linkfile src="projects/hipsparse" dest="hipSPARSE"/>
+    <linkfile src="projects/hipblas" dest="hipBLAS"/>
+    <linkfile src="projects/miopen" dest="MIOpen"/>
+    <linkfile src="projects/hipfft" dest="hipFFT"/>
+    <linkfile src="projects/rocfft" dest="rocFFT"/>
+  </project>
   <project groups="mathlibs" name="rocPyDecode" />
-  <project groups="mathlibs" name="rocFFT" />
-  <project groups="mathlibs" name="rocPRIM" />
-  <project groups="mathlibs" name="rocRAND" />
   <project groups="mathlibs" name="rocSHMEM" />
   <project groups="mathlibs" name="rocSOLVER" />
-  <project groups="mathlibs" name="rocSPARSE" />
-  <project groups="mathlibs" name="rocThrust" />
   <project groups="mathlibs" name="rocWMMA" />
   <project groups="mathlibs" name="rocm-cmake" />
   <project groups="mathlibs" name="rpp" />
@@ -35,6 +35,8 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
 :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
 :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat]_,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,N/A,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,
 ,,,,,,,,,,,,,,,,,,
@@ -246,6 +246,8 @@ Expand for full historical view of:
 .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
 .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
 .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
+.. [#ray_compat] Ray is only supported on ROCm 6.4.1.
+.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
 .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst (new file, 156 lines)
@@ -0,0 +1,156 @@
:orphan:

.. meta::
   :description: llama.cpp deep learning framework compatibility
   :keywords: GPU, GGML, llama.cpp compatibility

.. version-set:: rocm_version latest

********************************************************************************
llama.cpp compatibility
********************************************************************************

`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework
for Large Language Model (LLM) inference that runs on both central processing units
(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing
a simple, dependency-free setup.

The framework supports multiple quantization options, from 1.5-bit to 8-bit integers,
to speed up inference and reduce memory usage. Originally built as a CPU-first library,
llama.cpp is easy to integrate with other programming environments and is widely
adopted across diverse platforms, including consumer devices.

ROCm support for llama.cpp is upstreamed, and you can build the official source code
with ROCm support:

- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp
  <https://github.com/ROCm/llama.cpp>`_ repository.

- Due to independent compatibility considerations, this location differs from the
  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.

- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`,
  which includes ROCm, llama.cpp, and all required dependencies.

- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
  to install and get started.

- See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__
  in the upstream llama.cpp documentation.

.. note::

   llama.cpp is supported on ROCm 6.4.0.

Supported devices
================================================================================

**Officially Supported**: AMD Instinct™ MI300X, MI210

Use cases and recommendations
================================================================================

llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:

- Plain C/C++ implementation with no external dependencies
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)

llama.cpp is also used in a range of real-world applications, including:

- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
  a simple maze game where AI-controlled agents attempt to trick the player.
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
  a proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
- Various other AI applications that use llama.cpp as their inference engine;
  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.

For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.

- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference, including interactive inference with ``llama-cli``,
  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
  AMD Instinct GPUs within the ROCm ecosystem.

.. _llama-cpp-docker-compat:

Docker image compatibility
================================================================================

.. |docker-icon| raw:: html

   <i class="fab fa-docker"></i>

AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
with ROCm backends on Docker Hub. The following Docker image tags and associated
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
Click |docker-icon| to view the image on Docker Hub.

.. important::

   The ``_full``, ``_server``, and ``_light`` tag endings correspond to different entrypoints:

   - Full: includes the main executable file as well as the tools to convert ``LLaMA`` models into ``ggml`` format and to apply 4-bit quantization.
   - Server: includes only the server executable file.
   - Light: includes only the main executable file.

.. list-table::
   :header-rows: 1
   :class: docker-image-compatibility

   * - Full Docker
     - Server Docker
     - Light Docker
     - llama.cpp
     - Ubuntu

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
     - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
     - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
     - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
     - 24.04

Key ROCm libraries for llama.cpp
================================================================================

llama.cpp functionality on ROCm is determined by its underlying library
dependencies. These ROCm components affect the capabilities, performance, and
feature set available to developers.

.. list-table::
   :header-rows: 1

   * - ROCm library
     - Version
     - Purpose
     - Usage
   * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
     - :version-ref:`hipBLAS rocm_version`
     - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
       matrix and vector operations.
     - Supports operations such as matrix multiplication, matrix-vector
       products, and tensor contractions. Utilized in both dense and batched
       linear algebra operations.
   * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
     - :version-ref:`hipBLASLt rocm_version`
     - hipBLASLt is an extension of the hipBLAS library, providing additional
       features like epilogues fused into the matrix multiplication kernel or
       use of integer tensor cores.
     - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipBLASLt
       kernels where possible.
   * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
     - :version-ref:`rocWMMA rocm_version`
     - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
       multiplication (GEMM) and accumulation operations with mixed-precision
       support.
     - Can be used to enhance flash attention performance on AMD GPUs by enabling
       the corresponding flag at compile time.
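The rocWMMA row above alludes to a compile-time switch. A hedged sketch of a CI build step wiring it up (the flag names GGML_HIP and GGML_HIP_ROCWMMA_FATTN follow upstream llama.cpp build documentation of this era and should be verified against the release/b5997 tree; the gfx942 target is illustrative):

    - script: |
        cmake -S llama.cpp -B llama.cpp/build \
          -DGGML_HIP=ON \
          -DGGML_HIP_ROCWMMA_FATTN=ON \
          -DAMDGPU_TARGETS=gfx942 \
          -DCMAKE_BUILD_TYPE=Release
        cmake --build llama.cpp/build --parallel
      displayName: 'Build llama.cpp with the ROCm/HIP backend'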
docs/compatibility/ml-compatibility/ray-compatibility.rst (new file, 111 lines)
@@ -0,0 +1,111 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Ray deep learning framework compatibility
|
||||
:keywords: GPU, Ray compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
Ray compatibility
|
||||
*******************************************************************************
|
||||
|
||||
Ray is a unified framework for scaling AI and Python applications from your laptop
|
||||
to a full cluster, without changing your code. Ray consists of `a core distributed
|
||||
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
|
||||
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for
|
||||
simplifying machine learning computations.
|
||||
|
||||
Ray is a general-purpose framework that runs many types of workloads efficiently.
|
||||
Any Python application can be scaled with Ray, without extra infrastructure.
|
||||
|
||||
ROCm support for Ray is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`_ repository.
|
||||
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
|
||||
|
||||
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for instructions to get started.
|
||||
|
||||
- See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_
|
||||
in the upstream Ray documentation.
|
||||
|
||||
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
|
||||
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
Ray is supported on ROCm 6.4.1.
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI210
|
||||
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm
|
||||
Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog provides an overview of Volcano Engine Reinforcement Learning (verl)
|
||||
for large language models (LLMs) and discusses its benefits in large-scale
|
||||
reinforcement learning from human feedback (RLHF). It uses Ray as part of a
|
||||
hybrid orchestration engine to schedule and coordinate training and inference
|
||||
tasks in parallel, enabling optimized resource utilization and potential overlap
|
||||
between these phases. This dynamic resource allocation strategy significantly
|
||||
improves overall system efficiency. The blog presents verl’s performance results,
|
||||
focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X
|
||||
GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and
|
||||
accelerate your RLHF training with ROCm-optimized performance.
|
||||
|
||||
* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
|
||||
blog post describes key use cases such as training and inference for large language models (LLMs),
|
||||
model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale
|
||||
workloads using Ray in the ROCm environment.
|
||||
|
||||
For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support
|
||||
topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__
|
||||
of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
.. _ray-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub and are validated for
|
||||
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
|
||||
icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
   :header-rows: 1
   :class: docker-image-compatibility

   * - Docker image
     - Ray
     - PyTorch
     - Ubuntu
     - Python

   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
     - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
     - 2.6.0+git684f6f2
     - 24.04
     - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
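As a quick start, the validated image can be pulled by the exact tag listed in
the table above; a minimal sketch showing only the ``docker pull`` step:

.. code-block:: shell

   # Pull the Ray image validated for ROCm 6.4.1
   docker pull rocm/ray:ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0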
@@ -108,6 +108,8 @@ article_pages = [
    {"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
@@ -124,11 +126,15 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},

@@ -78,7 +78,11 @@ vllm_benchmark:
        model_repo: Qwen/QwQ-32B
        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
        tunableop: true
      - model: Qwen3 30B A3B
        mad_tag: pyt_vllm_qwen3-30b-a3b
        model_repo: Qwen/Qwen3-30B-A3B
        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
        precision: float16
  - group: Microsoft Phi
    tag: phi
    models:

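The new ``pyt_vllm_qwen3-30b-a3b`` tag can be exercised through the same MAD
flow used later in this documentation; a minimal sketch, assuming the
`ROCm/MAD <https://github.com/ROCm/MAD>`__ repository is cloned and its
requirements are installed:

.. code-block:: shell

   # Run the vLLM benchmark for the newly added Qwen3 30B A3B entry
   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
   madengine run --tags pyt_vllm_qwen3-30b-a3b --keep-model-dir --live-output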
@@ -0,0 +1,72 @@
dockers:
  - pull_tag: rocm/jax-training:maxtext-v25.7
    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
    components:
      ROCm: 6.4.1
      JAX: 0.5.0
      Python: 3.10.12
      Transformer Engine: 2.1.0+90d703dd
      hipBLASLt: 1.x.x
  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
    components:
      ROCm: 6.4.1
      JAX: 0.6.0
      Python: 3.10.12
      Transformer Engine: 2.1.0+90d703dd
      hipBLASLt: 1.1.0-499ece1c21
model_groups:
  - group: Meta Llama
    tag: llama
    models:
      - model: Llama 3.3 70B
        mad_tag: jax_maxtext_train_llama-3.3-70b
        model_repo: Llama-3.3-70B
        precision: bf16
        doc_options: ["single-node"]
      - model: Llama 3.1 8B
        mad_tag: jax_maxtext_train_llama-3.1-8b
        model_repo: Llama-3.1-8B
        precision: bf16
        doc_options: ["single-node"]
      - model: Llama 3.1 70B
        mad_tag: jax_maxtext_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        precision: bf16
        doc_options: ["single-node"]
      - model: Llama 3 8B
        mad_tag: jax_maxtext_train_llama-3-8b
        multinode_training_script: llama3_8b_multinode.sh
        doc_options: ["multi-node"]
      - model: Llama 3 70B
        mad_tag: jax_maxtext_train_llama-3-70b
        multinode_training_script: llama3_70b_multinode.sh
        doc_options: ["multi-node"]
      - model: Llama 2 7B
        mad_tag: jax_maxtext_train_llama-2-7b
        model_repo: Llama-2-7B
        precision: bf16
        multinode_training_script: llama2_7b_multinode.sh
        doc_options: ["single-node", "multi-node"]
      - model: Llama 2 70B
        mad_tag: jax_maxtext_train_llama-2-70b
        model_repo: Llama-2-70B
        precision: bf16
        multinode_training_script: llama2_70b_multinode.sh
        doc_options: ["single-node", "multi-node"]
  - group: DeepSeek
    tag: deepseek
    models:
      - model: DeepSeek-V2-Lite (16B)
        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
        model_repo: DeepSeek-V2-lite
        precision: bf16
        doc_options: ["single-node"]
  - group: Mistral AI
    tag: mistral
    models:
      - model: Mixtral 8x7B
        mad_tag: jax_maxtext_train_mixtral-8x7b
        model_repo: Mixtral-8x7B
        precision: bf16
        doc_options: ["single-node"]
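Each ``pull_tag`` above maps directly to a Docker Hub tag; a minimal sketch
pulling the JAX 0.5.0 variant defined in this data file:

.. code-block:: shell

   # Pull the MaxText v25.7 training image (JAX 0.5.0 variant)
   docker pull rocm/jax-training:maxtext-v25.7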
@@ -0,0 +1,120 @@
unified_docker:
  latest:
    pull_tag: rocm/pytorch-training:v25.6
    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
    rocm_version: 6.4.1
    pytorch_version: 2.8.0a0+git7d205b2
    python_version: 3.10.17
    transformer_engine_version: 1.14.0+2f85f5f2
    flash_attention_version: 3.0.0.post1
    hipblaslt_version: 0.15.0-8c6919d
    triton_version: 3.3.0
model_groups:
  - group: Pre-training
    tag: pre-training
    models:
      - model: Llama 3.1 8B
        mad_tag: pyt_train_llama-3.1-8b
        model_repo: Llama-3.1-8B
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: BF16
        training_modes: [pretrain]
      - model: Llama 3.1 70B
        mad_tag: pyt_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
        precision: BF16
        training_modes: [pretrain]
      - model: FLUX.1-dev
        mad_tag: pyt_train_flux
        model_repo: Flux
        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
        precision: BF16
        training_modes: [pretrain]
  - group: Fine-tuning
    tag: fine-tuning
    models:
      - model: Llama 4 Scout 17B-16E
        mad_tag: pyt_train_llama-4-scout-17b-16e
        model_repo: Llama-4-17B_16E
        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.3 70B
        mad_tag: pyt_train_llama-3.3-70b
        model_repo: Llama-3.3-70B
        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 3.2 1B
        mad_tag: pyt_train_llama-3.2-1b
        model_repo: Llama-3.2-1B
        url: https://huggingface.co/meta-llama/Llama-3.2-1B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.2 3B
        mad_tag: pyt_train_llama-3.2-3b
        model_repo: Llama-3.2-3B
        url: https://huggingface.co/meta-llama/Llama-3.2-3B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.2 Vision 11B
        mad_tag: pyt_train_llama-3.2-vision-11b
        model_repo: Llama-3.2-Vision-11B
        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
        precision: BF16
        training_modes: [finetune_fw]
      - model: Llama 3.2 Vision 90B
        mad_tag: pyt_train_llama-3.2-vision-90b
        model_repo: Llama-3.2-Vision-90B
        url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
        precision: BF16
        training_modes: [finetune_fw]
      - model: Llama 3.1 8B
        mad_tag: pyt_train_llama-3.1-8b
        model_repo: Llama-3.1-8B
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.1 70B
        mad_tag: pyt_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 3.1 405B
        mad_tag: pyt_train_llama-3.1-405b
        model_repo: Llama-3.1-405B
        url: https://huggingface.co/meta-llama/Llama-3.1-405B
        precision: BF16
        training_modes: [finetune_qlora, HF_finetune_lora]
      - model: Llama 3 8B
        mad_tag: pyt_train_llama-3-8b
        model_repo: Llama-3-8B
        url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3 70B
        mad_tag: pyt_train_llama-3-70b
        model_repo: Llama-3-70B
        url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 2 7B
        mad_tag: pyt_train_llama-2-7b
        model_repo: Llama-2-7B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 2 13B
        mad_tag: pyt_train_llama-2-13b
        model_repo: Llama-2-13B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 2 70B
        mad_tag: pyt_train_llama-2-70b
        model_repo: Llama-2-70B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
@@ -1,38 +1,17 @@
unified_docker:
  latest:
    pull_tag: rocm/pytorch-training:v25.6
    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
    rocm_version: 6.4.1
    pytorch_version: 2.8.0a0+git7d205b2
    python_version: 3.10.17
    transformer_engine_version: 1.14.0+2f85f5f2
    flash_attention_version: 3.0.0.post1
    hipblaslt_version: 0.15.0-8c6919d
    triton_version: 3.3.0
dockers:
  - pull_tag: rocm/pytorch-training:v25.7
    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
    components:
      ROCm: 6.4.2
      PyTorch: 2.8.0a0+gitd06a406
      Python: 3.10.18
      Transformer Engine: 2.2.0.dev0+94e53dd8
      Flash Attention: 3.0.0.post1
      hipBLASLt: 1.1.0-4b9a52edfc
      Triton: 3.3.0
model_groups:
  - group: Pre-training
    tag: pre-training
    models:
      - model: Llama 3.1 8B
        mad_tag: pyt_train_llama-3.1-8b
        model_repo: Llama-3.1-8B
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: BF16
        training_modes: [pretrain]
      - model: Llama 3.1 70B
        mad_tag: pyt_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
        precision: BF16
        training_modes: [pretrain]
      - model: FLUX.1-dev
        mad_tag: pyt_train_flux
        model_repo: Flux
        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
        precision: BF16
        training_modes: [pretrain]
  - group: Fine-tuning
    tag: fine-tuning
  - group: Meta Llama
    tag: llama
    models:
      - model: Llama 4 Scout 17B-16E
        mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
        model_repo: Llama-3.1-8B
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
        training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
      - model: Llama 3.1 70B
        mad_tag: pyt_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
        training_modes: [pretrain, finetune_fw, finetune_lora]
      - model: Llama 3.1 405B
        mad_tag: pyt_train_llama-3.1-405b
        model_repo: Llama-3.1-405B
        url: https://huggingface.co/meta-llama/Llama-3.1-405B
        precision: BF16
        training_modes: [finetune_qlora, HF_finetune_lora]
        training_modes: [finetune_qlora]
      - model: Llama 3 8B
        mad_tag: pyt_train_llama-3-8b
        model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
        model_repo: Llama-2-70B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
        training_modes: [finetune_lora, finetune_qlora]
  - group: OpenAI
    tag: openai
    models:
      - model: GPT OSS 20B
        mad_tag: pyt_train_gpt_oss_20b
        model_repo: GPT-OSS-20B
        url: https://huggingface.co/openai/gpt-oss-20b
        precision: BF16
        training_modes: [HF_finetune_lora]
      - model: GPT OSS 120B
        mad_tag: pyt_train_gpt_oss_120b
        model_repo: GPT-OSS-120B
        url: https://huggingface.co/openai/gpt-oss-120b
        precision: BF16
        training_modes: [HF_finetune_lora]
  - group: Qwen
    tag: qwen
    models:
      - model: Qwen 3 8B
        mad_tag: pyt_train_qwen3-8b
        model_repo: Qwen3-8B
        url: https://huggingface.co/Qwen/Qwen3-8B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Qwen 3 32B
        mad_tag: pyt_train_qwen3-32b
        model_repo: Qwen3-32
        url: https://huggingface.co/Qwen/Qwen3-32B
        precision: BF16
        training_modes: [finetune_lora]
      - model: Qwen 2.5 32B
        mad_tag: pyt_train_qwen2.5-32b
        model_repo: Qwen2.5-32B
        url: https://huggingface.co/Qwen/Qwen2.5-32B
        precision: BF16
        training_modes: [finetune_lora]
      - model: Qwen 2.5 72B
        mad_tag: pyt_train_qwen2.5-72b
        model_repo: Qwen2.5-72B
        url: https://huggingface.co/Qwen/Qwen2.5-72B
        precision: BF16
        training_modes: [finetune_lora]
      - model: Qwen 2 1.5B
        mad_tag: pyt_train_qwen2-1.5b
        model_repo: Qwen2-1.5B
        url: https://huggingface.co/Qwen/Qwen2-1.5B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Qwen 2 7B
        mad_tag: pyt_train_qwen2-7b
        model_repo: Qwen2-7B
        url: https://huggingface.co/Qwen/Qwen2-7B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
  - group: Flux
    tag: flux
    models:
      - model: FLUX.1-dev
        mad_tag: pyt_train_flux
        model_repo: Flux
        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
        precision: BF16
        training_modes: [pretrain]

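The refreshed entry can likewise be pulled by its ``pull_tag``; a minimal
sketch using the v25.7 tag defined above:

.. code-block:: shell

   # Pull the ROCm 6.4.2 based PyTorch training image
   docker pull rocm/pytorch-training:v25.7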
@@ -23,93 +23,114 @@ The table below summarizes information about ROCm-enabled deep learning framewor
     - Installation options
     - GitHub

   * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`_
   * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`_
       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`_
       - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`_
       - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`_
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
       - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
       - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
     - .. raw:: html

          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>

   * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`_

   * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`_
       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`_
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__

     - .. raw:: html

          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>

   * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`_
   * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`_
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
     - .. raw:: html

          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>

   * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`_

   * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`_
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
     - .. raw:: html

          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>

   * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`_
   * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`_
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
     - .. raw:: html

          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>

   * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`_

   * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`_
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
     - .. raw:: html

          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>

   * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`_
   * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`_
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
     - .. raw:: html

          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>

   * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`_

   * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`_
       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`_
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__

     - .. raw:: html

          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>

          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>

   * - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
       - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
     - .. raw:: html

          <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>

   * - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
     - .. raw:: html

          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
     -
       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
     - .. raw:: html

          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>

Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
through the following guides.
@@ -124,10 +145,3 @@ through the following guides.

* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`

@@ -939,7 +939,7 @@ hipBLASLt benchmarking

The GEMM library
`hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
provides a benchmark tool for its supported operations. Refer to the
`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
for details.

* Example 1: Benchmark mix fp8 GEMM

@@ -46,7 +46,7 @@ vLLM inference performance testing
     - {{ unified_docker.hipblaslt_version }}

With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
inference performance numbers <vllm-benchmark-performance-measurements-715>` for
MI300X series accelerators.

What's new
@@ -219,7 +219,7 @@ system's configuration.
   ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
   model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

   Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
   Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
   to collect latency and throughput performance data, you can also change the benchmarking
   parameters. See the standalone benchmarking tab for more information.

@@ -39,7 +39,7 @@ vLLM inference performance testing
     - {{ unified_docker.hipblaslt_version }}

With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
MI300X series accelerators.

What's new
@@ -208,7 +208,7 @@ system's configuration.
   and ``{{ model.mad_tag }}_serving.csv``.

   Although the :ref:`available models
   <vllm-benchmark-available-models>` are preconfigured to collect
   <vllm-benchmark-available-models-812>` are preconfigured to collect
   offline throughput and online serving performance data, you can
   also change the benchmarking parameters. See the standalone
   benchmarking tab for more information.

@@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
<rocm-install-on-linux:install/quick-start>`.

If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.

You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
distribution's package manager. See the following documentation resources to get started:

* :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`

@@ -2,9 +2,9 @@
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker

**************************************
Training a model with MaxText for ROCm
**************************************
******************************************
Training a model with JAX MaxText for ROCm
******************************************

MaxText is a high-performance, open-source framework built on the Google JAX
machine learning library to train LLMs at scale. The MaxText framework for
@@ -12,70 +12,108 @@ ROCm is an optimized fork of the upstream
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
on AMD MI300X series accelerators.

The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
The MaxText for ROCm training Docker image
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
It includes the following software components:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
| JAX                      | 0.4.35                         |
+--------------------------+--------------------------------+
| Python                   | 3.10.12                        |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
+--------------------------+--------------------------------+
| hipBLASLt                | 0.13.0-ae9c477a                |
+--------------------------+--------------------------------+
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

Supported features and models
=============================
   {% set dockers = data.dockers %}
   .. tab-set::

MaxText provides the following key features to train large language models efficiently:
   {% for docker in dockers %}
   {% set jax_version = docker.components["JAX"] %}

      .. tab-item:: JAX {{ jax_version }}
         :sync: {{ docker.pull_tag }}

         .. list-table::
            :header-rows: 1

            * - Software component
              - Version

         {% for component_name, component_version in docker.components.items() %}
            * - {{ component_name }}
              - {{ component_version }}

         {% endfor %}
      {% if jax_version == "0.6.0" %}
         .. note::

            Shardy is a new config in JAX 0.6.0. You might get related errors if it's
            not configured correctly. For now you can turn it off by setting
            ``shardy=False`` during the training run. You can also follow the `migration
            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
            it.

            The provided multi-node training scripts in this documentation are
            not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
            Docker image.
      {% endif %}

   {% endfor %}

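The ``shardy=False`` override mentioned in the note is passed like any other
MaxText config override on the training command line; a minimal sketch,
assuming the standard MaxText entry point (the run name and config path here
are illustrative placeholders):

.. code-block:: shell

   # Disable Shardy partitioning when training with the JAX 0.6.0 image
   python3 MaxText/train.py MaxText/configs/base.yml run_name=my_run shardy=False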
MaxText on ROCm provides the following key features to train large language models efficiently:

- Transformer Engine (TE)

- Flash Attention (FA) 3
- Flash Attention (FA) 3 -- with or without sequence input packing

- GEMM tuning

- Multi-node support

.. _amd-maxtext-model-support:
- NANOO FP8 quantization support

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
.. _amd-maxtext-model-support-v257:

* Llama 3.3 70B
Supported models
================

* Llama 3.1 8B
The following models are pre-optimized for performance on AMD Instinct MI300
series accelerators. Some instructions, commands, and available training
configurations in this documentation might vary by model -- select one to get
started.

* Llama 3.1 70B
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

* Llama 3 8B
   {% set model_groups = data.model_groups %}
   .. raw:: html

* Llama 3 70B
      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            <div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            {% endfor %}
          </div>
        </div>

* Llama 2 7B

* Llama 2 70B

* DeepSeek-V2-Lite
        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model variant</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            {% set models = model_group.models %}
            {% for model in models %}
            {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
            {% endfor %}
            {% endfor %}
          </div>
        </div>
      </div>

.. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

Unsupported features
--------------------

Currently, MaxText's default packed input format is not supported. Using this format
with the current Docker image results in incorrect attention calculations
across different input sequences. Support for packed input format is planned for a future release.

System validation
=================

@@ -98,14 +136,14 @@ This Docker image is optimized
as follows. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

.. _amd-maxtext-multi-node-setup:
.. _amd-maxtext-multi-node-setup-v257:

Multi-node setup
----------------

For multi-node environments, ensure you have all the necessary packages for
your network device, such as RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.

1. Install the following packages to build and install the RDMA driver.

@@ -180,196 +218,203 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
      # If using Mellanox NIC
      export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9

.. _amd-maxtext-download-docker:
.. _amd-maxtext-get-started-v257:

Pull the Docker image
---------------------
Benchmarking
============

1. Use the following command to pull the Docker image from Docker Hub.
Once the setup is complete, choose between two options to reproduce the
benchmark results:

   .. code-block:: shell
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

      docker pull rocm/jax-training:maxtext-v25.5
   .. _vllm-benchmark-mad:

2. Use the following command to launch the Docker container. Note that the benchmarking scripts
   used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
   and execute the benchmark.
   {% set dockers = data.dockers %}
   {% set model_groups = data.model_groups %}
   {% for model_group in model_groups %}
   {% for model in model_group.models %}

   .. code-block:: shell
   .. container:: model-doc {{model.mad_tag}}

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
      .. tab-set::

.. _amd-maxtext-get-started:
         {% if model.mad_tag and "single-node" in model.doc_options %}
         .. tab-item:: MAD-integrated benchmarking

Getting started
            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.

               .. code-block:: shell

                  git clone https://github.com/ROCm/MAD
                  cd MAD
                  pip install -r requirements.txt

            2. Use this command to run the performance benchmark test on the {{ model.model }} model
               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

               .. code-block:: shell

                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
                  madengine run \
                     --tags {{model.mad_tag}} \
                     --keep-model-dir \
                     --live-output \
                     --timeout 28800

            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
            model are collected in the following path: ``~/MAD/perf.csv/``.
         {% endif %}

         .. tab-item:: Standalone benchmarking

            .. rubric:: Download the Docker image and required scripts

            Run the JAX MaxText benchmark tool independently by starting the
            Docker container as shown in the following snippet.

            .. tab-set::
               {% for docker in dockers %}
               {% set jax_version = docker.components["JAX"] %}

               .. tab-item:: JAX {{ jax_version }}
                  :sync: {{ docker.pull_tag }}

                  .. code-block:: shell

                     docker pull {{ docker.pull_tag }}
               {% endfor %}

            {% if model.model_repo and "single-node" in model.doc_options %}
            .. rubric:: Single node training

            1. Set up environment variables.

               .. code-block:: shell

                  export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
                  export HF_HOME=<Location of saved/cached Hugging Face models>

               ``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
               See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.

               ``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
               If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
               Downloaded files typically get cached to ``~/.cache/huggingface``.

            2. Launch the Docker container.

               .. tab-set::
                  {% for docker in dockers %}
                  {% set jax_version = docker.components["JAX"] %}

                  .. tab-item:: JAX {{ jax_version }}
                     :sync: {{ docker.pull_tag }}

                     .. code-block:: shell

                        docker run -it \
                           --device=/dev/dri \
                           --device=/dev/kfd \
                           --network host \
                           --ipc host \
                           --group-add video \
                           --cap-add=SYS_PTRACE \
                           --security-opt seccomp=unconfined \
                           --privileged \
                           -v $HOME:$HOME \
                           -v $HOME/.ssh:/root/.ssh \
                           -v $HF_HOME:/hf_cache \
                           -e HF_HOME=/hf_cache \
                           -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
                           --shm-size 64G \
                           --name training_env \
                           {{ docker.pull_tag }}
                  {% endfor %}

            3. In the Docker container, clone the ROCm MAD repository and navigate to the
               benchmark scripts directory at ``MAD/scripts/jax-maxtext``.

               .. code-block:: shell

                  git clone https://github.com/ROCm/MAD
                  cd MAD/scripts/jax-maxtext

            4. Run the setup scripts to install libraries and datasets needed
               for benchmarking.

               .. code-block:: shell

                  ./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}

            5. To run the training benchmark without quantization, use the following command:

               .. code-block:: shell

                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}

               For quantized training, use the following command:

               .. code-block:: shell

                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8

               .. important::

                  Quantized training is not supported with the JAX 0.6.0 Docker image; support
                  will be added in a future release. For quantized training, use the JAX 0.5.0
                  Docker image: ``rocm/jax-training:maxtext-v25.7``.

            {% endif %}
            {% if model.multinode_training_script and "multi-node" in model.doc_options %}
            .. rubric:: Multi-node training

            The following examples use SLURM to run on multiple nodes.

            .. note::

               The following scripts will launch the Docker container and run the
               benchmark. Run them outside of any Docker container.

            1. Make sure ``$HF_HOME`` is set before running the test. See
               `ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
               for more details on downloading the Llama models before running the
               benchmark.

            2. To run multi-node training for {{ model.model }},
               use the
               `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
               under the ``scripts/jax-maxtext/gpu-rocm/`` directory.

            3. Run the multi-node training benchmark script.

               .. code-block:: shell

                  sbatch -N <num_nodes> {{ model.multinode_training_script }}

            {% else %}
            .. rubric:: Multi-node training

            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
            {% endif %}
   {% endfor %}
   {% endfor %}

Further reading
===============

The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
- See the ROCm/maxtext benchmarking README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__.

.. important::
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.

Single node training benchmarking examples
------------------------------------------

* Example 1: Single node training with Llama 2 7B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh

* Example 2: Single node training with Llama 2 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh

* Example 3: Single node training with Llama 3 8B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh

* Example 4: Single node training with Llama 3 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh

* Example 5: Single node training with Llama 3.3 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh

* Example 6: Single node training with DeepSeek V2 16B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh

  .. note::

     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
     the tokens/s as a performance indicator.

Multi-node training benchmarking examples
-----------------------------------------

The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
own cluster setup.

* Example 1: Multi-node training with Llama 2 7B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama2_7b_multinode.sh

* Example 2: Multi-node training with Llama 2 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama2_70b_multinode.sh

* Example 3: Multi-node training with Llama 3 8B model

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama3_8b_multinode.sh

* Example 4: Multi-node training with Llama 3 70B model

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama3_70b_multinode.sh
- For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

@@ -17,12 +17,21 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
     - Components
     - Resources

   * - 25.5 (latest)
   * - 25.7 (latest)
     -
       * ROCm 6.4.1
       * JAX 0.6.0, 0.5.0
     -
       * :doc:`Documentation <../jax-maxtext>`
       * `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
       * `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__

   * - 25.5
     -
       * ROCm 6.3.4
       * JAX 0.4.35
     -
       * :doc:`Documentation <../jax-maxtext>`
       * :doc:`Documentation <jax-maxtext-v25.5>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__

   * - 25.4

@@ -51,7 +51,7 @@ MaxText provides the following key features to train large language models effic

   - Multi-node support

.. _amd-maxtext-model-support:
.. _amd-maxtext-model-support-v254:

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

@@ -0,0 +1,385 @@
:orphan:

.. meta::
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker

**************************************
Training a model with MaxText for ROCm
**************************************

.. caution::

   This documentation does not reflect the latest version of ROCm JAX MaxText
   training performance documentation. See :doc:`../jax-maxtext` for the latest version.

MaxText is a high-performance, open-source framework built on the Google JAX
machine learning library to train LLMs at scale. The MaxText framework for
ROCm is an optimized fork of the upstream
`<https://github.com/AI-Hypercomputer/maxtext>`__ repository, enabling efficient AI workloads
on AMD MI300X series accelerators.
The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
It includes the following software components:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
| JAX                      | 0.4.35                         |
+--------------------------+--------------------------------+
| Python                   | 3.10.12                        |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
+--------------------------+--------------------------------+
| hipBLASLt                | 0.13.0-ae9c477a                |
+--------------------------+--------------------------------+
Supported features and models
=============================

MaxText provides the following key features to train large language models efficiently:

- Transformer Engine (TE)

- Flash Attention (FA) 3

- GEMM tuning

- Multi-node support

.. _amd-maxtext-model-support-v255:

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

* Llama 3.3 70B

* Llama 3.1 8B

* Llama 3.1 70B

* Llama 3 8B

* Llama 3 70B

* Llama 2 7B

* Llama 2 70B

* DeepSeek-V2-Lite

.. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

Unsupported features
--------------------

Currently, MaxText's default packed input format is not supported. Using this format
with the current Docker image results in incorrect attention calculations
across different input sequences. Support for the packed input format is planned for a future release.
System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Environment setup
=================

This Docker image is optimized for specific model configurations outlined
as follows. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

.. _amd-maxtext-multi-node-setup-v255:

Multi-node setup
----------------

For multi-node environments, ensure you have all the necessary packages for
your network device, such as RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
1. Install the following packages to build and install the RDMA driver.

   .. code-block:: shell

      sudo apt install iproute2 -y
      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev

   Refer to your NIC manufacturer's documentation for further steps on
   compiling and installing the RoCE driver. For example, for Broadcom,
   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
2. Set the following environment variables. A combined example sketch follows
   this list.

   a. Master address

      Change ``localhost`` to the master node's resolvable hostname or IP address:

      .. code-block:: bash

         export MASTER_ADDR="${MASTER_ADDR:-localhost}"

   b. Number of nodes

      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):

      .. code-block:: bash

         export NNODES="${NNODES:-1}"

   c. Node ranks

      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
      Node ranks should be unique across all nodes in the cluster.

      .. code-block:: bash

         export NODE_RANK="${NODE_RANK:-0}"

   d. Network interface

      Update the network interface in the script to match your system's network interface. To
      find your network interface, run the following (outside of any Docker container):

      .. code-block:: bash

         ip a

      Look for an active interface with an IP address in the same subnet as
      your other nodes. Then, update the following variable in the script, for
      example:

      .. code-block:: bash

         export NCCL_SOCKET_IFNAME=ens50f0np0

      This variable specifies which network interface to use for inter-node communication.
      Setting this variable to the incorrect interface can result in communication failures
      or significantly reduced performance.

   e. RDMA interface

      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
      Then, set the RDMA interfaces to use for communication.

      .. code-block:: bash

         # If using Broadcom NIC
         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
         # If using Mellanox NIC
         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
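   As a combined reference, the following is a minimal sketch for the worker
   node of a hypothetical two-node cluster; the hostname, socket interface, and
   HCA names are placeholders and must match your own network.

   .. code-block:: bash

      # Hypothetical worker node (rank 1) of a two-node cluster; run the
      # equivalent with NODE_RANK=0 on the master node.
      export MASTER_ADDR=node0.example.com   # master node's resolvable hostname
      export NNODES=2
      export NODE_RANK=1
      export NCCL_SOCKET_IFNAME=ens50f0np0   # active interface reported by `ip a`
      export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7  # Broadcom example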
.. _amd-maxtext-download-docker-v255:

Pull the Docker image
---------------------

1. Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/jax-training:maxtext-v25.5

2. Use the following command to launch the Docker container. Note that the benchmarking scripts
   used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
   and execute the benchmark.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5

.. _amd-maxtext-get-started-v255:

Getting started
===============

The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.

.. important::

   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.

Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.
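For example, assuming the default Hugging Face cache location under your home
directory (the path below is an example, not a requirement):

.. code-block:: shell

   # Point HF_HOME at your Hugging Face cache directory (example path)
   export HF_HOME="$HOME/.cache/huggingface"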
Single node training benchmarking examples
------------------------------------------

* Example 1: Single node training with Llama 2 7B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh

* Example 2: Single node training with Llama 2 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh

* Example 3: Single node training with Llama 3 8B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh

* Example 4: Single node training with Llama 3 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh

* Example 5: Single node training with Llama 3.3 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh

* Example 6: Single node training with DeepSeek V2 16B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh

  .. note::

     The TFLOP/s value reported by MaxText for DeepSeek is not accurate. Use
     tokens/s as the performance indicator.

Multi-node training benchmarking examples
-----------------------------------------

The following examples use SLURM to run on multiple nodes -- the commands might need to be adjusted for your
own cluster setup.
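For instance, a hypothetical two-node submission can be monitored with standard
Slurm tools:

.. code-block:: shell

   sbatch -N 2 llama2_7b_multinode.sh   # submit the benchmark across two nodes
   squeue -u $USER                      # check the job's queue status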
* Example 1: Multi-node training with Llama 2 7B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama2_7b_multinode.sh

* Example 2: Multi-node training with Llama 2 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama2_70b_multinode.sh

* Example 3: Multi-node training with the Llama 3 8B model

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama3_8b_multinode.sh

* Example 4: Multi-node training with the Llama 3 70B model

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama3_70b_multinode.sh

Previous versions
=================

See :doc:`jax-maxtext-history` to find documentation for previous releases
of the ``ROCm/jax-training`` Docker image.
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM

The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

- Pre-training

.. _amd-megatron-lm-model-support:
.. _amd-megatron-lm-model-support-24-12:

The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

- Pre-training

.. _amd-megatron-lm-model-support:
.. _amd-megatron-lm-model-support-25-3:

The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t

   .. tab-item:: Llama
      :sync: llama

      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.

@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.

Multi-node training
^^^^^^^^^^^^^^^^^^^

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

- Pre-training

.. _amd-megatron-lm-model-support:
.. _amd-megatron-lm-model-support-25-4:

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.

   .. tab-item:: Llama
      :sync: llama

      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.

@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.

Multi-node training
^^^^^^^^^^^^^^^^^^^
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <

     - Components
     - Resources

   * - v25.7
     -
       * ROCm 6.4.2
       * PyTorch 2.8.0a0+gitd06a406
     -
       * :doc:`Documentation <../pytorch-training>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__

   * - v25.6
     -
       * ROCm 6.3.4
       * PyTorch 2.8.0a0+git7d205b2
     -
       * :doc:`Documentation <../pytorch-training>`
       * :doc:`Documentation <pytorch-training-v25.6>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__

   * - v25.5

@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:

         ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

Previous versions
=================

See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
@@ -0,0 +1,456 @@
:orphan:

.. meta::
   :description: How to train a model using PyTorch for ROCm.
   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

**************************************
Training a model with PyTorch for ROCm
**************************************

.. caution::

   This documentation does not reflect the latest version of the ROCm PyTorch
   training performance documentation. See :doc:`../pytorch-training` for the latest version.
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.

The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.8.0a0+git7d205b2             |
+--------------------------+--------------------------------+
| Python                   | 3.10.17                        |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.14.0+2f85f5f2                |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0.post1                    |
+--------------------------+--------------------------------+
| hipBLASLt                | 0.15.0-8c6919d                 |
+--------------------------+--------------------------------+
| Triton                   | 3.3.0                          |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support-v256:

Supported models
================

The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml

   {% set unified_docker = data.unified_docker.latest %}
   {% set model_groups = data.model_groups %}

   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Workload</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            {% endfor %}
          </div>
        </div>

        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            {% set models = model_group.models %}
            {% for model in models %}
            {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
            {% endfor %}
            {% endfor %}
          </div>
        </div>
      </div>

.. note::

   Some models require an external license agreement through a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements-v256:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

Benchmarking
============

Once the setup is complete, choose between two options to start benchmarking:
.. tab-set::

   .. tab-item:: MAD-integrated benchmarking

      Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
      directory and install the required packages on the host machine.

      .. code-block:: shell

         git clone https://github.com/ROCm/MAD
         cd MAD
         pip install -r requirements.txt

      {% for model_group in model_groups %}
      {% for model in model_group.models %}

      .. container:: model-doc {{ model.mad_tag }}

         For example, use this command to run the performance benchmark test on the {{ model.model }} model
         using one GPU with the {{ model.precision }} data type on the host machine.

         .. code-block:: shell

            export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
            madengine run \
               --tags {{ model.mad_tag }} \
               --keep-model-dir \
               --live-output \
               --timeout 28800

         MAD launches a Docker container with the name
         ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
         model are collected in the following path: ``~/MAD/perf.csv``.

      {% endfor %}
      {% endfor %}
   .. tab-item:: Standalone benchmarking

      .. rubric:: Download the Docker image and required packages

      Use the following command to pull the Docker image from Docker Hub.

      .. code-block:: shell

         docker pull {{ unified_docker.pull_tag }}

      Run the Docker container.

      .. code-block:: shell

         docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}

      Use these commands if you exit the ``training_env`` container and need to return to it.

      .. code-block:: shell

         docker start training_env
         docker exec -it training_env bash

      In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
      repository and navigate to the benchmark scripts directory
      ``/workspace/MAD/scripts/pytorch_train``.

      .. code-block:: shell

         git clone https://github.com/ROCm/MAD
         cd MAD/scripts/pytorch_train

      .. rubric:: Prepare training datasets and dependencies

      The following benchmarking examples require downloading models and datasets
      from Hugging Face. To ensure successful access to gated repos, set your
      ``HF_TOKEN``.

      .. code-block:: shell

         export HF_TOKEN=$your_personal_hugging_face_access_token

      Run the setup script to install libraries and datasets needed for benchmarking.

      .. code-block:: shell

         ./pytorch_benchmark_setup.sh
      .. container:: model-doc pyt_train_llama-3.1-8b

         ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

         .. list-table::
            :header-rows: 1

            * - Library
              - Reference

            * - ``accelerate``
              - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

            * - ``datasets``
              - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

      .. container:: model-doc pyt_train_llama-3.1-70b

         ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

         .. list-table::
            :header-rows: 1

            * - Library
              - Reference

            * - ``datasets``
              - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

            * - ``torchdata``
              - `TorchData <https://pytorch.org/data/beta/index.html>`_

            * - ``tomli``
              - `Tomli <https://pypi.org/project/tomli/>`_

            * - ``tiktoken``
              - `tiktoken <https://github.com/openai/tiktoken>`_

            * - ``blobfile``
              - `blobfile <https://pypi.org/project/blobfile/>`_

            * - ``tabulate``
              - `tabulate <https://pypi.org/project/tabulate/>`_

            * - ``wandb``
              - `Weights & Biases <https://github.com/wandb/wandb>`_

            * - ``sentencepiece``
              - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

            * - ``tensorboard``
              - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

      .. container:: model-doc pyt_train_flux

         ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

         .. list-table::
            :header-rows: 1

            * - Library
              - Reference

            * - ``accelerate``
              - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

            * - ``datasets``
              - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

            * - ``sentencepiece``
              - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

            * - ``tensorboard``
              - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

            * - ``csvkit``
              - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

            * - ``deepspeed``
              - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

            * - ``diffusers``
              - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

            * - ``GitPython``
              - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

            * - ``opencv-python-headless``
              - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

            * - ``peft``
              - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

            * - ``protobuf``
              - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

            * - ``pytest``
              - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

            * - ``python-dotenv``
              - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

            * - ``seaborn``
              - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

            * - ``transformers``
              - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
      {% for model_group in model_groups %}
      {% for model in model_group.models %}
      {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}

      .. container:: model-doc {{ model.mad_tag }}

         .. rubric:: Pretraining

         To start the pre-training benchmark, use the following command with the
         appropriate options. See the list of options and their descriptions in the
         following table; an example invocation follows the table.

         .. code-block:: shell

            ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length

         .. list-table::
            :header-rows: 1

            * - Name
              - Options
              - Description

            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
            * - ``$datatype``
              - ``BF16`` or ``FP8``
              - Only Llama 3.1 8B supports FP8 precision.
            {% else %}
            * - ``$datatype``
              - ``BF16``
              - Only Llama 3.1 8B supports FP8 precision.
            {% endif %}

            * - ``$sequence_length``
              - Between 2048 and 8192. 8192 by default.
              - Sequence length for the language model.
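         For example, a hypothetical invocation with BF16 and the default
         8192-token sequence length might look like this:

         .. code-block:: shell

            # Example values only -- adjust the data type and sequence length
            # per the table above.
            ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p BF16 -s 8192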
      {% if model.mad_tag == "pyt_train_flux" %}

      .. container:: model-doc {{ model.mad_tag }}

         .. note::

            Occasionally, downloading the Flux dataset might fail. In the event of this
            error, manually download it from Hugging Face at
            `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
            and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
            the required dataset.
      {% endif %}
      {% endif %}
      {% if model_group.tag == "fine-tuning" %}

      .. container:: model-doc {{ model.mad_tag }}

         .. rubric:: Fine-tuning

         To start the fine-tuning benchmark, use the following command with the
         appropriate options. See the list of options and their descriptions in the
         following table; an example invocation follows the table.

         .. code-block:: shell

            ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length

         .. list-table::
            :header-rows: 1

            * - Name
              - Options
              - Description

            * - ``$training_mode``
              - ``finetune_fw``
              - Full weight fine-tuning (BF16 supported)

            * -
              - ``finetune_lora``
              - LoRA fine-tuning (BF16 supported)

            * -
              - ``finetune_qlora``
              - QLoRA fine-tuning (BF16 supported)

            * -
              - ``HF_finetune_lora``
              - LoRA fine-tuning with Hugging Face PEFT

            * - ``$datatype``
              - ``BF16``
              - All models support BF16.

            * - ``$sequence_length``
              - Between 2048 and 16384.
              - Sequence length for the language model.
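         For instance, a hypothetical LoRA fine-tuning run with BF16 and an
         8192-token sequence length might look like this:

         .. code-block:: shell

            # Example values only -- the training mode must be one the model
            # supports (see the note below).
            ./pytorch_benchmark_report.sh -t finetune_lora -m {{ model.model_repo }} -p BF16 -s 8192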
         .. note::

            {{ model.model }} currently supports the following fine-tuning methods:

            {% for method in model.training_modes %}
            * ``{{ method }}``
            {% endfor %}
            {% if model.training_modes|length < 4 %}

            The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
            does not currently provide YAML configuration files for other combinations of
            model and fine-tuning method.
            However, you can still configure your own YAML files to enable support for
            fine-tuning methods not listed here by following existing patterns in the
            ``/workspace/torchtune/recipes/configs`` directory.
            {% endif %}
      {% endif %}
      {% endfor %}
      {% endfor %}

      .. rubric:: Benchmarking examples

      For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

Further reading
===============

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm

PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.

The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.8.0a0+git7d205b2             |
+--------------------------+--------------------------------+
| Python                   | 3.10.17                        |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.14.0+2f85f5f2                |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0.post1                    |
+--------------------------+--------------------------------+
| hipBLASLt                | 0.15.0-8c6919d                 |
+--------------------------+--------------------------------+
| Triton                   | 3.3.0                          |
+--------------------------+--------------------------------+
   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}
   The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
   (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
   model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
   training workloads:

   .. list-table::
      :header-rows: 1

      * - Software component
        - Version

      {% for component_name, component_version in docker.components.items() %}
      * - {{ component_name }}
        - {{ component_version }}
      {% endfor %}

.. _amd-pytorch-training-model-support:
@@ -38,26 +35,27 @@ Supported models
================

The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
Some instructions, commands, and training recommendations in this documentation might
vary by model -- select one to get started.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

   {% set unified_docker = data.unified_docker.latest %}
   {% set unified_docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}

   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Workload</div>
          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            {% endfor %}
          </div>
        </div>

        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="col-2 me-2 model-param-head">Model variant</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            {% set models = model_group.models %}
@@ -73,84 +71,116 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
          </div>
        </div>

.. note::

   Some models require an external license agreement through a third party (for example, Meta).
.. _amd-pytorch-training-supported-training-modes:

.. _amd-pytorch-training-performance-measurements:
The following table lists supported training modes per model.

Performance measurements
========================
.. dropdown:: Supported training modes

To evaluate performance, the
   .. list-table::
      :header-rows: 1

      * - Model
        - Supported training modes

      {% for model_group in model_groups %}
      {% set models = model_group.models %}
      {% for model in models %}
      * - {{ model.model }}
        - ``{{ model.training_modes | join('``, ``') }}``

      {% endfor %}
      {% endfor %}

   .. note::

      Some model and fine-tuning combinations are not listed. This is
      because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
      doesn't provide default YAML configurations for them.
      For advanced usage, you can create a custom configuration to enable
      unlisted fine-tuning methods by using an existing file in the
      ``/workspace/torchtune/recipes/configs`` directory as a template.

.. _amd-pytorch-training-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   page provides reference throughput and latency measurements for training
   popular AI models.
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

.. note::
System validation
=================

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

System validation
=================
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t test configurations and run conditions outside those described.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Run training
============

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

Benchmarking
============
   {% set unified_docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}

Once the setup is complete, choose between two options to start benchmarking:
Once the setup is complete, choose between two options to start benchmarking training:

.. tab-set::

   .. tab-item:: MAD-integrated benchmarking

      Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
      directory and install the required packages on the host machine.
      1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
         directory and install the required packages on the host machine.

      .. code-block:: shell

         git clone https://github.com/ROCm/MAD
         cd MAD
         pip install -r requirements.txt

      {% for model_group in model_groups %}
      {% for model in model_group.models %}

      .. container:: model-doc {{ model.mad_tag }}

         For example, use this command to run the performance benchmark test on the {{ model.model }} model
         using one GPU with the {{ model.precision }} data type on the host machine.
         2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
            using one node with the {{ model.precision }} data type on the host machine.

         .. code-block:: shell

            export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
            madengine run \
               --tags {{ model.mad_tag }} \
               --keep-model-dir \
               --live-output \
               --timeout 28800

         MAD launches a Docker container with the name
         ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
         model are collected in the following path: ``~/MAD/perf.csv``.
         MAD launches a Docker container with the name
         ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
         model are collected in ``~/MAD/perf.csv``.

      {% endfor %}
      {% endfor %}
@@ -159,222 +189,213 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
Run the Docker container.
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ unified_docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||
|
||||
* - ``tiktoken``
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`_
|
||||
* - ``tiktoken``
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`_
|
||||
|
||||
* - ``blobfile``
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`_
|
||||
* - ``blobfile``
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`_
|
||||
|
||||
* - ``tabulate``
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`_
|
||||
* - ``tabulate``
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`_
|
||||
|
||||
* - ``wandb``
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
||||
* - ``wandb``
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||
|
||||
.. container:: model-doc pyt_train_flux
|
||||
.. container:: model-doc pyt_train_flux
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||
|
||||
   .. list-table::
      :header-rows: 1

      * - Library
        - Reference

      * - ``accelerate``
        - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

      * - ``datasets``
        - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

      * - ``sentencepiece``
        - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

      * - ``tensorboard``
        - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

      * - ``csvkit``
        - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

      * - ``deepspeed``
        - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

      * - ``diffusers``
        - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

      * - ``GitPython``
        - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

      * - ``opencv-python-headless``
        - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

      * - ``peft``
        - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

      * - ``protobuf``
        - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

      * - ``pytest``
        - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

      * - ``python-dotenv``
        - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

      * - ``seaborn``
        - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

      * - ``transformers``
        - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
   ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

   * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
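As a quick sanity check after running the setup script, you can ask ``pip`` for the
installed versions of a few of the pinned libraries. This is a hedged illustration,
not part of the setup script; the package names are taken from the tables above.

.. code-block:: shell

   # Verify a few pinned packages after pytorch_benchmark_setup.sh completes.
   pip show datasets sentencepiece tensorboard | grep -E '^(Name|Version):'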
{% for model_group in model_groups %}
{% for model in model_group.models %}
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
{% set training_modes = model.training_modes %}
{% set training_mode_descs = {
   "pretrain": "Benchmark pre-training.",
   "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
} %}
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
{% if available_modes %}

.. container:: model-doc {{ model.mad_tag }}

   .. rubric:: Pre-training

   To start the pre-training benchmark, use the following command with the
   appropriate options. See the following list of options and their descriptions.

   .. code-block:: shell

      ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
        -m {{ model.model_repo }} \
        -p $datatype \
        -s $sequence_length

   .. list-table::
      :header-rows: 1

      * - Name
        - Options
        - Description

      {% for mode in available_modes %}
      * - {% if loop.first %}``$training_mode``{% endif %}
        - ``{{ mode }}``
        - {{ training_mode_descs[mode] }}
      {% endfor %}

      * - ``$datatype``
        - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
        - Only Llama 3.1 8B supports FP8 precision.

      * - ``$sequence_length``
        - Between 2048 and 8192. 8192 by default.
        - Sequence length for the language model.
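For illustration, a single BF16 pre-training run at the default sequence length
might look like the following. The model name is a placeholder: substitute the
``model_repo`` value rendered by the template above, since the real values are
not shown in this excerpt.

.. code-block:: shell

   # Hypothetical invocation; replace Llama-3.1-8B with your model's repo name.
   ./pytorch_benchmark_report.sh -t pretrain -m Llama-3.1-8B -p BF16 -s 8192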
{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}

   .. note::

      Currently, FLUX models are not supported out of the box on {{ unified_docker.pull_tag }}.
      To use FLUX, refer to the previous version of the ``pytorch-training`` Docker image:
      :doc:`previous-versions/pytorch-training-v25.6`.

      Occasionally, downloading the FLUX dataset might fail. If this happens, manually
      download it from Hugging Face at
      `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
      and save it to ``/workspace/FluxBenchmark`` so that the test script can access the
      required dataset.
{% endif %}
{% endif %}
{% endif %}
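A hedged sketch of that manual download, using ``huggingface-cli`` the same way
the multi-node section below does; FLUX.1-dev is a gated repository, so
authenticate first:

.. code-block:: shell

   huggingface-cli login  # Get access to the gated FLUX.1-dev repository
   huggingface-cli download black-forest-labs/FLUX.1-dev --local-dir /workspace/FluxBenchmark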
@@ -383,53 +404,143 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
{% if model_group.tag == "fine-tuning" %}
{% set training_mode_descs = {
   "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
   "finetune_lora": "LoRA fine-tuning (BF16 supported).",
   "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
   "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT."
} %}
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
{% if available_modes %}

.. container:: model-doc {{ model.mad_tag }}

   .. rubric:: Fine-tuning

   To start the fine-tuning benchmark, use the following command with the
   appropriate options. See the following list of options and their descriptions,
   and see :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.

   .. code-block:: shell

      ./pytorch_benchmark_report.sh -t $training_mode \
        -m {{ model.model_repo }} \
        -p $datatype \
        -s $sequence_length

   .. list-table::
      :header-rows: 1

      * - Name
        - Options
        - Description

      {% for mode in available_modes %}
      * - {% if loop.first %}``$training_mode``{% endif %}
        - ``{{ mode }}``
        - {{ training_mode_descs[mode] }}
      {% endfor %}

      * - ``$datatype``
        - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
        - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}

      * - ``$sequence_length``
        - Between 2048 and 16384.
        - Sequence length for the language model.

{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
   .. note::

      For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
      use the following torchtune commit for compatibility:

      .. code-block:: shell

         git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e

{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
   .. note::

      You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
      input tensor should be smaller than max_seq_len (4096)``.
      This error indicates that an input sequence is longer than the model's maximum context window.

      Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
      tokens in this case). You can resolve this by truncating the input or splitting
      it into smaller chunks before passing it to the model.

      Note on reproducibility: the results in this guide are based on commit
      ``b4c98ac`` of the upstream `torchtune <https://github.com/pytorch/torchtune>`__
      repository. For the latest updates, you can use the main branch.

      The upstream torchtune repository does not currently provide YAML configuration
      files for every combination of model and fine-tuning method. However, you can
      still configure your own YAML files to enable fine-tuning methods not listed
      here by following the existing patterns in the
      ``/workspace/torchtune/recipes/configs`` directory.
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
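For illustration, a LoRA fine-tuning run in BF16 might be launched as follows.
As with the pre-training example, the model name is a placeholder for the
rendered ``model_repo`` value.

.. code-block:: shell

   # Hypothetical invocation; substitute your model's repo name.
   ./pytorch_benchmark_report.sh -t finetune_lora \
     -m Llama-3.1-8B \
     -p BF16 \
     -s 8192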
.. rubric:: Benchmarking examples

For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
Multi-node training
-------------------

Pre-training
~~~~~~~~~~~~

Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.

To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.

.. code-block:: shell

   # In the MAD repository
   cd scripts/pytorch_train
   sbatch run_slurm_train.sh
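If your cluster layout differs from the script's defaults, ``sbatch`` accepts
overrides on the command line, and command-line options take precedence over
``#SBATCH`` directives inside the script. A hedged example, assuming a
four-node allocation:

.. code-block:: shell

   # Override the node count without editing run_slurm_train.sh.
   sbatch --nodes=4 run_slurm_train.sh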
Fine-tuning
~~~~~~~~~~~

Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.

To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.

.. code-block:: shell

   huggingface-cli login  # Get access to the gated Hugging Face Llama model space
   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct  # Download the Llama 3.3 model locally
   # In the MAD repository
   cd scripts/pytorch_train
   sbatch Torchtune_Multinode.sh

.. note::

   Information regarding benchmark setup:

   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
   * You can adjust the torchtune `YAML configuration file
     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
     if you're using a different model.
   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
   * Set the ``mounting_paths`` inside the SLURM script.

Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.

Further reading
===============
@@ -32,19 +32,23 @@ subtrees:
      - file: compatibility/ml-compatibility/pytorch-compatibility.rst
        title: PyTorch compatibility
      - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
        title: TensorFlow compatibility
      - file: compatibility/ml-compatibility/jax-compatibility.rst
        title: JAX compatibility
      - file: compatibility/ml-compatibility/verl-compatibility.rst
        title: verl compatibility
      - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
        title: Stanford Megatron-LM compatibility
      - file: compatibility/ml-compatibility/dgl-compatibility.rst
        title: DGL compatibility
      - file: compatibility/ml-compatibility/megablocks-compatibility.rst
        title: Megablocks compatibility
      - file: compatibility/ml-compatibility/taichi-compatibility.rst
        title: Taichi compatibility
      - file: compatibility/ml-compatibility/ray-compatibility.rst
        title: Ray compatibility
      - file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
        title: llama.cpp compatibility
      - file: how-to/build-rocm.rst
        title: Build ROCm from source
79
tools/rocm-build/default.xml
Normal file
@@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
  <remote name="rocm-org" fetch="https://github.com/ROCm/" />
  <default revision="refs/tags/20250912-42"
           remote="rocm-org"
           sync-c="true"
           sync-j="4" />
  <!-- List of projects for ROCm -->
  <project name="aqlprofile" />
  <project name="ROCR-Runtime" />
  <project name="amdsmi" />
  <project name="rdc" />
  <project name="rocm_bandwidth_test" />
  <project name="rocm_smi_lib" />
  <project name="rocm-core" />
  <project name="rocm-examples" />
  <project name="rocminfo" />
  <project name="rocprofiler" />
  <project name="rocprofiler-register" />
  <project name="rocprofiler-sdk" />
  <project name="rocprofiler-compute" />
  <project name="rocprofiler-systems" />
  <project name="roctracer" />
  <!-- HIP projects -->
  <project name="HIP" />
  <project name="hip-tests" />
  <project name="HIPIFY" />
  <project name="clr" />
  <project name="hipother" />
  <!-- The following projects are all associated with the AMDGPU LLVM compiler -->
  <project name="half" />
  <project name="llvm-project" />
  <project name="spirv-llvm-translator" />
  <!-- gdb projects -->
  <project name="ROCdbgapi" />
  <project name="ROCgdb" />
  <project name="rocr_debug_agent" />
  <!-- ROCm libraries -->
  <project groups="mathlibs" name="AMDMIGraphX" />
  <project groups="mathlibs" name="MIVisionX" />
  <project groups="mathlibs" name="ROCmValidationSuite" />
  <project groups="mathlibs" name="composable_kernel" />
  <project groups="mathlibs" name="hipSOLVER" />
  <project groups="mathlibs" name="hipTensor" />
  <project groups="mathlibs" name="hipfort" />
  <project groups="mathlibs" name="rccl" />
  <project groups="mathlibs" name="rocAL" />
  <project groups="mathlibs" name="rocALUTION" />
  <project groups="mathlibs" name="rocDecode" />
  <project groups="mathlibs" name="rocJPEG" />
  <project groups="mathlibs" name="rocm-libraries">
    <linkfile src="projects/hipcub" dest="hipCUB"/>
    <linkfile src="projects/rocprim" dest="rocPRIM"/>
    <linkfile src="projects/hiprand" dest="hipRAND"/>
    <linkfile src="projects/rocrand" dest="rocRAND"/>
    <linkfile src="projects/rocthrust" dest="rocThrust"/>
    <linkfile src="projects/hipblas-common" dest="hipBLAS-common"/>
    <linkfile src="projects/hipblaslt" dest="hipBLASLt"/>
    <linkfile src="projects/rocblas" dest="rocBLAS"/>
    <linkfile src="projects/hipsparselt" dest="hipSPARSELt"/>
    <linkfile src="projects/rocsparse" dest="rocSPARSE"/>
    <linkfile src="projects/hipsparse" dest="hipSPARSE"/>
    <linkfile src="projects/hipblas" dest="hipBLAS"/>
    <linkfile src="projects/miopen" dest="MIOpen"/>
    <linkfile src="projects/hipfft" dest="hipFFT"/>
    <linkfile src="projects/rocfft" dest="rocFFT"/>
  </project>
  <project groups="mathlibs" name="rocPyDecode" />
  <project groups="mathlibs" name="rocSHMEM" />
  <project groups="mathlibs" name="rocSOLVER" />
  <project groups="mathlibs" name="rocWMMA" />
  <project groups="mathlibs" name="rocm-cmake" />
  <project groups="mathlibs" name="rpp" />
  <project groups="mathlibs" name="TransferBench" />
  <!-- Projects for OpenMP-Extras -->
  <project name="aomp" path="openmp-extras/aomp" />
  <project name="aomp-extras" path="openmp-extras/aomp-extras" />
  <project name="flang" path="openmp-extras/flang" />
</manifest>
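This manifest uses the ``repo`` tool's format (``remote``, ``default``,
``project``, and ``linkfile`` elements). A hedged sketch of how it might be
consumed; the manifest repository URL and the ``-m`` path are assumptions, not
confirmed by this diff:

.. code-block:: shell

   # Hypothetical usage: fetch the sources pinned by this manifest with "repo".
   repo init -u https://github.com/ROCm/ROCm.git -m tools/rocm-build/default.xml
   repo sync -c -j4  # -c and -j4 mirror the manifest's sync-c="true" and sync-j="4"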