Mirror of https://github.com/ROCm/ROCm.git, synced 2026-01-09 06:38:00 -05:00
Merge remote-tracking branch 'external/develop' into sync-develop-from-external
@@ -86,8 +86,7 @@ jobs:
 value: $(Agent.BuildDirectory)/rocm
 - name: HIP_INC_DIR
 value: $(Agent.BuildDirectory)/rocm
-pool:
-vmImage: ${{ variables.BASE_BUILD_POOL }}
+pool: ${{ variables.MEDIUM_BUILD_POOL }}
 workspace:
 clean: all
 steps:
@@ -33,8 +33,9 @@ parameters:
 type: object
 default:
 - cmake
-- libmsgpack-dev
+- libboost-filesystem-dev
 - libboost-program-options-dev
+- libmsgpack-dev
 - name: pipModules
 type: object
 default:
@@ -107,6 +107,7 @@ jobs:
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
 parameters:
 gpuTarget: ${{ job.target }}
+# if this artifact name is changed, please also update $ARTIFACT_URL inside miopen-get-ck-build.yml
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
 parameters:
 gpuTarget: ${{ job.target }}
@@ -39,4 +39,6 @@ jobs:
 parameters:
 os: ${{ job.os }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+inputs:
+os: ${{ job.os }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -51,15 +51,15 @@ parameters:
 buildJobs:
 - { os: ubuntu2204, packageManager: apt }
 - { os: almalinux8, packageManager: dnf }
-# - name: downstreamComponentMatrix
-# type: object
-# default:
-# - hipBLASLt:
-# name: hipBLASLt
-# sparseCheckoutDir: projects/hipblaslt
-# skipUnifiedBuild: 'false'
-# buildDependsOn:
-# - hipBLAS_common_build
+- name: downstreamComponentMatrix
+type: object
+default:
+- hipBLASLt:
+name: hipBLASLt
+sparseCheckoutDir: projects/hipblaslt
+skipUnifiedBuild: 'false'
+buildDependsOn:
+- hipBLAS_common_build
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -122,14 +122,14 @@ jobs:
 # extraEnvVars:
 # - ROCM_PATH:::/home/user/workspace/rocm
 
-# - ${{ if parameters.triggerDownstreamJobs }}:
-# - ${{ each component in parameters.downstreamComponentMatrix }}:
-# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
-# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
-# parameters:
-# checkoutRepo: ${{ parameters.checkoutRepo }}
-# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
-# buildDependsOn: ${{ component.buildDependsOn }}
-# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
-# triggerDownstreamJobs: true
-# unifiedBuild: ${{ parameters.unifiedBuild }}
+- ${{ if parameters.triggerDownstreamJobs }}:
+- ${{ each component in parameters.downstreamComponentMatrix }}:
+- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+parameters:
+checkoutRepo: ${{ parameters.checkoutRepo }}
+sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+buildDependsOn: ${{ component.buildDependsOn }}
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+triggerDownstreamJobs: true
+unifiedBuild: ${{ parameters.unifiedBuild }}
@@ -77,28 +77,28 @@ parameters:
 type: object
 default:
 buildJobs:
-- { os: ubuntu2204, packageManager: apt, target: gfx942 }
-- { os: ubuntu2204, packageManager: apt, target: gfx90a }
-- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
-- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
-# - { os: almalinux8, packageManager: dnf, target: gfx942 }
-# - { os: almalinux8, packageManager: dnf, target: gfx90a }
-# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
-# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
-# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
+- { pool: rocm-ci_ultra_build_pool, os: ubuntu2204, packageManager: apt, target: gfx942 }
+- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx90a }
+- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1201 }
+- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1100 }
+- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1030 }
+- { pool: rocm-ci_ultra_build_pool, os: almalinux8, packageManager: dnf, target: gfx942 }
+- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx90a }
+- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1201 }
+- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1100 }
+- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1030 }
 testJobs:
 - { os: ubuntu2204, packageManager: apt, target: gfx942 }
 - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-# - name: downstreamComponentMatrix
-# type: object
-# default:
-# - rocBLAS:
-# name: rocBLAS
-# sparseCheckoutDir: projects/rocblas
-# skipUnifiedBuild: 'false'
-# buildDependsOn:
-# - hipBLASLt_build
+- name: downstreamComponentMatrix
+type: object
+default:
+- rocBLAS:
+name: rocBLAS
+sparseCheckoutDir: projects/rocblas
+skipUnifiedBuild: 'false'
+buildDependsOn:
+- hipBLASLt_build
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -121,7 +121,7 @@ jobs:
 value: $(Agent.BuildDirectory)/rocm
 - name: DAY_STRING
 value: $[format('{0:ddMMyyyy}', pipeline.startTime)]
-pool: ${{ variables.ULTRA_BUILD_POOL }}
+pool: ${{ job.pool }}
 ${{ if eq(job.os, 'almalinux8') }}:
 container:
 image: rocmexternalcicd.azurecr.io/manylinux228:latest
@@ -140,6 +140,10 @@ jobs:
 parameters:
 checkoutRepo: ${{ parameters.checkoutRepo }}
 sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+parameters:
+dependencyList:
+- gtest
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
 parameters:
 checkoutRef: ${{ parameters.checkoutRef }}
@@ -156,18 +160,15 @@ jobs:
 script: |
 echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
 echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
-# hipBLASLt has a script for gtest and lapack
-# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
-# $(Agent.BuildDirectory)/deps is a temporary folder for the build process
-# $(Agent.BuildDirectory)/s/deps is part of the hipBLASLt repo
 - task: Bash@3
-displayName: Build and install external dependencies
+displayName: Build and install LAPACK
 inputs:
 targetType: inline
 script: |
-mkdir -p $(Agent.BuildDirectory)/deps
-cd $(Agent.BuildDirectory)/deps
-cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
+mkdir -p $(Agent.BuildDirectory)/temp-deps
+cd $(Agent.BuildDirectory)/temp-deps
+# position-independent LAPACK is required for almalinux8 builds
+cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
 make
 sudo make install
 - script: |
@@ -187,7 +188,7 @@ jobs:
 parameters:
 os: ${{ job.os }}
 extraBuildFlags: >-
--DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
 -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
 -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
 -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
@@ -244,6 +245,7 @@ jobs:
 workspace:
 clean: all
 steps:
+- checkout: none
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
 parameters:
 aptPackages: ${{ parameters.aptPackages }}
@@ -280,14 +282,14 @@ jobs:
 environment: test
 gpuTarget: ${{ job.target }}
 
-# - ${{ if parameters.triggerDownstreamJobs }}:
-# - ${{ each component in parameters.downstreamComponentMatrix }}:
-# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
-# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
-# parameters:
-# checkoutRepo: ${{ parameters.checkoutRepo }}
-# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
-# buildDependsOn: ${{ component.buildDependsOn }}
-# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
-# triggerDownstreamJobs: true
-# unifiedBuild: ${{ parameters.unifiedBuild }}
+- ${{ if parameters.triggerDownstreamJobs }}:
+- ${{ each component in parameters.downstreamComponentMatrix }}:
+- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+parameters:
+checkoutRepo: ${{ parameters.checkoutRepo }}
+sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+buildDependsOn: ${{ component.buildDependsOn }}
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+triggerDownstreamJobs: true
+unifiedBuild: ${{ parameters.unifiedBuild }}
@@ -156,6 +156,7 @@ jobs:
 workspace:
 clean: all
 steps:
+- checkout: none
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
 parameters:
 aptPackages: ${{ parameters.aptPackages }}
@@ -70,8 +70,7 @@ jobs:
 variables:
 - group: common
 - template: /.azuredevops/variables-global.yml
-pool:
-vmImage: ${{ variables.BASE_BUILD_POOL }}
+pool: ${{ variables.MEDIUM_BUILD_POOL }}
 workspace:
 clean: all
 steps:
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+type: string
+default: hipSPARSE
 - name: checkoutRepo
 type: string
 default: 'self'
 - name: checkoutRef
 type: string
 default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+type: string
+default: ''
+- name: triggerDownstreamJobs
+type: boolean
+default: false
+- name: downstreamAggregateNames
+type: string
+default: ''
+- name: buildDependsOn
+type: object
+default: null
+- name: unifiedBuild
+type: boolean
+default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -14,13 +33,11 @@ parameters:
 type: object
 default:
 - cmake
-- ninja-build
-- libboost-program-options-dev
-- googletest
-- libfftw3-dev
-- git
 - gfortran
-- libgtest-dev
+- git
+- libboost-program-options-dev
+- libfftw3-dev
+- ninja-build
 - python3-pip
 - name: rocmDependencies
 type: object
@@ -49,19 +66,31 @@ parameters:
 type: object
 default:
 buildJobs:
-- gfx942:
-target: gfx942
-- gfx90a:
-target: gfx90a
+- { os: ubuntu2204, packageManager: apt, target: gfx942 }
+- { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
+- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
+- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
 testJobs:
-- gfx942:
-target: gfx942
-- gfx90a:
-target: gfx90a
+- { os: ubuntu2204, packageManager: apt, target: gfx942 }
+- { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+type: object
+default:
+- hipSPARSELt:
+name: hipSPARSELt
+sparseCheckoutDir: projects/hipsparselt
+skipUnifiedBuild: 'false'
+buildDependsOn:
+- hipSPARSE_build
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-- job: hipSPARSE_build_${{ job.target }}
+- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+${{ if parameters.buildDependsOn }}:
+dependsOn:
+- ${{ each build in parameters.buildDependsOn }}:
+- ${{ build }}_${{ job.os }}_${{ job.target }}
 variables:
 - group: common
 - template: /.azuredevops/variables-global.yml
@@ -73,42 +102,57 @@ jobs:
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
 parameters:
 aptPackages: ${{ parameters.aptPackages }}
+packageManager: ${{ job.packageManager }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
 parameters:
 checkoutRepo: ${{ parameters.checkoutRepo }}
+sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+parameters:
+dependencyList:
+- gtest
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
 parameters:
 checkoutRef: ${{ parameters.checkoutRef }}
 dependencyList: ${{ parameters.rocmDependencies }}
 gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
 aggregatePipeline: ${{ parameters.aggregatePipeline }}
+${{ if parameters.triggerDownstreamJobs }}:
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
 parameters:
+os: ${{ job.os }}
 extraBuildFlags: >-
+-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
 -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
 -DCMAKE_BUILD_TYPE=Release
--DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
 -DBUILD_CLIENTS_TESTS=ON
 -DBUILD_CLIENTS_SAMPLES=OFF
 -GNinja
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
 parameters:
-artifactName: hipSPARSE
+componentName: ${{ parameters.componentName }}
+sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
 gpuTarget: ${{ job.target }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
 parameters:
-artifactName: hipSPARSE
+componentName: ${{ parameters.componentName }}
 gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
 publish: false
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
 parameters:
-sourceDir: $(Build.SourcesDirectory)/build/clients
+sourceDir: $(Agent.BuildDirectory)/s/build/clients
 contentsString: matrices/**
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
 parameters:
+componentName: ${{ parameters.componentName }}
 artifactName: testMatrices
 gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
 # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
 # parameters:
@@ -116,44 +160,65 @@ jobs:
 # environment: test
 # gpuTarget: ${{ job.target }}
 
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-- job: hipSPARSE_test_${{ job.target }}
-dependsOn: hipSPARSE_build_${{ job.target }}
-condition:
-and(succeeded(),
-eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-eq(${{ parameters.aggregatePipeline }}, False)
-)
-variables:
-- group: common
-- template: /.azuredevops/variables-global.yml
-pool: ${{ job.target }}_test_pool
-workspace:
-clean: all
-steps:
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-parameters:
-aptPackages: ${{ parameters.aptPackages }}
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-parameters:
-gpuTarget: ${{ job.target }}
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-parameters:
-checkoutRef: ${{ parameters.checkoutRef }}
-dependencyList: ${{ parameters.rocmTestDependencies }}
-gpuTarget: ${{ job.target }}
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-parameters:
-componentName: hipSPARSE
-testDir: '$(Agent.BuildDirectory)/rocm/bin'
-testExecutable: './hipsparse-test'
-testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-parameters:
-aptPackages: ${{ parameters.aptPackages }}
-environment: test
-gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+condition:
+and(succeeded(),
+eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+eq(${{ parameters.aggregatePipeline }}, False)
+)
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+pool: ${{ job.target }}_test_pool
+workspace:
+clean: all
+steps:
+- checkout: none
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+parameters:
+aptPackages: ${{ parameters.aptPackages }}
+packageManager: ${{ job.packageManager }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+parameters:
+preTargetFilter: ${{ parameters.componentName }}
+gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+parameters:
+checkoutRef: ${{ parameters.checkoutRef }}
+dependencyList: ${{ parameters.rocmTestDependencies }}
+gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
+${{ if parameters.triggerDownstreamJobs }}:
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+parameters:
+componentName: ${{ parameters.componentName }}
+os: ${{ job.os }}
+testDir: '$(Agent.BuildDirectory)/rocm/bin'
+testExecutable: './hipsparse-test'
+testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+parameters:
+aptPackages: ${{ parameters.aptPackages }}
+environment: test
+gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+- ${{ each component in parameters.downstreamComponentMatrix }}:
+- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+parameters:
+checkoutRepo: ${{ parameters.checkoutRepo }}
+sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+buildDependsOn: ${{ component.buildDependsOn }}
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+triggerDownstreamJobs: true
+unifiedBuild: ${{ parameters.unifiedBuild }}
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+type: string
+default: hipSPARSELt
 - name: checkoutRepo
 type: string
 default: 'self'
 - name: checkoutRef
 type: string
 default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+type: string
+default: ''
+- name: triggerDownstreamJobs
+type: boolean
+default: false
+- name: downstreamAggregateNames
+type: string
+default: ''
+- name: buildDependsOn
+type: object
+default: null
+- name: unifiedBuild
+type: boolean
+default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -56,15 +75,17 @@ parameters:
 type: object
 default:
 buildJobs:
-- gfx942:
-target: gfx942
+- { os: ubuntu2204, packageManager: apt, target: gfx942 }
 testJobs:
-- gfx942:
-target: gfx942
+- { os: ubuntu2204, packageManager: apt, target: gfx942 }
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-- job: hipSPARSELt_build_${{ job.target }}
+- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+${{ if parameters.buildDependsOn }}:
+dependsOn:
+- ${{ each build in parameters.buildDependsOn }}:
+- ${{ build }}_${{ job.os }}_${{ job.target }}
 variables:
 - group: common
 - template: /.azuredevops/variables-global.yml
@@ -86,17 +107,22 @@ jobs:
 parameters:
 aptPackages: ${{ parameters.aptPackages }}
 pipModules: ${{ parameters.pipModules }}
+packageManager: ${{ job.packageManager }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
 parameters:
 checkoutRepo: ${{ parameters.checkoutRepo }}
+sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
 parameters:
 checkoutRef: ${{ parameters.checkoutRef }}
 dependencyList: ${{ parameters.rocmDependencies }}
 gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
 aggregatePipeline: ${{ parameters.aggregatePipeline }}
+${{ if parameters.triggerDownstreamJobs }}:
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
 # Build and install gtest and lapack
 # $(Pipeline.Workspace)/deps is a temporary folder for the build process
 # $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
@@ -115,6 +141,7 @@ jobs:
 workingDirectory: $(Pipeline.Workspace)/deps
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
 parameters:
+os: ${{ job.os }}
 extraBuildFlags: >-
 -DCMAKE_BUILD_TYPE=Release
 -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
@@ -130,64 +157,80 @@ jobs:
 -GNinja
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
 parameters:
+componentName: ${{ parameters.componentName }}
 gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
+sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
 parameters:
+componentName: ${{ parameters.componentName }}
 gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-parameters:
-aptPackages: ${{ parameters.aptPackages }}
-pipModules: ${{ parameters.pipModules }}
-gpuTarget: ${{ job.target }}
-extraCopyDirectories:
-- deps
-extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
-extraEnvVars:
-- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
-- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
-- CMAKE_CXX_COMPILER:::/home/user/workspace/rocm/llvm/bin/hipcc
-- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
-installLatestCMake: true
+- ${{ if eq(job.os, 'ubuntu2204') }}:
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+parameters:
+aptPackages: ${{ parameters.aptPackages }}
+pipModules: ${{ parameters.pipModules }}
+gpuTarget: ${{ job.target }}
+extraCopyDirectories:
+- deps
+extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
+extraEnvVars:
+- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
+- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
+- CMAKE_CXX_COMPILER:::/home/user/workspace/rocm/llvm/bin/hipcc
+- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
+installLatestCMake: true
 
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-- job: hipSPARSELt_test_${{ job.target }}
-dependsOn: hipSPARSELt_build_${{ job.target }}
-condition:
-and(succeeded(),
-eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-eq(${{ parameters.aggregatePipeline }}, False)
-)
-variables:
-- group: common
-- template: /.azuredevops/variables-global.yml
-pool: ${{ job.target }}_test_pool
-workspace:
-clean: all
-steps:
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-parameters:
-aptPackages: ${{ parameters.aptPackages }}
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-parameters:
-gpuTarget: ${{ job.target }}
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-parameters:
-checkoutRef: ${{ parameters.checkoutRef }}
-dependencyList: ${{ parameters.rocmTestDependencies }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+timeoutInMinutes: 120
+dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+condition:
+and(succeeded(),
+eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+eq(${{ parameters.aggregatePipeline }}, False)
+)
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+pool: ${{ job.target }}_test_pool
+workspace:
+clean: all
+steps:
+- checkout: none
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+parameters:
+aptPackages: ${{ parameters.aptPackages }}
+packageManager: ${{ job.packageManager }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+parameters:
+preTargetFilter: ${{ parameters.componentName }}
 gpuTarget: ${{ job.target }}
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+os: ${{ job.os }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
 parameters:
-componentName: hipSPARSELt
-testDir: '$(Agent.BuildDirectory)/rocm/bin'
-testExecutable: './hipsparselt-test'
-testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-parameters:
-aptPackages: ${{ parameters.aptPackages }}
-pipModules: ${{ parameters.pipModules }}
-environment: test
-gpuTarget: ${{ job.target }}
+checkoutRef: ${{ parameters.checkoutRef }}
+dependencyList: ${{ parameters.rocmTestDependencies }}
+gpuTarget: ${{ job.target }}
+os: ${{ job.os }}
+${{ if parameters.triggerDownstreamJobs }}:
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+parameters:
+componentName: ${{ parameters.componentName }}
+os: ${{ job.os }}
+testDir: '$(Agent.BuildDirectory)/rocm/bin'
+testExecutable: './hipsparselt-test'
+testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+parameters:
+aptPackages: ${{ parameters.aptPackages }}
+pipModules: ${{ parameters.pipModules }}
+environment: test
+gpuTarget: ${{ job.target }}
@@ -67,7 +67,6 @@ jobs:
 parameters:
 checkoutRef: ${{ parameters.checkoutRef }}
 dependencyList: ${{ parameters.rocmDependencies }}
-skipLlvmSymlink: true
 aggregatePipeline: ${{ parameters.aggregatePipeline }}
 os: ${{ job.os }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
@@ -86,8 +86,7 @@ jobs:
 variables:
 - group: common
 - template: /.azuredevops/variables-global.yml
-pool:
-vmImage: ${{ variables.BASE_BUILD_POOL }}
+pool: ${{ variables.MEDIUM_BUILD_POOL }}
 workspace:
 clean: all
 steps:
@@ -73,8 +73,7 @@ jobs:
 - template: /.azuredevops/variables-global.yml
 - name: HIP_ROCCLR_HOME
 value: $(Build.BinariesDirectory)/rocm
-pool:
-vmImage: ${{ variables.BASE_BUILD_POOL }}
+pool: ${{ variables.MEDIUM_BUILD_POOL }}
 workspace:
 clean: all
 steps:
@@ -33,17 +33,15 @@ parameters:
 type: object
 default:
 - cmake
-- ninja-build
-- python3-venv
 - git
-- libmsgpack-dev
 - gfortran
-- libopenblas-dev
-- googletest
-- libgtest-dev
-- wget
-- python3-pip
 - libdrm-dev
+- libmsgpack-dev
+- libopenblas-dev
+- ninja-build
+- python3-pip
+- python3-venv
+- wget
 - name: pipModules
 type: object
 default:
@@ -52,18 +50,17 @@ parameters:
 - name: rocmDependencies
 type: object
 default:
-- rocm-cmake
-- llvm-project
-- ROCR-Runtime
-- clr
-- rocminfo
-- rocprofiler-register
-- rocm_smi_lib
-- rocm-core
 - aomp
-- aomp-extras
+- clr
 - hipBLAS-common
 - hipBLASLt
+- llvm-project
+- rocm-cmake
+- rocm-core
+- rocm_smi_lib
+- rocminfo
+- rocprofiler-register
+- ROCR-Runtime
 - roctracer
 - name: rocmTestDependencies
 type: object
@@ -86,32 +83,38 @@ parameters:
 - { os: ubuntu2204, packageManager: apt, target: gfx942 }
 - { os: ubuntu2204, packageManager: apt, target: gfx90a }
 - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
+- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
 - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
-# - { os: almalinux8, packageManager: dnf, target: gfx942 }
-# - { os: almalinux8, packageManager: dnf, target: gfx90a }
-# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
-# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
-# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
+- { os: almalinux8, packageManager: dnf, target: gfx942 }
+- { os: almalinux8, packageManager: dnf, target: gfx90a }
+- { os: almalinux8, packageManager: dnf, target: gfx1201 }
+- { os: almalinux8, packageManager: dnf, target: gfx1100 }
+- { os: almalinux8, packageManager: dnf, target: gfx1030 }
 testJobs:
 - { os: ubuntu2204, packageManager: apt, target: gfx942 }
 - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-# - name: downstreamComponentMatrix
-# type: object
-# default:
-# # rocSOLVER depends on both rocBLAS and rocPRIM
-# # for a unified build, rocBLAS will be the one to call rocSOLVER
-# - rocSOLVER:
-# name: rocSOLVER
-# sparseCheckoutDir: projects/rocsolver
-# skipUnifiedBuild: 'false'
-# buildDependsOn:
-# - rocBLAS_build
-# unifiedBuild:
-# downstreamAggregateNames: rocBLAS+rocPRIM
-# buildDependsOn:
-# - rocBLAS_build
-# - rocPRIM_build
+- name: downstreamComponentMatrix
+type: object
+default:
+- rocSPARSE:
+name: rocSPARSE
+sparseCheckoutDir: projects/rocsparse
+skipUnifiedBuild: 'false'
+buildDependsOn:
+- rocBLAS_build
+# rocSOLVER depends on both rocBLAS and rocPRIM
+# for a unified build, rocBLAS will be the one to call rocSOLVER
+# - rocSOLVER:
+# name: rocSOLVER
+# sparseCheckoutDir: projects/rocsolver
+# skipUnifiedBuild: 'false'
+# buildDependsOn:
+# - rocBLAS_build
+# unifiedBuild:
+# downstreamAggregateNames: rocBLAS+rocPRIM
+# buildDependsOn:
+# - rocBLAS_build
+# - rocPRIM_build
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -151,6 +154,12 @@ jobs:
 checkoutRepo: ${{ parameters.checkoutRepo }}
 sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
+parameters:
+os: ${{ job.os }}
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+parameters:
+dependencyList:
+- gtest
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
 parameters:
 checkoutRef: ${{ parameters.checkoutRef }}
@@ -164,21 +173,12 @@ jobs:
 parameters:
 os: ${{ job.os }}
 extraBuildFlags: >-
--DCMAKE_TOOLCHAIN_FILE=toolchain-linux.cmake
--DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
+-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
 -DCMAKE_BUILD_TYPE=Release
 -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang++
 -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
 -DGPU_TARGETS=${{ job.target }}
--DTensile_CODE_OBJECT_VERSION=default
--DTensile_LOGIC=asm_full
--DTensile_SEPARATE_ARCHITECTURES=ON
--DTensile_LAZY_LIBRARY_LOADING=ON
--DTensile_LIBRARY_FORMAT=msgpack
 -DBUILD_CLIENTS_TESTS=ON
--DBUILD_CLIENTS_BENCHMARKS=OFF
--DBUILD_CLIENTS_SAMPLES=OFF
--DROCM_PATH=$(Agent.BuildDirectory)/rocm
 -GNinja
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
 parameters:
@@ -208,6 +208,7 @@ jobs:
 - ${{ if eq(parameters.unifiedBuild, False) }}:
 - ${{ each job in parameters.jobMatrix.testJobs }}:
 - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+timeoutInMinutes: 120
 dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
 condition:
 and(succeeded(),
@@ -222,6 +223,7 @@ jobs:
 workspace:
 clean: all
 steps:
+- checkout: none
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
 parameters:
 aptPackages: ${{ parameters.aptPackages }}
@@ -258,18 +260,18 @@ jobs:
 environment: test
 gpuTarget: ${{ job.target }}
 
-# - ${{ if parameters.triggerDownstreamJobs }}:
-# - ${{ each component in parameters.downstreamComponentMatrix }}:
-# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
-# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
-# parameters:
-# checkoutRepo: ${{ parameters.checkoutRepo }}
-# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
-# triggerDownstreamJobs: true
-# unifiedBuild: ${{ parameters.unifiedBuild }}
-# ${{ if parameters.unifiedBuild }}:
-# buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
-# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
-# ${{ else }}:
-# buildDependsOn: ${{ component.buildDependsOn }}
-# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+- ${{ if parameters.triggerDownstreamJobs }}:
+- ${{ each component in parameters.downstreamComponentMatrix }}:
+- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+parameters:
+checkoutRepo: ${{ parameters.checkoutRepo }}
+sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+triggerDownstreamJobs: true
+unifiedBuild: ${{ parameters.unifiedBuild }}
+${{ if parameters.unifiedBuild }}:
+buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
+${{ else }}:
+buildDependsOn: ${{ component.buildDependsOn }}
+downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
@@ -166,6 +166,7 @@ jobs:
 workspace:
 clean: all
 steps:
+- checkout: none
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
 parameters:
 aptPackages: ${{ parameters.aptPackages }}
@@ -27,6 +27,7 @@ parameters:
 - numpy
 - tomli
 - scipy
+- pybind11
 - name: rocmDependencies
 type: object
 default:
@@ -210,7 +210,7 @@ jobs:
 parameters:
 componentName: ${{ parameters.componentName }}
 testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
-extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
+extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }} -E device_merge_inplace'
 os: ${{ job.os }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
 parameters:
@@ -36,6 +36,7 @@ parameters:
 - clr
 - llvm-project
 - rocDecode
+- rocJPEG
 - rocm-cmake
 - rocm-core
 - rocminfo
@@ -192,9 +193,9 @@ jobs:
 inputs:
 itemPattern: '**/*.whl'
 targetPath: $(Agent.BuildDirectory)
-- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
 parameters:
-checkoutRepo: ${{ parameters.checkoutRepo }}
+gpuTarget: ${{ job.target }}
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
 parameters:
@@ -221,25 +222,17 @@ jobs:
 - task: CMake@1
 displayName: 'rocPyDecode Test CMake Flags'
 inputs:
+workingDirectory: $(Agent.BuildDirectory)/rocm/share/rocpydecode/tests
 cmakeArgs: >-
 -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(PYTHON_USER_SITE)/pybind11;$(PYTHON_DIST_PACKAGES)/pybind11;$(PYBIND11_PATH)
 -DCMAKE_BUILD_TYPE=Release
 -DGPU_TARGETS=${{ job.target }}
-..
+.
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
 parameters:
 componentName: rocPyDecode
-testDir: $(Build.SourcesDirectory)/build
-# sudo required for pip install but screws up permissions for next pipeline run
-- task: Bash@3
-displayName: Clean up test environment
-condition: always()
-inputs:
-targetType: inline
-script: |
-pip uninstall -y rocPyDecode
-pip uninstall -y hip-python
+testDir: $(Agent.BuildDirectory)/rocm/share/rocpydecode/tests
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
 parameters:
 aptPackages: ${{ parameters.aptPackages }}
@@ -33,13 +33,11 @@ parameters:
 type: object
 default:
 - cmake
-- ninja-build
-- libsuitesparse-dev
 - gfortran
-- libfmt-dev
 - git
-- googletest
-- libgtest-dev
+- libfmt-dev
+- libsuitesparse-dev
+- ninja-build
 - python3-pip
 - name: rocmDependencies
 type: object
@@ -75,13 +73,13 @@ parameters:
 - { os: ubuntu2204, packageManager: apt, target: gfx942 }
 - { os: ubuntu2204, packageManager: apt, target: gfx90a }
 - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
+- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
 - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
-# - { os: almalinux8, packageManager: dnf, target: gfx942 }
-# - { os: almalinux8, packageManager: dnf, target: gfx90a }
-# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
-# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
-# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
+- { os: almalinux8, packageManager: dnf, target: gfx942 }
+- { os: almalinux8, packageManager: dnf, target: gfx90a }
+- { os: almalinux8, packageManager: dnf, target: gfx1201 }
+- { os: almalinux8, packageManager: dnf, target: gfx1100 }
+- { os: almalinux8, packageManager: dnf, target: gfx1030 }
 testJobs:
 - { os: ubuntu2204, packageManager: apt, target: gfx942 }
 - { os: ubuntu2204, packageManager: apt, target: gfx90a }
@@ -119,6 +117,10 @@ jobs:
 targetType: inline
 script: git clone --depth 1 --branch v3.9.1 https://github.com/Reference-LAPACK/lapack
 workingDirectory: '$(Build.SourcesDirectory)'
+- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+parameters:
+dependencyList:
+- gtest
 - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
 parameters:
 checkoutRef: ${{ parameters.checkoutRef }}
@@ -134,6 +136,7 @@ jobs:
|
|||||||
os: ${{ job.os }}
|
os: ${{ job.os }}
|
||||||
extraBuildFlags: >-
|
extraBuildFlags: >-
|
||||||
-DCMAKE_BUILD_TYPE=Release
|
-DCMAKE_BUILD_TYPE=Release
|
||||||
|
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||||
-DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls
|
-DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls
|
||||||
-DBUILD_TESTING=OFF
|
-DBUILD_TESTING=OFF
|
||||||
-DCBLAS=ON
|
-DCBLAS=ON
|
||||||
@@ -146,7 +149,7 @@ jobs:
|
|||||||
parameters:
|
parameters:
|
||||||
os: ${{ job.os }}
|
os: ${{ job.os }}
|
||||||
extraBuildFlags: >-
|
extraBuildFlags: >-
|
||||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
|
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install;$(Agent.BuildDirectory)/vendor
|
||||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
|
||||||
-DAMDGPU_TARGETS=${{ job.target }}
|
-DAMDGPU_TARGETS=${{ job.target }}
|
||||||
@@ -191,6 +194,7 @@ jobs:
|
|||||||
workspace:
|
workspace:
|
||||||
clean: all
|
clean: all
|
||||||
steps:
|
steps:
|
||||||
|
- checkout: none
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||||
parameters:
|
parameters:
|
||||||
aptPackages: ${{ parameters.aptPackages }}
|
aptPackages: ${{ parameters.aptPackages }}
|
||||||
|
|||||||
@@ -1,10 +1,29 @@
|
|||||||
parameters:
|
parameters:
|
||||||
|
- name: componentName
|
||||||
|
type: string
|
||||||
|
default: rocSPARSE
|
||||||
- name: checkoutRepo
|
- name: checkoutRepo
|
||||||
type: string
|
type: string
|
||||||
default: 'self'
|
default: 'self'
|
||||||
- name: checkoutRef
|
- name: checkoutRef
|
||||||
type: string
|
type: string
|
||||||
default: ''
|
default: ''
|
||||||
|
# monorepo related parameters
|
||||||
|
- name: sparseCheckoutDir
|
||||||
|
type: string
|
||||||
|
default: ''
|
||||||
|
- name: triggerDownstreamJobs
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
- name: downstreamAggregateNames
|
||||||
|
type: string
|
||||||
|
default: ''
|
||||||
|
- name: buildDependsOn
|
||||||
|
type: object
|
||||||
|
default: null
|
||||||
|
- name: unifiedBuild
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
# set to true if doing full build of ROCm stack
|
# set to true if doing full build of ROCm stack
|
||||||
# and dependencies are pulled from same pipeline
|
# and dependencies are pulled from same pipeline
|
||||||
- name: aggregatePipeline
|
- name: aggregatePipeline
|
||||||
@@ -13,27 +32,25 @@ parameters:
|
|||||||
- name: aptPackages
|
- name: aptPackages
|
||||||
type: object
|
type: object
|
||||||
default:
|
default:
|
||||||
- python3-pip
|
|
||||||
- cmake
|
- cmake
|
||||||
- ninja-build
|
|
||||||
- libboost-program-options-dev
|
|
||||||
- googletest
|
|
||||||
- libfftw3-dev
|
|
||||||
- git
|
|
||||||
- gfortran
|
- gfortran
|
||||||
- libgtest-dev
|
- git
|
||||||
|
- libboost-program-options-dev
|
||||||
- libdrm-dev
|
- libdrm-dev
|
||||||
|
- libfftw3-dev
|
||||||
|
- ninja-build
|
||||||
|
- python3-pip
|
||||||
- name: rocmDependencies
|
- name: rocmDependencies
|
||||||
type: object
|
type: object
|
||||||
default:
|
default:
|
||||||
- rocm-cmake
|
|
||||||
- llvm-project
|
|
||||||
- ROCR-Runtime
|
|
||||||
- clr
|
- clr
|
||||||
|
- llvm-project
|
||||||
- rocBLAS
|
- rocBLAS
|
||||||
|
- rocm-cmake
|
||||||
- rocminfo
|
- rocminfo
|
||||||
- rocPRIM
|
- rocPRIM
|
||||||
- rocprofiler-register
|
- rocprofiler-register
|
||||||
|
- ROCR-Runtime
|
||||||
- roctracer
|
- roctracer
|
||||||
- name: rocmTestDependencies
|
- name: rocmTestDependencies
|
||||||
type: object
|
type: object
|
||||||
@@ -52,19 +69,39 @@ parameters:
|
|||||||
type: object
|
type: object
|
||||||
default:
|
default:
|
||||||
buildJobs:
|
buildJobs:
|
||||||
- gfx942:
|
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||||
target: gfx942
|
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||||
- gfx90a:
|
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
|
||||||
target: gfx90a
|
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
|
||||||
|
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
|
||||||
testJobs:
|
testJobs:
|
||||||
- gfx942:
|
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||||
target: gfx942
|
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||||
- gfx90a:
|
- name: downstreamComponentMatrix
|
||||||
target: gfx90a
|
type: object
|
||||||
|
default:
|
||||||
|
- hipSPARSE:
|
||||||
|
name: hipSPARSE
|
||||||
|
sparseCheckoutDir: projects/hipsparse
|
||||||
|
skipUnifiedBuild: 'false'
|
||||||
|
buildDependsOn:
|
||||||
|
- rocSPARSE_build
|
||||||
|
# hipSOLVER depends on both rocSOLVER and rocSPARSE
|
||||||
|
# for a unified build, rocSOLVER will be the one to call hipSOLVER
|
||||||
|
# - hipSOLVER:
|
||||||
|
# name: hipSOLVER
|
||||||
|
# sparseCheckoutDir: projects/hipsolver
|
||||||
|
# skipUnifiedBuild: 'true'
|
||||||
|
# buildDependsOn:
|
||||||
|
# - rocSPARSE_build
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||||
- job: rocSPARSE_build_${{ job.target }}
|
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
|
||||||
|
${{ if parameters.buildDependsOn }}:
|
||||||
|
dependsOn:
|
||||||
|
- ${{ each build in parameters.buildDependsOn }}:
|
||||||
|
- ${{ build }}_${{ job.os }}_${{ job.target }}
|
||||||
variables:
|
variables:
|
||||||
- group: common
|
- group: common
|
||||||
- template: /.azuredevops/variables-global.yml
|
- template: /.azuredevops/variables-global.yml
|
||||||
@@ -77,22 +114,32 @@ jobs:
|
|||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||||
parameters:
|
parameters:
|
||||||
aptPackages: ${{ parameters.aptPackages }}
|
aptPackages: ${{ parameters.aptPackages }}
|
||||||
|
packageManager: ${{ job.packageManager }}
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||||
parameters:
|
parameters:
|
||||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||||
|
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||||
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||||
|
parameters:
|
||||||
|
dependencyList:
|
||||||
|
- gtest
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||||
parameters:
|
parameters:
|
||||||
checkoutRef: ${{ parameters.checkoutRef }}
|
checkoutRef: ${{ parameters.checkoutRef }}
|
||||||
dependencyList: ${{ parameters.rocmDependencies }}
|
dependencyList: ${{ parameters.rocmDependencies }}
|
||||||
gpuTarget: ${{ job.target }}
|
gpuTarget: ${{ job.target }}
|
||||||
|
os: ${{ job.os }}
|
||||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||||
|
${{ if parameters.triggerDownstreamJobs }}:
|
||||||
|
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||||
parameters:
|
parameters:
|
||||||
|
os: ${{ job.os }}
|
||||||
extraBuildFlags: >-
|
extraBuildFlags: >-
|
||||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
|
||||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang++
|
||||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
|
||||||
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
|
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
|
||||||
-DCMAKE_BUILD_TYPE=Release
|
-DCMAKE_BUILD_TYPE=Release
|
||||||
-DAMDGPU_TARGETS=${{ job.target }}
|
-DAMDGPU_TARGETS=${{ job.target }}
|
||||||
@@ -103,68 +150,94 @@ jobs:
|
|||||||
-GNinja
|
-GNinja
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||||
parameters:
|
parameters:
|
||||||
artifactName: rocSPARSE
|
componentName: ${{ parameters.componentName }}
|
||||||
|
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||||
gpuTarget: ${{ job.target }}
|
gpuTarget: ${{ job.target }}
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||||
parameters:
|
parameters:
|
||||||
artifactName: rocSPARSE
|
componentName: ${{ parameters.componentName }}
|
||||||
gpuTarget: ${{ job.target }}
|
gpuTarget: ${{ job.target }}
|
||||||
|
os: ${{ job.os }}
|
||||||
publish: false
|
publish: false
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
|
||||||
parameters:
|
parameters:
|
||||||
sourceDir: $(Build.SourcesDirectory)/build/clients
|
sourceDir: $(Agent.BuildDirectory)/s/build/clients
|
||||||
contentsString: matrices/**
|
contentsString: matrices/**
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||||
parameters:
|
parameters:
|
||||||
|
componentName: ${{ parameters.componentName }}
|
||||||
artifactName: testMatrices
|
artifactName: testMatrices
|
||||||
gpuTarget: ${{ job.target }}
|
gpuTarget: ${{ job.target }}
|
||||||
|
os: ${{ job.os }}
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
- ${{ if eq(job.os, 'ubuntu2204') }}:
|
||||||
parameters:
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||||
aptPackages: ${{ parameters.aptPackages }}
|
parameters:
|
||||||
gpuTarget: ${{ job.target }}
|
aptPackages: ${{ parameters.aptPackages }}
|
||||||
extraEnvVars:
|
gpuTarget: ${{ job.target }}
|
||||||
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
extraEnvVars:
|
||||||
|
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
||||||
|
|
||||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||||
- job: rocSPARSE_test_${{ job.target }}
|
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||||
timeoutInMinutes: 90
|
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
|
||||||
dependsOn: rocSPARSE_build_${{ job.target }}
|
timeoutInMinutes: 120
|
||||||
condition:
|
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
|
||||||
and(succeeded(),
|
condition:
|
||||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
and(succeeded(),
|
||||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||||
eq(${{ parameters.aggregatePipeline }}, False)
|
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||||
)
|
eq(${{ parameters.aggregatePipeline }}, False)
|
||||||
variables:
|
)
|
||||||
- group: common
|
variables:
|
||||||
- template: /.azuredevops/variables-global.yml
|
- group: common
|
||||||
pool: ${{ job.target }}_test_pool
|
- template: /.azuredevops/variables-global.yml
|
||||||
workspace:
|
pool: ${{ job.target }}_test_pool
|
||||||
clean: all
|
workspace:
|
||||||
steps:
|
clean: all
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
steps:
|
||||||
parameters:
|
- checkout: none
|
||||||
aptPackages: ${{ parameters.aptPackages }}
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
parameters:
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
aptPackages: ${{ parameters.aptPackages }}
|
||||||
parameters:
|
packageManager: ${{ job.packageManager }}
|
||||||
gpuTarget: ${{ job.target }}
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
parameters:
|
||||||
parameters:
|
preTargetFilter: ${{ parameters.componentName }}
|
||||||
checkoutRef: ${{ parameters.checkoutRef }}
|
gpuTarget: ${{ job.target }}
|
||||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
os: ${{ job.os }}
|
||||||
gpuTarget: ${{ job.target }}
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
parameters:
|
||||||
parameters:
|
checkoutRef: ${{ parameters.checkoutRef }}
|
||||||
componentName: rocSPARSE
|
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
gpuTarget: ${{ job.target }}
|
||||||
testExecutable: './rocsparse-test'
|
os: ${{ job.os }}
|
||||||
testParameters: '--gtest_filter="*quick*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
|
${{ if parameters.triggerDownstreamJobs }}:
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||||
parameters:
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||||
aptPackages: ${{ parameters.aptPackages }}
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||||
environment: test
|
parameters:
|
||||||
gpuTarget: ${{ job.target }}
|
componentName: ${{ parameters.componentName }}
|
||||||
|
os: ${{ job.os }}
|
||||||
|
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||||
|
testExecutable: './rocsparse-test'
|
||||||
|
testParameters: '--gtest_filter="*quick*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||||
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||||
|
parameters:
|
||||||
|
aptPackages: ${{ parameters.aptPackages }}
|
||||||
|
environment: test
|
||||||
|
gpuTarget: ${{ job.target }}
|
||||||
|
|
||||||
|
- ${{ if parameters.triggerDownstreamJobs }}:
|
||||||
|
- ${{ each component in parameters.downstreamComponentMatrix }}:
|
||||||
|
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
|
||||||
|
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
|
||||||
|
parameters:
|
||||||
|
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||||
|
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
|
||||||
|
buildDependsOn: ${{ component.buildDependsOn }}
|
||||||
|
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
|
||||||
|
triggerDownstreamJobs: true
|
||||||
|
unifiedBuild: ${{ parameters.unifiedBuild }}
|
||||||
|
|||||||
@@ -184,7 +184,7 @@ jobs:
|
|||||||
parameters:
|
parameters:
|
||||||
componentName: rocm-examples
|
componentName: rocm-examples
|
||||||
testDir: $(Build.SourcesDirectory)/build
|
testDir: $(Build.SourcesDirectory)/build
|
||||||
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "rocfft_callback"'
|
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||||
parameters:
|
parameters:
|
||||||
aptPackages: ${{ parameters.aptPackages }}
|
aptPackages: ${{ parameters.aptPackages }}
|
||||||
|
|||||||
@@ -67,7 +67,6 @@ jobs:
|
|||||||
checkoutRef: ${{ parameters.checkoutRef }}
|
checkoutRef: ${{ parameters.checkoutRef }}
|
||||||
dependencyList: ${{ parameters.rocmDependencies }}
|
dependencyList: ${{ parameters.rocmDependencies }}
|
||||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||||
skipLlvmSymlink: true
|
|
||||||
os: ${{ job.os }}
|
os: ${{ job.os }}
|
||||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||||
parameters:
|
parameters:
|
||||||
|
|||||||
@@ -407,7 +407,6 @@ jobs:
|
|||||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||||
gpuTarget: $(JOB_GPU_TARGET)
|
gpuTarget: $(JOB_GPU_TARGET)
|
||||||
dependencySource: staging
|
dependencySource: staging
|
||||||
skipLlvmSymlink: true
|
|
||||||
# get sources to run test scripts
|
# get sources to run test scripts
|
||||||
- task: Bash@3
|
- task: Bash@3
|
||||||
displayName: git clone upstream pytorch
|
displayName: git clone upstream pytorch
|
||||||
|
|||||||
@@ -119,7 +119,6 @@ jobs:
|
|||||||
dependencyList: ${{ parameters.rocmDependencies }}
|
dependencyList: ${{ parameters.rocmDependencies }}
|
||||||
os: ${{ job.os }}
|
os: ${{ job.os }}
|
||||||
gpuTarget: ${{ job.target }}
|
gpuTarget: ${{ job.target }}
|
||||||
skipLibraryLinking: true
|
|
||||||
- script: df -h
|
- script: df -h
|
||||||
displayName: System disk space after ROCm
|
displayName: System disk space after ROCm
|
||||||
- script: du -sh $(Agent.BuildDirectory)/rocm
|
- script: du -sh $(Agent.BuildDirectory)/rocm
|
||||||
|
|||||||
@@ -12,6 +12,9 @@ parameters:
|
|||||||
- name: fileFilter
|
- name: fileFilter
|
||||||
type: string
|
type: string
|
||||||
default: ''
|
default: ''
|
||||||
|
- name: extractAndDeleteFiles
|
||||||
|
type: boolean
|
||||||
|
default: true
|
||||||
# set to true if doing full build of ROCm stack
|
# set to true if doing full build of ROCm stack
|
||||||
# and dependencies are pulled from same pipeline
|
# and dependencies are pulled from same pipeline
|
||||||
- name: aggregatePipeline
|
- name: aggregatePipeline
|
||||||
@@ -22,34 +25,32 @@ steps:
|
|||||||
- task: DownloadPipelineArtifact@2
|
- task: DownloadPipelineArtifact@2
|
||||||
displayName: Download ${{ parameters.componentName }}
|
displayName: Download ${{ parameters.componentName }}
|
||||||
inputs:
|
inputs:
|
||||||
|
itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
|
||||||
|
targetPath: '$(Pipeline.Workspace)/d'
|
||||||
|
allowPartiallySucceededBuilds: true
|
||||||
${{ if parameters.aggregatePipeline }}:
|
${{ if parameters.aggregatePipeline }}:
|
||||||
buildType: 'current'
|
buildType: 'current'
|
||||||
itemPattern: '**/${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
|
|
||||||
allowPartiallySucceededBuilds: true
|
|
||||||
targetPath: '$(Pipeline.Workspace)/d'
|
|
||||||
${{ else }}:
|
${{ else }}:
|
||||||
buildType: 'specific'
|
buildType: 'specific'
|
||||||
project: ROCm-CI
|
project: ROCm-CI
|
||||||
specificBuildWithTriggering: true
|
specificBuildWithTriggering: true
|
||||||
allowPartiallySucceededBuilds: true
|
|
||||||
definition: ${{ parameters.pipelineId }}
|
definition: ${{ parameters.pipelineId }}
|
||||||
itemPattern: '**/*${{ parameters.fileFilter }}*'
|
|
||||||
targetPath: '$(Pipeline.Workspace)/d'
|
|
||||||
branchName: refs/heads/${{ parameters.branchName }}
|
branchName: refs/heads/${{ parameters.branchName }}
|
||||||
${{ if eq(parameters.componentName, 'aomp') }}:
|
${{ if eq(parameters.componentName, 'aomp') }}:
|
||||||
buildVersionToDownload: latest # aomp trigger lives in ROCm/ROCm, so cannot use ROCm/aomp branch names
|
buildVersionToDownload: latest # aomp trigger lives in ROCm/ROCm, so cannot use ROCm/aomp branch names
|
||||||
${{ else }}:
|
${{ else }}:
|
||||||
buildVersionToDownload: latestFromBranch
|
buildVersionToDownload: latestFromBranch
|
||||||
- task: ExtractFiles@1
|
- ${{ if eq(parameters.extractAndDeleteFiles, true) }}:
|
||||||
displayName: Extract ${{ parameters.componentName }}
|
- task: ExtractFiles@1
|
||||||
inputs:
|
displayName: Extract ${{ parameters.componentName }}
|
||||||
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
|
inputs:
|
||||||
destinationFolder: '$(Agent.BuildDirectory)/rocm'
|
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
|
||||||
cleanDestinationFolder: false
|
destinationFolder: '$(Agent.BuildDirectory)/rocm'
|
||||||
overwriteExistingFiles: true
|
cleanDestinationFolder: false
|
||||||
- task: DeleteFiles@1
|
overwriteExistingFiles: true
|
||||||
displayName: Cleanup Compressed ${{ parameters.componentName }}
|
- task: DeleteFiles@1
|
||||||
inputs:
|
displayName: Clean up Compressed ${{ parameters.componentName }}
|
||||||
SourceFolder: '$(Pipeline.Workspace)/d'
|
inputs:
|
||||||
Contents: '**/*.tar.gz'
|
SourceFolder: '$(Pipeline.Workspace)/d'
|
||||||
RemoveDotFiles: true
|
Contents: '**/*.tar.gz'
|
||||||
|
RemoveDotFiles: true
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ steps:
|
|||||||
URL_BEGIN="https://artprodcus3.artifacts.visualstudio.com/"
|
URL_BEGIN="https://artprodcus3.artifacts.visualstudio.com/"
|
||||||
URL_MIDDLE="/_apis/artifact/"
|
URL_MIDDLE="/_apis/artifact/"
|
||||||
URL_END="/content?format=file&subPath=%2F"
|
URL_END="/content?format=file&subPath=%2F"
|
||||||
FORMATTED_JOB_NAME=$(echo $(Agent.JobName) | sed 's/ /./g; s/[-_]//g')
|
ARTIFACT_NAME="$(Agent.JobName)_$(System.JobAttempt)"
|
||||||
ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/$(DOWNLOAD_PROJECT_ID)/buildId/$(Build.BuildId)/artifactName/${FORMATTED_JOB_NAME}"
|
ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/$(DOWNLOAD_PROJECT_ID)/buildId/$(Build.BuildId)/artifactName/${ARTIFACT_NAME}"
|
||||||
ENCODED_STRING=$(echo -n "${ARTIFACT_STRING}" | base64 -w 0)
|
ENCODED_STRING=$(echo -n "${ARTIFACT_STRING}" | base64 -w 0)
|
||||||
PADDING_COUNT=$(echo -n "${ENCODED_STRING}" | awk -F= '{print NF-1}')
|
PADDING_COUNT=$(echo -n "${ENCODED_STRING}" | awk -F= '{print NF-1}')
|
||||||
if [ "$PADDING_COUNT" -gt 0 ]; then
|
if [ "$PADDING_COUNT" -gt 0 ]; then
|
||||||
|
|||||||
@@ -46,5 +46,6 @@ steps:
|
|||||||
displayName: '${{ parameters.artifactName }} Publish'
|
displayName: '${{ parameters.artifactName }} Publish'
|
||||||
retryCountOnTaskFailure: 3
|
retryCountOnTaskFailure: 3
|
||||||
inputs:
|
inputs:
|
||||||
artifactName: ${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt)
|
# if this artifact name is changed, please also update $ARTIFACT_URL inside miopen-get-ck-build.yml
|
||||||
|
artifactName: $(Agent.JobName)_$(System.JobAttempt)
|
||||||
targetPath: '$(Build.ArtifactStagingDirectory)'
|
targetPath: '$(Build.ArtifactStagingDirectory)'
|
||||||
|
|||||||
@@ -1,10 +1,15 @@
|
|||||||
parameters:
|
parameters:
|
||||||
|
- name: os
|
||||||
|
type: string
|
||||||
|
default: ubuntu2204
|
||||||
- name: repositoryUrl
|
- name: repositoryUrl
|
||||||
type: string
|
type: string
|
||||||
default: https://download.amd.com/developer/eula/aocl/aocl-4-2
|
default: https://download.amd.com/developer/eula/aocl/aocl-4-2
|
||||||
- name: packageName
|
- name: packageName
|
||||||
type: string
|
type: object
|
||||||
default: aocl-linux-gcc-4.2.0_1_amd64.deb
|
default:
|
||||||
|
ubuntu2204: aocl-linux-gcc-4.2.0_1_amd64.deb
|
||||||
|
almalinux8: aocl-linux-gcc-4.2.0-1.x86_64.rpm
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- task: Bash@3
|
- task: Bash@3
|
||||||
@@ -12,16 +17,19 @@ steps:
|
|||||||
inputs:
|
inputs:
|
||||||
targetType: inline
|
targetType: inline
|
||||||
workingDirectory: $(Pipeline.Workspace)
|
workingDirectory: $(Pipeline.Workspace)
|
||||||
script: wget -nv ${{ parameters.repositoryUrl }}/${{ parameters.packageName }}
|
script: wget -nv ${{ parameters.repositoryUrl }}/${{ parameters.packageName[parameters.os] }}
|
||||||
- task: Bash@3
|
- task: Bash@3
|
||||||
displayName: Install AOCL
|
displayName: Install AOCL
|
||||||
inputs:
|
inputs:
|
||||||
targetType: inline
|
targetType: inline
|
||||||
workingDirectory: $(Pipeline.Workspace)
|
workingDirectory: $(Pipeline.Workspace)
|
||||||
script: sudo apt install -y ./${{ parameters.packageName }}
|
${{ if eq(parameters.os, 'ubuntu2204') }}:
|
||||||
|
script: sudo apt install -y ./${{ parameters.packageName[parameters.os] }}
|
||||||
|
${{ elseif eq(parameters.os, 'almalinux8') }}:
|
||||||
|
script: sudo dnf install -y ./${{ parameters.packageName[parameters.os] }}
|
||||||
- task: Bash@3
|
- task: Bash@3
|
||||||
displayName: Clean up AOCL
|
displayName: Clean up AOCL
|
||||||
inputs:
|
inputs:
|
||||||
targetType: inline
|
targetType: inline
|
||||||
workingDirectory: $(Pipeline.Workspace)
|
workingDirectory: $(Pipeline.Workspace)
|
||||||
script: rm -f ${{ parameters.packageName }}
|
script: rm -f ${{ parameters.packageName[parameters.os] }}
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ parameters:
|
|||||||
libexpat-dev: expat-devel
|
libexpat-dev: expat-devel
|
||||||
libffi-dev: libffi-devel
|
libffi-dev: libffi-devel
|
||||||
libfftw3-dev: fftw-devel
|
libfftw3-dev: fftw-devel
|
||||||
|
libfmt-dev: fmt-devel
|
||||||
libgmp-dev: gmp-devel
|
libgmp-dev: gmp-devel
|
||||||
liblzma-dev: xz-devel
|
liblzma-dev: xz-devel
|
||||||
libmpfr-dev: mpfr-devel
|
libmpfr-dev: mpfr-devel
|
||||||
|
|||||||
@@ -19,16 +19,6 @@ parameters:
|
|||||||
- name: gpuTarget
|
- name: gpuTarget
|
||||||
type: string
|
type: string
|
||||||
default: ''
|
default: ''
|
||||||
# set to true if you're calling this template file multiple files in same pipeline
|
|
||||||
# only leave last call false to optimize sequence
|
|
||||||
- name: skipLibraryLinking
|
|
||||||
type: boolean
|
|
||||||
default: false
|
|
||||||
# set to true if llvm-project is not downloaded in a particular call
|
|
||||||
# or if you just don't want the symlink
|
|
||||||
- name: skipLlvmSymlink
|
|
||||||
type: boolean
|
|
||||||
default: false
|
|
||||||
# set to true if dlopen calls for HIP libraries are causing failures
|
# set to true if dlopen calls for HIP libraries are causing failures
|
||||||
# because they do not follow shared library symlink convention
|
# because they do not follow shared library symlink convention
|
||||||
- name: setupHIPLibrarySymlinks
|
- name: setupHIPLibrarySymlinks
|
||||||
@@ -367,6 +357,7 @@ steps:
|
|||||||
componentName: ${{ split(dependency, ':')[0] }}
|
componentName: ${{ split(dependency, ':')[0] }}
|
||||||
pipelineId: ${{ parameters.componentVarList[split(dependency, ':')[0]].pipelineId }}
|
pipelineId: ${{ parameters.componentVarList[split(dependency, ':')[0]].pipelineId }}
|
||||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||||
|
extractAndDeleteFiles: false
|
||||||
${{ if parameters.componentVarList[split(dependency, ':')[0]].hasGpuTarget }}:
|
${{ if parameters.componentVarList[split(dependency, ':')[0]].hasGpuTarget }}:
|
||||||
fileFilter: "${{ split(dependency, ':')[1] }}*_${{ parameters.os }}_${{ parameters.gpuTarget }}"
|
fileFilter: "${{ split(dependency, ':')[1] }}*_${{ parameters.os }}_${{ parameters.gpuTarget }}"
|
||||||
# dependencySource = staging
|
# dependencySource = staging
|
||||||
@@ -405,6 +396,7 @@ steps:
|
|||||||
componentName: ${{ dependency }}
|
componentName: ${{ dependency }}
|
||||||
pipelineId: ${{ parameters.componentVarList[dependency].pipelineId }}
|
pipelineId: ${{ parameters.componentVarList[dependency].pipelineId }}
|
||||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||||
|
extractAndDeleteFiles: false
|
||||||
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
|
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
|
||||||
fileFilter: ${{ parameters.os }}_${{ parameters.gpuTarget }}
|
fileFilter: ${{ parameters.os }}_${{ parameters.gpuTarget }}
|
||||||
${{ else }}:
|
${{ else }}:
|
||||||
@@ -430,8 +422,20 @@ steps:
|
|||||||
# default = staging
|
# default = staging
|
||||||
${{ else }}:
|
${{ else }}:
|
||||||
branchName: ${{ parameters.componentVarList[dependency].stagingBranch }}
|
branchName: ${{ parameters.componentVarList[dependency].stagingBranch }}
|
||||||
# Set link to redirect llvm folder
|
- task: ExtractFiles@1
|
||||||
- ${{ if eq(parameters.skipLlvmSymlink, false) }}:
|
displayName: Extract ROCm artifacts
|
||||||
|
inputs:
|
||||||
|
archiveFilePatterns: $(Pipeline.Workspace)/d/**/*.tar.gz
|
||||||
|
destinationFolder: $(Agent.BuildDirectory)/rocm
|
||||||
|
cleanDestinationFolder: false
|
||||||
|
overwriteExistingFiles: true
|
||||||
|
- task: DeleteFiles@1
|
||||||
|
displayName: Clean up ROCm artifacts
|
||||||
|
inputs:
|
||||||
|
SourceFolder: $(Pipeline.Workspace)/d
|
||||||
|
Contents: '**/*.tar.gz'
|
||||||
|
RemoveDotFiles: true
|
||||||
|
- ${{ if containsValue(parameters.dependencyList, 'llvm-project') }}:
|
||||||
- task: Bash@3
|
- task: Bash@3
|
||||||
displayName: Symlink from rocm/llvm to rocm/lib/llvm
|
displayName: Symlink from rocm/llvm to rocm/lib/llvm
|
||||||
inputs:
|
inputs:
|
||||||
@@ -439,6 +443,7 @@ steps:
|
|||||||
script: |
|
script: |
|
||||||
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
|
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
|
||||||
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
|
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
|
||||||
|
echo "Created symlink from rocm/llvm to rocm/lib/llvm"
|
||||||
- task: Bash@3
|
- task: Bash@3
|
||||||
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
|
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
|
||||||
inputs:
|
inputs:
|
||||||
@@ -446,7 +451,14 @@ steps:
|
|||||||
script: |
|
script: |
|
||||||
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
|
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
|
||||||
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
|
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
|
||||||
|
echo "Created symlink from rocm/llvm/bin/$file to rocm/bin/$file"
|
||||||
done
|
done
|
||||||
|
- ${{ if containsValue(parameters.dependencyList, 'rocm-core') }}:
|
||||||
|
- task: Bash@3
|
||||||
|
displayName: Print rocm/.info/version
|
||||||
|
inputs:
|
||||||
|
targetType: inline
|
||||||
|
script: cat $(Agent.BuildDirectory)/rocm/.info/version
|
||||||
# dlopen calls within a ctest or pytest sequence run into issues when shared library symlink convention is not followed
|
# dlopen calls within a ctest or pytest sequence run into issues when shared library symlink convention is not followed
|
||||||
# the convention is as follows:
|
# the convention is as follows:
|
||||||
# unversioned .so is a symlink to major version .so
|
# unversioned .so is a symlink to major version .so
|
||||||
@@ -483,17 +495,16 @@ steps:
|
|||||||
inputs:
|
inputs:
|
||||||
targetType: inline
|
targetType: inline
|
||||||
script: ls -la1R $(Agent.BuildDirectory)/rocm
|
script: ls -la1R $(Agent.BuildDirectory)/rocm
|
||||||
- ${{ if eq(parameters.skipLibraryLinking, false) }}:
|
- task: Bash@3
|
||||||
- task: Bash@3
|
displayName: 'Link ROCm shared libraries'
|
||||||
displayName: 'Link ROCm shared libraries'
|
inputs:
|
||||||
inputs:
|
targetType: inline
|
||||||
targetType: inline
|
# OS ignores if the ROCm lib folder shows up more than once
|
||||||
# OS ignores if the ROCm lib folder shows up more than once
|
script: |
|
||||||
script: |
|
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
|
||||||
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
|
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
|
||||||
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
|
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
|
||||||
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
|
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
|
||||||
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
|
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
|
||||||
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
|
sudo ldconfig -v
|
||||||
sudo ldconfig -v
|
ldconfig -p
|
||||||
ldconfig -p
|
|
||||||
|
|||||||
@@ -23,13 +23,14 @@ steps:
|
|||||||
inputs:
|
inputs:
|
||||||
targetType: inline
|
targetType: inline
|
||||||
script: |
|
script: |
|
||||||
sudo apt-get install -y jq
|
${{ iif(or(eq(parameters.os, 'ubuntu2204'), eq(parameters.os, 'ubuntu2404')), 'sudo apt-get install -y jq', '') }}
|
||||||
|
|
||||||
# RESOURCES_REPOSITORIES is a runtime variable (not an env var!) that contains quotations and newlines
|
# RESOURCES_REPOSITORIES is a runtime variable (not an env var!) that contains quotations and newlines
|
||||||
# So we need to save it to a file to properly preserve its formatting and contents
|
# So we need to save it to a file to properly preserve its formatting and contents
|
||||||
cat <<EOF > resources.repositories
|
cat <<EOF > resources.repositories
|
||||||
$(RESOURCES_REPOSITORIES)
|
$(RESOURCES_REPOSITORIES)
|
||||||
EOF
|
EOF
|
||||||
|
echo "Value of resources.repositories:"
|
||||||
cat resources.repositories
|
cat resources.repositories
|
||||||
|
|
||||||
IS_TAG_BUILD=$(jq 'has("release_repo")' resources.repositories)
|
IS_TAG_BUILD=$(jq 'has("release_repo")' resources.repositories)
|
||||||
@@ -66,8 +67,6 @@ steps:
|
|||||||
)
|
)
|
||||||
' resources.repositories)
|
' resources.repositories)
|
||||||
|
|
||||||
manifest_json=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
|
|
||||||
|
|
||||||
dependencies=()
|
dependencies=()
|
||||||
for manifest_file in $(Pipeline.Workspace)/d/**/manifest_*.json; do
|
for manifest_file in $(Pipeline.Workspace)/d/**/manifest_*.json; do
|
||||||
echo "Processing $manifest_file"
|
echo "Processing $manifest_file"
|
||||||
@@ -78,6 +77,10 @@ steps:
|
|||||||
done
|
done
|
||||||
dependencies_json=$(printf '%s\n' "${dependencies[@]}" | jq -s '.')
|
dependencies_json=$(printf '%s\n' "${dependencies[@]}" | jq -s '.')
|
||||||
|
|
||||||
|
manifest_filename="manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}"
|
||||||
|
echo "##vso[task.setvariable variable=manifest_filename]$manifest_filename"
|
||||||
|
manifest_json=$(Build.ArtifactStagingDirectory)/$manifest_filename.json
|
||||||
|
|
||||||
jq -n \
|
jq -n \
|
||||||
--argjson current "$current" \
|
--argjson current "$current" \
|
||||||
--argjson dependencies "$dependencies_json" \
|
--argjson dependencies "$dependencies_json" \
|
||||||
@@ -111,8 +114,14 @@ steps:
|
|||||||
')
|
')
|
||||||
dependencies_rows=$(echo $dependencies_rows)
|
dependencies_rows=$(echo $dependencies_rows)
|
||||||
echo "##vso[task.setvariable variable=dependencies_rows;]$dependencies_rows"
|
echo "##vso[task.setvariable variable=dependencies_rows;]$dependencies_rows"
|
||||||
|
- task: Bash@3
|
||||||
cat $manifest_json
|
displayName: Print manifest.json
|
||||||
|
condition: always()
|
||||||
|
continueOnError: true
|
||||||
|
inputs:
|
||||||
|
targetType: inline
|
||||||
|
script: |
|
||||||
|
cat $(Build.ArtifactStagingDirectory)/$(manifest_filename).json
|
||||||
- task: Bash@3
|
- task: Bash@3
|
||||||
displayName: Create manifest.html
|
displayName: Create manifest.html
|
||||||
condition: always()
|
condition: always()
|
||||||
@@ -120,10 +129,10 @@ steps:
|
|||||||
inputs:
|
inputs:
|
||||||
targetType: inline
|
targetType: inline
|
||||||
script: |
|
script: |
|
||||||
manifest_html=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
|
manifest_html="$(Build.ArtifactStagingDirectory)/$(manifest_filename).html"
|
||||||
cat <<EOF > $manifest_html
|
cat <<EOF > $manifest_html
|
||||||
<html>
|
<html>
|
||||||
<h1>Manifest</h1>
|
<h1>$(manifest_filename)</h1>
|
||||||
<h2>Current</h2>
|
<h2>Current</h2>
|
||||||
<table border="1">
|
<table border="1">
|
||||||
<tr>
|
<tr>
|
||||||
@@ -163,7 +172,7 @@ steps:
|
|||||||
continueOnError: true
|
continueOnError: true
|
||||||
inputs:
|
inputs:
|
||||||
tabName: Manifest
|
tabName: Manifest
|
||||||
reportDir: $(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
|
reportDir: $(Build.ArtifactStagingDirectory)/$(manifest_filename).html
|
||||||
- task: Bash@3
|
- task: Bash@3
|
||||||
displayName: Save manifest artifact file name
|
displayName: Save manifest artifact file name
|
||||||
condition: always()
|
condition: always()
|
||||||
@@ -172,5 +181,5 @@ steps:
|
|||||||
workingDirectory: $(Pipeline.Workspace)
|
workingDirectory: $(Pipeline.Workspace)
|
||||||
targetType: inline
|
targetType: inline
|
||||||
script: |
|
script: |
|
||||||
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
|
echo "$(manifest_filename).html" >> pipelineArtifacts.txt
|
||||||
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt
|
echo "$(manifest_filename).json" >> pipelineArtifacts.txt
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ steps:
|
|||||||
script: |
|
script: |
|
||||||
AZ_API="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis"
|
AZ_API="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis"
|
||||||
GH_API="https://api.github.com/repos/ROCm"
|
GH_API="https://api.github.com/repos/ROCm"
|
||||||
ARTIFACT_NAME="composablekernelbuild${{ parameters.gpuTarget }}"
|
|
||||||
EXIT_CODE=0
|
EXIT_CODE=0
|
||||||
|
|
||||||
# Try to find an Azure build for the specific CK commit called out in MIOpen's requirements.txt
|
# Try to find an Azure build for the specific CK commit called out in MIOpen's requirements.txt
|
||||||
@@ -39,8 +38,15 @@ steps:
|
|||||||
echo "Found specific CK build ID: $CK_BUILD_ID"
|
echo "Found specific CK build ID: $CK_BUILD_ID"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?artifactName=$ARTIFACT_NAME&api-version=7.1"
|
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?api-version=7.1"
|
||||||
ARTIFACT_URL=$(curl -s $AZURE_URL | jq '.resource.downloadUrl' | tr -d '"')
|
ARTIFACT_URL=$(curl -s $AZURE_URL | \
|
||||||
|
jq --arg gfx "${{ parameters.gpuTarget }}" '
|
||||||
|
.value
|
||||||
|
| map(select(.name | test($gfx)))
|
||||||
|
| max_by(.name | capture("_(?<dropNumber>\\d+)").dropNumber | tonumber)
|
||||||
|
| .resource.downloadUrl
|
||||||
|
' | \
|
||||||
|
tr -d '"')
|
||||||
|
|
||||||
# If using the specific CK commit and it doesn't have any valid artifacts, use latest successful CK build instead
|
# If using the specific CK commit and it doesn't have any valid artifacts, use latest successful CK build instead
|
||||||
if { [[ -z "$ARTIFACT_URL" ]] || [[ "$ARTIFACT_URL" == "null" ]]; } && [[ $EXIT_CODE -eq 0 ]]; then
|
if { [[ -z "$ARTIFACT_URL" ]] || [[ "$ARTIFACT_URL" == "null" ]]; } && [[ $EXIT_CODE -eq 0 ]]; then
|
||||||
@@ -48,8 +54,15 @@ steps:
|
|||||||
LATEST_BUILD_URL="$AZ_API/build/builds?definitions=$(COMPOSABLE_KERNEL_PIPELINE_ID)&statusFilter=completed&resultFilter=succeeded&\$top=1&api-version=7.1"
|
LATEST_BUILD_URL="$AZ_API/build/builds?definitions=$(COMPOSABLE_KERNEL_PIPELINE_ID)&statusFilter=completed&resultFilter=succeeded&\$top=1&api-version=7.1"
|
||||||
CK_BUILD_ID=$(curl -s $LATEST_BUILD_URL | jq '.value[0].id')
|
CK_BUILD_ID=$(curl -s $LATEST_BUILD_URL | jq '.value[0].id')
|
||||||
echo "Found latest CK build ID: $CK_BUILD_ID"
|
echo "Found latest CK build ID: $CK_BUILD_ID"
|
||||||
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?artifactName=$ARTIFACT_NAME&api-version=7.1"
|
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?api-version=7.1"
|
||||||
ARTIFACT_URL=$(curl -s $AZURE_URL | jq '.resource.downloadUrl' | tr -d '"')
|
ARTIFACT_URL=$(curl -s $AZURE_URL | \
|
||||||
|
jq --arg os "ubuntu2204" --arg gfx "${{ parameters.gpuTarget }}" '
|
||||||
|
.value
|
||||||
|
| map(select(.name | test($os) and test($gfx)))
|
||||||
|
| max_by(.name | capture("_(?<dropNumber>\\d+)").dropNumber | tonumber)
|
||||||
|
| .resource.downloadUrl
|
||||||
|
' | \
|
||||||
|
tr -d '"')
|
||||||
EXIT_CODE=2
|
EXIT_CODE=2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -57,8 +70,8 @@ steps:
|
|||||||
wget --tries=5 --waitretry=10 --retry-connrefused -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip
|
wget --tries=5 --waitretry=10 --retry-connrefused -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip
|
||||||
unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory)
|
unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory)
|
||||||
mkdir -p $(Agent.BuildDirectory)/rocm
|
mkdir -p $(Agent.BuildDirectory)/rocm
|
||||||
tar -zxvf $(System.ArtifactsDirectory)/$ARTIFACT_NAME/*.tar.gz -C $(Agent.BuildDirectory)/rocm
|
tar -zxvf $(System.ArtifactsDirectory)/composable_kernel*/*.tar.gz -C $(Agent.BuildDirectory)/rocm
|
||||||
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/$ARTIFACT_NAME
|
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/composable_kernel*
|
||||||
|
|
||||||
if [[ $EXIT_CODE -ne 0 ]]; then
|
if [[ $EXIT_CODE -ne 0 ]]; then
|
||||||
BUILD_COMMIT=$(curl -s $AZ_API/build/builds/$CK_BUILD_ID | jq '.sourceVersion' | tr -d '"')
|
BUILD_COMMIT=$(curl -s $AZ_API/build/builds/$CK_BUILD_ID | jq '.sourceVersion' | tr -d '"')
|
||||||
|
|||||||
@@ -66,11 +66,11 @@ variables:
|
|||||||
- name: HIP_TESTS_PIPELINE_ID
|
- name: HIP_TESTS_PIPELINE_ID
|
||||||
value: 233
|
value: 233
|
||||||
- name: HIPBLAS_COMMON_PIPELINE_ID
|
- name: HIPBLAS_COMMON_PIPELINE_ID
|
||||||
value: 223
|
value: 300
|
||||||
- name: HIPBLAS_PIPELINE_ID
|
- name: HIPBLAS_PIPELINE_ID
|
||||||
value: 87
|
value: 87
|
||||||
- name: HIPBLASLT_PIPELINE_ID
|
- name: HIPBLASLT_PIPELINE_ID
|
||||||
value: 112
|
value: 301
|
||||||
- name: HIPCUB_PIPELINE_ID
|
- name: HIPCUB_PIPELINE_ID
|
||||||
value: 277
|
value: 277
|
||||||
- name: HIPFFT_PIPELINE_ID
|
- name: HIPFFT_PIPELINE_ID
|
||||||
@@ -86,7 +86,7 @@ variables:
|
|||||||
- name: HIPSPARSE_PIPELINE_ID
|
- name: HIPSPARSE_PIPELINE_ID
|
||||||
value: 83
|
value: 83
|
||||||
- name: HIPSPARSELT_PIPELINE_ID
|
- name: HIPSPARSELT_PIPELINE_ID
|
||||||
value: 104
|
value: 309
|
||||||
- name: HIPTENSOR_PIPELINE_ID
|
- name: HIPTENSOR_PIPELINE_ID
|
||||||
value: 105
|
value: 105
|
||||||
- name: LLVM_PROJECT_PIPELINE_ID
|
- name: LLVM_PROJECT_PIPELINE_ID
|
||||||
@@ -104,7 +104,7 @@ variables:
|
|||||||
- name: ROCALUTION_PIPELINE_ID
|
- name: ROCALUTION_PIPELINE_ID
|
||||||
value: 89
|
value: 89
|
||||||
- name: ROCBLAS_PIPELINE_ID
|
- name: ROCBLAS_PIPELINE_ID
|
||||||
value: 85
|
value: 302
|
||||||
- name: ROCDBGAPI_PIPELINE_ID
|
- name: ROCDBGAPI_PIPELINE_ID
|
||||||
value: 135
|
value: 135
|
||||||
- name: ROCDECODE_PIPELINE_ID
|
- name: ROCDECODE_PIPELINE_ID
|
||||||
@@ -154,7 +154,7 @@ variables:
|
|||||||
- name: ROCSOLVER_PIPELINE_ID
|
- name: ROCSOLVER_PIPELINE_ID
|
||||||
value: 81
|
value: 81
|
||||||
- name: ROCSPARSE_PIPELINE_ID
|
- name: ROCSPARSE_PIPELINE_ID
|
||||||
value: 98
|
value: 314
|
||||||
- name: ROCTHRUST_PIPELINE_ID
|
- name: ROCTHRUST_PIPELINE_ID
|
||||||
value: 276
|
value: 276
|
||||||
- name: ROCTRACER_PIPELINE_ID
|
- name: ROCTRACER_PIPELINE_ID
|
||||||
|
|||||||
@@ -721,11 +721,13 @@ linearized
|
|||||||
linter
|
linter
|
||||||
linux
|
linux
|
||||||
llvm
|
llvm
|
||||||
|
lm
|
||||||
localscratch
|
localscratch
|
||||||
logits
|
logits
|
||||||
lossy
|
lossy
|
||||||
macOS
|
macOS
|
||||||
matchers
|
matchers
|
||||||
|
megatron
|
||||||
microarchitecture
|
microarchitecture
|
||||||
migraphx
|
migraphx
|
||||||
migratable
|
migratable
|
||||||
@@ -797,6 +799,7 @@ quantile
|
|||||||
quantizer
|
quantizer
|
||||||
quasirandom
|
quasirandom
|
||||||
queueing
|
queueing
|
||||||
|
qwen
|
||||||
radeon
|
radeon
|
||||||
rccl
|
rccl
|
||||||
rdc
|
rdc
|
||||||
|
|||||||
CHANGELOG.md
@@ -190,9 +190,8 @@ for a complete overview of this release.
|
|||||||
|
|
||||||
* When using the `--follow` flag with `amd-smi ras --cper`, CPER entries are not streamed continuously as intended. This will be fixed in an upcoming ROCm release.
|
* When using the `--follow` flag with `amd-smi ras --cper`, CPER entries are not streamed continuously as intended. This will be fixed in an upcoming ROCm release.
|
||||||
|
|
||||||
```{note}
|
> [!NOTE]
|
||||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
> See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
||||||
```
|
|
||||||
|
|
||||||
### **HIP** (6.4.1)
|
### **HIP** (6.4.1)
|
||||||
|
|
||||||
@@ -273,9 +272,8 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
|
|||||||
|
|
||||||
- Fixed partition enumeration. It now refers to the correct DRM Render and Card paths.
|
- Fixed partition enumeration. It now refers to the correct DRM Render and Card paths.
|
||||||
|
|
||||||
```{note}
|
> [!NOTE]
|
||||||
See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
> See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
||||||
```
|
|
||||||
|
|
||||||
### **ROCm Systems Profiler** (1.0.1)
|
### **ROCm Systems Profiler** (1.0.1)
|
||||||
|
|
||||||
@@ -413,9 +411,8 @@ Some workaround options are as follows:
|
|||||||
|
|
||||||
- The `pasid` field in struct `amdsmi_process_info_t` will be deprecated in a future ROCm release.
|
- The `pasid` field in struct `amdsmi_process_info_t` will be deprecated in a future ROCm release.
|
||||||
|
|
||||||
```{note}
|
> [!NOTE]
|
||||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
> See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
||||||
```
|
|
||||||
|
|
||||||
### **AMDMIGraphX** (2.12.0)
|
### **AMDMIGraphX** (2.12.0)
|
||||||
|
|
||||||
@@ -1023,9 +1020,8 @@ The following lists the backward incompatible changes planned for upcoming major
|
|||||||
|
|
||||||
- Fixed `rsmi_dev_target_graphics_version_get`, `rocm-smi --showhw`, and `rocm-smi --showprod` not displaying graphics version correctly for Instinct MI200 series, MI100 series, and RDNA3-based GPUs.
|
- Fixed `rsmi_dev_target_graphics_version_get`, `rocm-smi --showhw`, and `rocm-smi --showprod` not displaying graphics version correctly for Instinct MI200 series, MI100 series, and RDNA3-based GPUs.
|
||||||
|
|
||||||
```{note}
|
> [!NOTE]
|
||||||
See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
> See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
|
||||||
```
|
|
||||||
|
|
||||||
### **ROCm Systems Profiler** (1.0.0)
|
### **ROCm Systems Profiler** (1.0.0)
|
||||||
|
|
||||||
@@ -1451,9 +1447,8 @@ for a complete overview of this release.
|
|||||||
* Fixed `amd-smi monitor`'s reporting of encode and decode information. `VCLOCK` and `DCLOCK` are
|
* Fixed `amd-smi monitor`'s reporting of encode and decode information. `VCLOCK` and `DCLOCK` are
|
||||||
now associated with both `ENC_UTIL` and `DEC_UTIL`.
|
now associated with both `ENC_UTIL` and `DEC_UTIL`.
|
||||||
|
|
||||||
```{note}
|
> [!NOTE]
|
||||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANGELOG.md) for more details and examples.
|
> See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANGELOG.md) for more details and examples.
|
||||||
```
|
|
||||||
|
|
||||||
### **HIP** (6.3.1)
|
### **HIP** (6.3.1)
|
||||||
|
|
||||||
@@ -1657,9 +1652,8 @@ for a complete overview of this release.
|
|||||||
- The new partition command can display GPU information, including memory and accelerator partition information.
|
- The new partition command can display GPU information, including memory and accelerator partition information.
|
||||||
- The command will be at full functionality once additional partition information from `amdsmi_get_gpu_accelerator_partition_profile()` has been implemented.
|
- The command will be at full functionality once additional partition information from `amdsmi_get_gpu_accelerator_partition_profile()` has been implemented.
|
||||||
|
|
||||||
```{note}
|
> [!NOTE]
|
||||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANGELOG.md) for more details and examples.
|
> See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANGELOG.md) for more details and examples.
|
||||||
```
|
|
||||||
|
|
||||||
### **HIP** (6.3.0)
|
### **HIP** (6.3.0)
|
||||||
|
|
||||||
@@ -1793,18 +1787,17 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANG
|
|||||||
|
|
||||||
* Support for `fp8` data types
|
* Support for `fp8` data types
|
||||||
|
|
||||||
### **hipRAND** (2.11.0[*](#id22))
|
### **hipRAND** (2.11.0)
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> In ROCm 6.3.0, the hipRAND package version is incorrectly set to `2.11.0`.
|
||||||
|
> In ROCm 6.2.4, the hipRAND package version was `2.11.1`.
|
||||||
|
> The hipRAND version number will be corrected in a future ROCm release.
|
||||||
|
|
||||||
#### Changed
|
#### Changed
|
||||||
|
|
||||||
* Updated the default value for the `-a` argument from `rmake.py` to `gfx906:xnack-,gfx1030,gfx1100,gfx1101,gfx1102`.
|
* Updated the default value for the `-a` argument from `rmake.py` to `gfx906:xnack-,gfx1030,gfx1100,gfx1101,gfx1102`.
|
||||||
|
|
||||||
#### Known issues
|
|
||||||
|
|
||||||
* In ROCm 6.3.0, the hipRAND package version is incorrectly set to `2.11.0`. In ROCm
|
|
||||||
6.2.4, the hipRAND package version was `2.11.1`. The hipRAND version number will be corrected in a
|
|
||||||
future ROCm release.
|
|
||||||
|
|
||||||
#### Resolved issues
|
#### Resolved issues
|
||||||
|
|
||||||
* Fixed an issue in `rmake.py` where the list storing the CMake options would contain individual characters instead of a full string of options.
|
* Fixed an issue in `rmake.py` where the list storing the CMake options would contain individual characters instead of a full string of options.
|
||||||
@@ -2005,7 +1998,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANG
|
|||||||
|
|
||||||
#### Known issues
|
#### Known issues
|
||||||
|
|
||||||
* See [MIVisionX memory access fault in Canny edge detection](#mivisionx-memory-access-fault-in-canny-edge-detection).
|
* See [MIVisionX memory access fault in Canny edge detection](https://github.com/ROCm/ROCm/issues/4086).
|
||||||
* Package installation requires the manual installation of OpenCV.
|
* Package installation requires the manual installation of OpenCV.
|
||||||
* Installation on CentOS/RedHat/SLES requires the manual installation of the `FFMPEG Dev` package.
|
* Installation on CentOS/RedHat/SLES requires the manual installation of the `FFMPEG Dev` package.
|
||||||
* Hardware decode requires installation with `--usecase=graphics` in addition to `--usecase=rocm`.
|
* Hardware decode requires installation with `--usecase=graphics` in addition to `--usecase=rocm`.
|
||||||
@@ -2196,9 +2189,9 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANG
|
|||||||
|
|
||||||
#### Known issues
|
#### Known issues
|
||||||
|
|
||||||
- See [ROCm Compute Profiler post-upgrade](#rocm-compute-profiler-post-upgrade).
|
- See [ROCm Compute Profiler post-upgrade](https://github.com/ROCm/ROCm/issues/4082).
|
||||||
|
|
||||||
- See [ROCm Compute Profiler CTest failure in CI](#rocm-compute-profiler-ctest-failure-in-ci).
|
- See [ROCm Compute Profiler CTest failure in CI](https://github.com/ROCm/ROCm/issues/4085).
|
||||||
|
|
||||||
### **ROCm Data Center Tool** (0.3.0)
|
### **ROCm Data Center Tool** (0.3.0)
|
||||||
|
|
||||||
@@ -2211,7 +2204,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANG
|
|||||||
|
|
||||||
#### Known issues
|
#### Known issues
|
||||||
|
|
||||||
- See [ROCm Data Center Tool incorrect RHEL9 package version](#rocm-data-center-tool-incorrect-rhel9-package-version).
|
- See [ROCm Data Center Tool incorrect RHEL9 package version](https://github.com/ROCm/ROCm/issues/4089).
|
||||||
|
|
||||||
### **ROCm SMI** (7.4.0)
|
### **ROCm SMI** (7.4.0)
|
||||||
|
|
||||||
@@ -2249,9 +2242,8 @@ memory partition modes upon an invalid argument return from memory partition mod
|
|||||||
|
|
||||||
- C++ tests for `memorypartition_read_write` are to be re-enabled in a future ROCm release.
|
- C++ tests for `memorypartition_read_write` are to be re-enabled in a future ROCm release.
|
||||||
|
|
||||||
```{note}
|
> [!NOTE]
|
||||||
See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/6.3.x/CHANGELOG.md) for more details and examples.
|
> See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/6.3.x/CHANGELOG.md) for more details and examples.
|
||||||
```
|
|
||||||
|
|
||||||
### **ROCm Systems Profiler** (0.1.0)
|
### **ROCm Systems Profiler** (0.1.0)
|
||||||
|
|
||||||
@@ -2265,7 +2257,7 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/6.3.

#### Known issues

-- See [ROCm Systems Profiler post-upgrade](#rocm-systems-profiler-post-upgrade).
+- See [ROCm Systems Profiler post-upgrade](https://github.com/ROCm/ROCm/issues/4083).

### **ROCm Validation Suite** (1.1.0)

@@ -2279,7 +2271,7 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/6.3.

#### Known issues

-- See [ROCm Validation Suite needs specified configuration file](#rocm-validation-suite-needs-specified-configuration-file).
+- See [ROCm Validation Suite needs specified configuration file](https://github.com/ROCm/ROCm/issues/4090).

### **rocPRIM** (3.3.0)

@@ -3022,10 +3014,8 @@ for a complete overview of this release.

See [issue #3500](https://github.com/ROCm/ROCm/issues/3500) on GitHub.

-```{note}
-See the [detailed AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/docs/6.2.0/CHANGELOG.md)
-on GitHub for more information.
-```
+> [!NOTE]
+> See the [detailed AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/docs/6.2.0/CHANGELOG.md) on GitHub for more information.

### **Composable Kernel** (1.1.0)

@@ -3624,9 +3614,8 @@ The compiler may incorrectly compile a program that uses the
the function is undefined along some path to the function. For most functions,
uninitialized inputs cause undefined behavior.

-```{note}
-The ``-Wall`` compilation flag prompts the compiler to generate a warning if a variable is uninitialized along some path.
-```
+> [!NOTE]
+> The ``-Wall`` compilation flag prompts the compiler to generate a warning if a variable is uninitialized along some path.

As a workaround, initialize the parameters to ``__shfl``. For example:

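The original snippet does not appear in this hunk; the following is a minimal illustrative sketch of the workaround (the kernel name, guard condition, and broadcast lane are hypothetical, not taken from the release notes), showing the variable being assigned on every control-flow path before it reaches ``__shfl``:

```cpp
#include <hip/hip_runtime.h>

// Hypothetical kernel: broadcasts lane 0's value across the wavefront.
__global__ void broadcast_lane0(const float* in, float* out)
{
    // Workaround: initialize the value unconditionally so no path
    // reaches __shfl with an uninitialized variable.
    float v = 0.0f;
    if (threadIdx.x < 64) {  // without the initializer above, v would be
        v = in[threadIdx.x]; // undefined on the path that skips this branch
    }
    out[threadIdx.x] = __shfl(v, 0);
}
```
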
@@ -3947,10 +3936,8 @@ See [issue #3498](https://github.com/ROCm/ROCm/issues/3498) on GitHub.

- Fixed Partition ID CLI output.

-```{note}
-See the [detailed ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/docs/6.2.0/CHANGELOG.md)
-on GitHub for more information.
-```
+> [!NOTE]
+> See the [detailed ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/docs/6.2.0/CHANGELOG.md) on GitHub for more information.

### **ROCm Validation Suite** (1.0.0)

@@ -4320,9 +4307,8 @@ for a complete overview of this release.
* Fixed the `amdsmitstReadWrite.TestPowerCapReadWrite` test for RDNA3, RDNA2, and MI100 devices.
* Fixed an issue with the `amdsmi_get_gpu_memory_reserved_pages` and `amdsmi_get_gpu_bad_page_info` Python interface calls.

-```{note}
-See the AMD SMI [detailed changelog](https://github.com/ROCm/amdsmi/blob/rocm-6.1.x/CHANGELOG.md) with code samples for more information.
-```
+> [!NOTE]
+> See the AMD SMI [detailed changelog](https://github.com/ROCm/amdsmi/blob/rocm-6.1.x/CHANGELOG.md) with code samples for more information.

### **RCCL** (2.18.6)

@@ -4402,9 +4388,8 @@ for a complete overview of this release.

- `amd-smi bad-pages` can result in a `ValueError: Null pointer access` error when using some PMU firmware versions.

-```{note}
-See the [detailed changelog](https://github.com/ROCm/amdsmi/blob/docs/6.1.1/CHANGELOG.md) with code samples for more information.
-```
+> [!NOTE]
+> See the [detailed changelog](https://github.com/ROCm/amdsmi/blob/docs/6.1.1/CHANGELOG.md) with code samples for more information.

### **hipBLASLt** (0.7.0)

@@ -4473,9 +4458,8 @@ See the [detailed changelog](https://github.com/ROCm/amdsmi/blob/docs/6.1.1/CHAN

- ROCm SMI reports GPU utilization incorrectly for RDNA3 GPUs in some situations. See the issue on [GitHub](https://github.com/ROCm/ROCm/issues/3112).

-```{note}
-See the [detailed ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/docs/6.1.1/CHANGELOG.md) with code samples for more information.
-```
+> [!NOTE]
+> See the [detailed ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/docs/6.1.1/CHANGELOG.md) with code samples for more information.

## ROCm 6.1.0

@@ -5192,16 +5176,16 @@ on GitHub for a complete overview of this release.

### **rocSPARSE** (2.5.4)

-##### Added
+#### Added

- Added more mixed precisions for SpMV, (matrix: float, vectors: double, calculation: double) and (matrix: rocsparse_float_complex, vectors: rocsparse_double_complex, calculation: rocsparse_double_complex)
- Added support for gfx940, gfx941 and gfx942

-##### Optimized
+#### Optimized

- Fixed a bug in csrsm and bsrsm

-##### Known issues
+#### Known issues

In csritlu0, the algorithm rocsparse_itilu0_alg_sync_split_fusion has some accuracy issues to investigate with XNACK enabled. The fallback is rocsparse_itilu0_alg_sync_split.

@@ -5287,7 +5271,7 @@ on GitHub for a complete overview of this release.

### **HIP** (5.6.0)

-##### Added
+#### Added

- Added hipRTC support for amd_hip_fp16
- Added hipStreamGetDevice implementation to get the device associated with the stream
@@ -5296,7 +5280,7 @@ on GitHub for a complete overview of this release.
- hipArrayGetDescriptor for getting 1D or 2D array descriptor
- hipArray3DGetDescriptor to get 3D array descriptor

-##### Changed
+#### Changed

- hipMallocAsync to return success for zero size allocation to match hipMalloc
- Separation of hipcc perl binaries from HIP project to hipcc project. hip-devel package depends on newly added hipcc package
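As an illustrative aside (not part of the changelog), a minimal host-side sketch of the two HIP 5.6 behaviors noted above; it assumes the `hipStreamGetDevice(hipStream_t, hipDevice_t*)` signature introduced alongside this release:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main()
{
    hipStream_t stream;
    if (hipStreamCreate(&stream) != hipSuccess) return 1;

    // Query the device associated with the stream (added in HIP 5.6).
    hipDevice_t device = -1;
    if (hipStreamGetDevice(stream, &device) == hipSuccess)
        std::printf("stream is bound to device %d\n", device);

    // A zero-size async allocation now returns hipSuccess, matching hipMalloc.
    void* ptr = nullptr;
    hipError_t err = hipMallocAsync(&ptr, 0, stream);
    std::printf("hipMallocAsync(0 bytes): %s\n", hipGetErrorString(err));

    if (ptr) hipFreeAsync(ptr, stream);
    hipStreamSynchronize(stream);
    hipStreamDestroy(stream);
    return 0;
}
```

Built with `hipcc`, this should report the stream's device index and `hipSuccess` for the zero-byte allocation, matching the behavior described in the bullets above.
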
@@ -5601,15 +5585,15 @@ $ gcc main.c -I/opt/rocm-5.6.0/include -L/opt/rocm-5.6.0/lib -lrocprofiler64-v2
The resulting `a.out` will depend on
`/opt/rocm-5.6.0/lib/librocprofiler64.so.2`.

-##### Added
+#### Added

- 'end_time' need to be disabled in roctx_trace.txt

-##### Optimized
+#### Optimized

- Improved Test Suite

-##### Resolved issues
+#### Resolved issues

- rocprof in ROcm/5.4.0 gpu selector broken.
- rocprof in ROCm/5.4.1 fails to generate kernel info.
@@ -30,6 +30,9 @@ ROCm Version,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
+:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`,2.4.0,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,
@@ -54,8 +54,8 @@ compatibility and system requirements.
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31
-:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,`85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
+:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,85f95ae
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`,2.4.0,2.4.0,N/A
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3
,,,
@@ -10,10 +10,10 @@
PyTorch compatibility
********************************************************************************

-`PyTorch <https://pytorch.org/>`_ is an open-source tensor library designed for
+`PyTorch <https://pytorch.org/>`__ is an open-source tensor library designed for
deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
-using `MIOpen <https://github.com/ROCm/MIOpen>`_ and
-`RCCL <https://github.com/ROCm/rccl>`_ libraries.
+using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
+`RCCL <https://github.com/ROCm/rccl>`__ libraries.

ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due
to independent compatibility considerations, this results in two distinct
@@ -27,7 +27,7 @@ release cycles for PyTorch on ROCm:
- Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
preinstalled.

-- ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`_
+- ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`__

- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
to get started.
@@ -37,10 +37,10 @@ release cycles for PyTorch on ROCm:
- Provides the latest stable version of PyTorch but might not necessarily
support the latest ROCm version.

-- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`_
+- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__

-- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`_
-or `Previous versions <https://pytorch.org/get-started/previous-versions/>`_
+- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`__
+or `Previous versions <https://pytorch.org/get-started/previous-versions/>`__
to get started.

PyTorch includes tooling that generates HIP source code from the CUDA backend.
@@ -82,7 +82,7 @@ Use cases and recommendations
use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
GPU hardware for training and inference tasks in AI applications.

-For more use cases and recommendations, see `ROCm PyTorch blog posts <https://rocm.blogs.amd.com/blog/tag/pytorch.html>`_.
+For more use cases and recommendations, see `ROCm PyTorch blog posts <https://rocm.blogs.amd.com/blog/tag/pytorch.html>`__.

.. _pytorch-docker-compat:

@@ -93,9 +93,9 @@ Docker image compatibility

<i class="fab fa-docker"></i>

-AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`_
+AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`__
with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories were tested on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_.
+inventories were tested on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
Click |docker-icon| to view the image on Docker Hub.

.. list-table:: PyTorch Docker image components
@@ -118,121 +118,121 @@ Click |docker-icon| to view the image on Docker Hub.

<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-c76af9bfb1c25b0f40d4c29e8652105c57250bf018d23ff595b06bd79666fdd7"><i class="fab fa-docker fa-lg"></i></a>

-- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
+- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
- 24.04
-- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-- `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`_
-- `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`_
-- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
-- `master <https://bitbucket.org/icl/magma/src/master/>`_
-- `1.16.0 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
-- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
+- `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
+- `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
+- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`__
+- `master <https://bitbucket.org/icl/magma/src/master/>`__
+- `1.16.0 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`__

* - .. raw:: html

<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-f9d226135d51831c810dcb1251636ec61f85c65fcdda03e188c053a5d4f6585b"><i class="fab fa-docker fa-lg"></i></a>

-- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
+- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
- 22.04
-- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-- `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`_
-- `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`_
-- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
-- `master <https://bitbucket.org/icl/magma/src/master/>`_
-- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
-- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
+- `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
+- `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
+- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`__
+- `master <https://bitbucket.org/icl/magma/src/master/>`__
+- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`__

* - .. raw:: html

<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-3490e74d4f43dcdb3351dd334108d1ccd47e5a687c0523a2424ac1bcdd3dd6dd"><i class="fab fa-docker fa-lg"></i></a>

-- `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
+- `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
- 24.04
-- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-- `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
-- `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
-- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
-- `master <https://bitbucket.org/icl/magma/src/master/>`_
-- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.10.0>`_
-- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
-- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
+- `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+- `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`__
+- `master <https://bitbucket.org/icl/magma/src/master/>`__
+- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.10.0>`__
+- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`__

* - .. raw:: html

<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-26c5dfffb4a54625884abca83166940f17dd27bc75f1b24f6e80fbcb7d4e9afb"><i class="fab fa-docker fa-lg"></i></a>

-- `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
+- `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
- 22.04
-- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-- `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
-- `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
-- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
-- `master <https://bitbucket.org/icl/magma/src/master/>`_
-- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
-- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
+- `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+- `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`__
+- `master <https://bitbucket.org/icl/magma/src/master/>`__
+- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`__

* - .. raw:: html

<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-f378a24561fa6efc178b6dc93fc7d82e5b93653ecd59c89d4476674d29e1284d"><i class="fab fa-docker fa-lg"></i></a>

-- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
- 24.04
-- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
-- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
-- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
-- `master <https://bitbucket.org/icl/magma/src/master/>`_
-- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
-- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
+- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
+- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
+- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`__
+- `master <https://bitbucket.org/icl/magma/src/master/>`__
+- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`__

* - .. raw:: html

<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-2308dbd0e650b7bf8d548575cbb6e2bdc021f9386384ce570da16d58ee684d22"><i class="fab fa-docker fa-lg"></i></a>

-- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
- 22.04
-- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
-- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
-- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
-- `master <https://bitbucket.org/icl/magma/src/master/>`_
-- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
-- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
+- `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
+- `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
+- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`__
+- `master <https://bitbucket.org/icl/magma/src/master/>`__
+- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`__

* - .. raw:: html

<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-eefd2ab019728f91f94c5e6a9463cb0ea900b3011458d18fe5d88e50c0b57d86"><i class="fab fa-docker fa-lg"></i></a>

-- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
+- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
- 24.04
-- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
-- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
-- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
-- `master <https://bitbucket.org/icl/magma/src/master/>`_
-- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
-- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
+- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`__
+- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`__
+- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`__
+- `master <https://bitbucket.org/icl/magma/src/master/>`__
+- `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+- `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`__

* - .. raw:: html

<a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-473643226ab0e93a04720b256ed772619878abf9c42b9f84828cefed522696fd"><i class="fab fa-docker fa-lg"></i></a>

-- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
+- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
- 22.04
-- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
-- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
-- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
-- `master <https://bitbucket.org/icl/magma/src/master/>`_
-- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
-- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
+- `3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
+- `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`__
+- `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`__
+- `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`__
+- `master <https://bitbucket.org/icl/magma/src/master/>`__
+- `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+- `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+- `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`__

Key ROCm libraries for PyTorch
================================================================================
@@ -248,121 +248,121 @@ feature set available to developers.
- Version
- Purpose
- Used in
-* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
+* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`__
- :version-ref:`"Composable Kernel" rocm_version`
- Enables faster execution of core operations like matrix multiplication
(GEMM), convolutions and transformations.
- Speeds up ``torch.permute``, ``torch.view``, ``torch.matmul``,
``torch.mm``, ``torch.bmm``, ``torch.nn.Conv2d``, ``torch.nn.Conv3d``
and ``torch.nn.MultiheadAttention``.
-* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
+* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
- :version-ref:`hipBLAS rocm_version`
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
- Supports operations such as matrix multiplication, matrix-vector
products, and tensor contractions. Utilized in both dense and batched
linear algebra operations.
-* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
+* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
- :version-ref:`hipBLASLt rocm_version`
- hipBLASLt is an extension of the hipBLAS library, providing additional
features like epilogues fused into the matrix multiplication kernel or
use of integer tensor cores.
- Accelerates operations such as ``torch.matmul``, ``torch.mm``, and the
matrix multiplications used in convolutional and linear layers.
-* - `hipCUB <https://github.com/ROCm/hipCUB>`_
+* - `hipCUB <https://github.com/ROCm/hipCUB>`__
- :version-ref:`hipCUB rocm_version`
- Provides a C++ template library for parallel algorithms for reduction,
scan, sort and select.
- Supports operations such as ``torch.sum``, ``torch.cumsum``,
``torch.sort`` irregular shapes often involve scanning, sorting, and
filtering, which hipCUB handles efficiently.
-* - `hipFFT <https://github.com/ROCm/hipFFT>`_
+* - `hipFFT <https://github.com/ROCm/hipFFT>`__
- :version-ref:`hipFFT rocm_version`
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
- Used in functions like the ``torch.fft`` module.
-* - `hipRAND <https://github.com/ROCm/hipRAND>`_
+* - `hipRAND <https://github.com/ROCm/hipRAND>`__
- :version-ref:`hipRAND rocm_version`
- Provides fast random number generation for GPUs.
- The ``torch.rand``, ``torch.randn``, and stochastic layers like
``torch.nn.Dropout`` rely on hipRAND.
-* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
+* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`__
- :version-ref:`hipSOLVER rocm_version`
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
singular value decompositions (SVD).
- Supports functions like ``torch.linalg.solve``,
``torch.linalg.eig``, and ``torch.linalg.svd``.
-* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
+* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`__
- :version-ref:`hipSPARSE rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
- Sparse tensor operations ``torch.sparse``.
-* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
+* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`__
- :version-ref:`hipSPARSELt rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
- Sparse tensor operations ``torch.sparse``.
-* - `hipTensor <https://github.com/ROCm/hipTensor>`_
+* - `hipTensor <https://github.com/ROCm/hipTensor>`__
- :version-ref:`hipTensor rocm_version`
- Optimizes for high-performance tensor operations, such as contractions.
- Accelerates tensor algebra, especially in deep learning and scientific
computing.
-* - `MIOpen <https://github.com/ROCm/MIOpen>`_
+* - `MIOpen <https://github.com/ROCm/MIOpen>`__
- :version-ref:`MIOpen rocm_version`
- Optimizes deep learning primitives such as convolutions, pooling,
normalization, and activation functions.
- Speeds up convolutional neural networks (CNNs), recurrent neural
networks (RNNs), and other layers. Used in operations like
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
-* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
+* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`__
- :version-ref:`MIGraphX rocm_version`
- Adds graph-level optimizations, ONNX models and mixed precision support
and enable Ahead-of-Time (AOT) Compilation.
- Speeds up inference models and executes ONNX models for
compatibility with other frameworks.
``torch.nn.Conv2d``, ``torch.nn.ReLU``, and ``torch.nn.LSTM``.
-* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
+* - `MIVisionX <https://github.com/ROCm/MIVisionX>`__
- :version-ref:`MIVisionX rocm_version`
- Optimizes acceleration for computer vision and AI workloads like
preprocessing, augmentation, and inferencing.
- Faster data preprocessing and augmentation pipelines for datasets like
ImageNet or COCO and easy to integrate into PyTorch's ``torch.utils.data``
and ``torchvision`` workflows.
-* - `rocAL <https://github.com/ROCm/rocAL>`_
+* - `rocAL <https://github.com/ROCm/rocAL>`__
- :version-ref:`rocAL rocm_version`
- Accelerates the data pipeline by offloading intensive preprocessing and
augmentation tasks. rocAL is part of MIVisionX.
- Easy to integrate into PyTorch's ``torch.utils.data`` and
``torchvision`` data load workloads.
-* - `RCCL <https://github.com/ROCm/rccl>`_
+* - `RCCL <https://github.com/ROCm/rccl>`__
- :version-ref:`RCCL rocm_version`
- Optimizes for multi-GPU communication for operations like AllReduce and
Broadcast.
- Distributed data parallel training (``torch.nn.parallel.DistributedDataParallel``).
Handles communication in multi-GPU setups.
-* - `rocDecode <https://github.com/ROCm/rocDecode>`_
+* - `rocDecode <https://github.com/ROCm/rocDecode>`__
- :version-ref:`rocDecode rocm_version`
- Provides hardware-accelerated data decoding capabilities, particularly
for image, video, and other dataset formats.
- Can be integrated in ``torch.utils.data``, ``torchvision.transforms``
and ``torch.distributed``.
-* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
+* - `rocJPEG <https://github.com/ROCm/rocJPEG>`__
- :version-ref:`rocJPEG rocm_version`
- Provides hardware-accelerated JPEG image decoding and encoding.
- GPU accelerated ``torchvision.io.decode_jpeg`` and
``torchvision.io.encode_jpeg`` and can be integrated in
``torch.utils.data`` and ``torchvision``.
-* - `RPP <https://github.com/ROCm/RPP>`_
+* - `RPP <https://github.com/ROCm/RPP>`__
- :version-ref:`RPP rocm_version`
- Speeds up data augmentation, transformation, and other preprocessing steps.
- Easy to integrate into PyTorch's ``torch.utils.data`` and
``torchvision`` data load workloads to speed up data processing.
-* - `rocThrust <https://github.com/ROCm/rocThrust>`_
+* - `rocThrust <https://github.com/ROCm/rocThrust>`__
- :version-ref:`rocThrust rocm_version`
- Provides a C++ template library for parallel algorithms like sorting,
reduction, and scanning.
- Utilized in backend operations for tensor computations requiring
parallel processing.
-* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
+* - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
- :version-ref:`rocWMMA rocm_version`
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
multiplication (GEMM) and accumulation operations with mixed precision
@@ -383,7 +383,7 @@ Supported data types
The tensor data type is specified using the ``dtype`` attribute or argument.
PyTorch supports many data types for different use cases.

-The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`_
+The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`__
single data types:

.. list-table::
@@ -10,16 +10,16 @@
TensorFlow compatibility
*******************************************************************************

-`TensorFlow <https://www.tensorflow.org/>`_ is an open-source library for
+`TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
solving machine learning, deep learning, and AI problems. It can solve many
problems across different sectors and industries but primarily focuses on
neural network training and inference. It is one of the most popular and
in-demand frameworks and is very active in open-source contribution and
development.

-The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`_
+The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
-<http://github.com/rocm/tensorflow-upstream>`_ in order to quickly add bug
+<http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
fixes, updates, and support for the latest ROCM versions.

- ROCm TensorFlow release:
@@ -27,16 +27,16 @@ fixes, updates, and support for the latest ROCM versions.
- Offers :ref:`Docker images <tensorflow-docker-compat>` with
ROCm and TensorFlow pre-installed.

-- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`_
+- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__

- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
to get started.

- Official TensorFlow release:

-- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`_
+- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__

-- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`_ list.
+- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.

.. note::

@@ -54,9 +54,9 @@ Docker image compatibility
<i class="fab fa-docker"></i>

AMD validates and publishes ready-made `TensorFlow images
-<https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
+<https://hub.docker.com/r/rocm/tensorflow>`__ with ROCm backends on
Docker Hub. The following Docker image tags and associated inventories are
-validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click
+validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__. Click
the |docker-icon| icon to view the image on Docker Hub.

.. list-table:: TensorFlow Docker image components
@@ -76,8 +76,8 @@ the |docker-icon| icon to view the image on Docker Hub.
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- 24.04
-- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
+- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
+- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__

* - .. raw:: html

@@ -86,8 +86,8 @@ the |docker-icon| icon to view the image on Docker Hub.
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- runtime
- 24.04
-- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
+- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
+- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__

* - .. raw:: html

@@ -96,8 +96,8 @@ the |docker-icon| icon to view the image on Docker Hub.
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- dev
- 22.04
-- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
+- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
+- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__

* - .. raw:: html

@@ -106,8 +106,8 @@ the |docker-icon| icon to view the image on Docker Hub.
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- runtime
- 22.04
-- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
-- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
+- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
+- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__

* - .. raw:: html

@@ -116,8 +116,8 @@ the |docker-icon| icon to view the image on Docker Hub.
- `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- 24.04
-- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
-- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
+- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
+- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__

* - .. raw:: html

@@ -126,8 +126,8 @@ the |docker-icon| icon to view the image on Docker Hub.
    - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
    - runtime
    - 24.04
    - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
    - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
    - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
    - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__

  * - .. raw:: html

@@ -136,8 +136,8 @@ the |docker-icon| icon to view the image on Docker Hub.
    - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
    - dev
    - 22.04
    - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
    - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
    - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
    - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__

  * - .. raw:: html

@@ -146,8 +146,8 @@ the |docker-icon| icon to view the image on Docker Hub.
    - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
    - runtime
    - 22.04
    - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
    - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
    - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
    - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__

  * - .. raw:: html

@@ -156,8 +156,8 @@ the |docker-icon| icon to view the image on Docker Hub.
    - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
    - dev
    - 24.04
    - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
    - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
    - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
    - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__

  * - .. raw:: html

@@ -166,8 +166,8 @@ the |docker-icon| icon to view the image on Docker Hub.
    - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
    - runtime
    - 24.04
    - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
    - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
    - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
    - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__

  * - .. raw:: html

@@ -176,8 +176,8 @@ the |docker-icon| icon to view the image on Docker Hub.
    - `tensorflow-rocm 2.16.2 <https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-dev/images/sha256-36c4fa047c86e2470ac473ec1429aea6d4b8934b90ffeb34d1afab40e7e5b377>`__
    - dev
    - 22.04
    - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
    - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
    - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
    - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__

  * - .. raw:: html

@@ -186,8 +186,8 @@ the |docker-icon| icon to view the image on Docker Hub.
    - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
    - runtime
    - 22.04
    - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
    - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`__
    - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
    - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__


Critical ROCm libraries for TensorFlow
@@ -207,43 +207,43 @@ are available in ROCm :version:`rocm_version`.
    - Version
    - Purpose
    - Used in
  * - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
  * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
    - :version-ref:`hipBLAS rocm_version`
    - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
      matrix and vector operations.
    - Accelerates operations like ``tf.matmul``, ``tf.linalg.matmul``, and
      other matrix multiplications commonly used in neural network layers.
  * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
  * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
    - :version-ref:`hipBLASLt rocm_version`
    - Extends hipBLAS with additional optimizations like fused kernels and
      integer tensor cores.
    - Optimizes matrix multiplications and linear algebra operations used in
      layers like dense, convolutional, and RNNs in TensorFlow.
  * - `hipCUB <https://github.com/ROCm/hipCUB>`_
  * - `hipCUB <https://github.com/ROCm/hipCUB>`__
    - :version-ref:`hipCUB rocm_version`
    - Provides a C++ template library for parallel algorithms for reduction,
      scan, sort and select.
    - Supports operations like ``tf.reduce_sum``, ``tf.cumsum``, ``tf.sort``
      and other tensor operations in TensorFlow, especially those involving
      scanning, sorting, and filtering.
  * - `hipFFT <https://github.com/ROCm/hipFFT>`_
  * - `hipFFT <https://github.com/ROCm/hipFFT>`__
    - :version-ref:`hipFFT rocm_version`
    - Accelerates Fast Fourier Transforms (FFT) for signal processing tasks.
    - Used for operations like signal processing, image filtering, and
      certain types of neural networks requiring FFT-based transformations.
  * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
  * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`__
    - :version-ref:`hipSOLVER rocm_version`
    - Provides GPU-accelerated direct linear solvers for dense and sparse
      systems.
    - Optimizes linear algebra functions such as solving systems of linear
      equations, often used in optimization and training tasks.
  * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
  * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`__
    - :version-ref:`hipSPARSE rocm_version`
    - Optimizes sparse matrix operations for efficient computations on sparse
      data.
    - Accelerates sparse matrix operations in models with sparse weight
      matrices or activations, commonly used in neural networks.
  * - `MIOpen <https://github.com/ROCm/MIOpen>`_
  * - `MIOpen <https://github.com/ROCm/MIOpen>`__
    - :version-ref:`MIOpen rocm_version`
    - Provides optimized deep learning primitives such as convolutions,
      pooling,
@@ -251,13 +251,13 @@ are available in ROCm :version:`rocm_version`.
    - Speeds up convolutional neural networks (CNNs) and other layers. Used
      in TensorFlow for layers like ``tf.nn.conv2d``, ``tf.nn.relu``, and
      ``tf.nn.lstm_cell``.
  * - `RCCL <https://github.com/ROCm/rccl>`_
  * - `RCCL <https://github.com/ROCm/rccl>`__
    - :version-ref:`RCCL rocm_version`
    - Optimizes for multi-GPU communication for operations like AllReduce and
      Broadcast.
    - Distributed data parallel training (``tf.distribute.MirroredStrategy``).
      Handles communication in multi-GPU setups.
  * - `rocThrust <https://github.com/ROCm/rocThrust>`_
  * - `rocThrust <https://github.com/ROCm/rocThrust>`__
    - :version-ref:`rocThrust rocm_version`
    - Provides a C++ template library for parallel algorithms like sorting,
      reduction, and scanning.
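The mapping above can be seen in an ordinary TensorFlow program. The sketch below is illustrative only and is not part of the compatibility table; it assumes a working ``tensorflow-rocm`` installation and simply exercises the kinds of operations the listed libraries accelerate.

.. code-block:: python

   # Illustrative sketch: operations that map onto the ROCm libraries above
   # (hipBLAS/hipBLASLt for matmul, hipCUB/rocThrust for reductions and sorting,
   # MIOpen for convolutions). Shapes and values are arbitrary.
   import tensorflow as tf

   a = tf.random.uniform((1024, 1024), dtype=tf.float16)
   b = tf.random.uniform((1024, 1024), dtype=tf.float16)

   c = tf.matmul(a, b)                            # GEMM -> hipBLAS / hipBLASLt
   total = tf.reduce_sum(c)                       # reduction -> hipCUB / rocThrust
   order = tf.sort(tf.reshape(c, [-1])[:1024])    # sort -> hipCUB

   x = tf.random.uniform((1, 224, 224, 3))
   k = tf.random.uniform((3, 3, 3, 8))
   y = tf.nn.conv2d(x, k, strides=1, padding="SAME")   # convolution -> MIOpen

   print(total.numpy(), order.shape, y.shape)
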
@@ -278,7 +278,7 @@ The data type of a tensor is specified using the ``dtype`` attribute or
argument, and TensorFlow supports a wide range of data types for different use
cases.

The basic, single data types of `tf.dtypes <https://www.tensorflow.org/api_docs/python/tf/dtypes>`_
The basic, single data types of `tf.dtypes <https://www.tensorflow.org/api_docs/python/tf/dtypes>`__
are as follows:

.. list-table::
@@ -550,7 +550,7 @@ Use cases and recommendations
===============================================================================

* The `Training a Neural Collaborative Filtering (NCF) Recommender on an AMD
  GPU <https://rocm.blogs.amd.com/artificial-intelligence/ncf/README.html>`_
  GPU <https://rocm.blogs.amd.com/artificial-intelligence/ncf/README.html>`__
  blog post discusses training an NCF recommender system using TensorFlow. It
  explains how NCF improves traditional collaborative filtering methods by
  leveraging neural networks to model non-linear user-item interactions. The
@@ -559,7 +559,7 @@ Use cases and recommendations
  purchasing) and how it addresses challenges like the lack of negative values.

* The `Creating a PyTorch/TensorFlow code environment on AMD GPUs
  <https://rocm.blogs.amd.com/software-tools-optimization/pytorch-tensorflow-env/README.html>`_
  <https://rocm.blogs.amd.com/software-tools-optimization/pytorch-tensorflow-env/README.html>`__
  blog post provides instructions for creating a machine learning environment
  for PyTorch and TensorFlow on AMD GPUs using ROCm. It covers steps like
  installing the libraries, cloning code repositories, installing dependencies,
@@ -568,4 +568,4 @@ Use cases and recommendations
  for a better experience on AMD GPUs. This guide aims to help data scientists
  and ML practitioners adapt their code for AMD GPUs.

For more use cases and recommendations, see the `ROCm TensorFlow blog posts <https://rocm.blogs.amd.com/blog/tag/tensorflow.html>`_.
For more use cases and recommendations, see the `ROCm TensorFlow blog posts <https://rocm.blogs.amd.com/blog/tag/tensorflow.html>`__.
@@ -16,7 +16,7 @@ verl offers a scalable, open-source fine-tuning solution optimized for AMD Insti
* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl.
* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled.
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>` to get started.
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to get started.

.. note::

@@ -8,7 +8,7 @@ MI300 and MI200 series performance counters and metrics

This document lists and describes the hardware performance counters and derived metrics available
for the AMD Instinct™ MI300 and MI200 GPU. You can also access this information using the
:doc:`ROCProfiler tool <rocprofiler:rocprofv1>`.
:doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.

MI300 and MI200 series performance counters
===============================================================
76 docs/conf.py
@@ -12,6 +12,54 @@ from pathlib import Path
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
shutil.copy2("../CHANGELOG.md", "./release/changelog.md")

# Mark the consolidated changelog as orphan to prevent Sphinx from warning about missing toctree entries
with open("./release/changelog.md", "r+") as file:
    content = file.read()
    file.seek(0)
    file.write(":orphan:\n" + content)

# Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
with open("./release/changelog.md", "r") as file:
    lines = file.readlines()

modified_lines = []
in_admonition_section = False

# Map for matching the specific admonition type to its corresponding Sphinx markdown syntax
admonition_types = {
    '> [!NOTE]': '```{note}',
    '> [!TIP]': '```{tip}',
    '> [!IMPORTANT]': '```{important}',
    '> [!WARNING]': '```{warning}',
    '> [!CAUTION]': '```{caution}'
}

for line in lines:
    if any(line.startswith(k) for k in admonition_types):
        for key in admonition_types:
            if(line.startswith(key)):
                modified_lines.append(admonition_types[key] + '\n')
                break
        in_admonition_section = True
    elif in_admonition_section:
        if line.strip() == '':
            # If we encounter an empty line, close the admonition section
            modified_lines.append('```\n\n')  # Close the admonition block
            in_admonition_section = False
        else:
            modified_lines.append(line.lstrip('> '))
    else:
        modified_lines.append(line)

# In case the file ended while still in a admonition section, close it
if in_admonition_section:
    modified_lines.append('```')

file.close()

with open("./release/changelog.md", 'w') as file:
    file.writelines(modified_lines)

os.system("mkdir -p ../_readthedocs/html/downloads")
os.system("cp compatibility/compatibility-matrix-historical-6.0.csv ../_readthedocs/html/downloads/compatibility-matrix-historical-6.0.csv")

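For readers unfamiliar with the conversion above, the following sketch replays the same logic on a small in-memory sample instead of ``./release/changelog.md``; the sample changelog text is made up for illustration.

.. code-block:: python

   # Stand-alone sketch of the GitHub-admonition to MyST conversion above,
   # run against an in-memory sample rather than the changelog file.
   # Only two admonition types are listed here; the mapping above covers five.
   admonition_types = {'> [!NOTE]': '```{note}', '> [!WARNING]': '```{warning}'}

   sample = [
       "> [!NOTE]\n",
       "> This release is a point release.\n",   # made-up changelog text
       "\n",
       "An unrelated paragraph.\n",
   ]

   out, in_admonition = [], False
   for line in sample:
       if any(line.startswith(k) for k in admonition_types):
           key = next(k for k in admonition_types if line.startswith(k))
           out.append(admonition_types[key] + "\n")
           in_admonition = True
       elif in_admonition:
           if line.strip() == "":
               out.append("```\n\n")
               in_admonition = False
           else:
               out.append(line.lstrip("> "))
       else:
           out.append(line)

   print("".join(out))
   # ```{note}
   # This release is a point release.
   # ```
   #
   # An unrelated paragraph.
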
@@ -57,10 +105,22 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
@@ -72,7 +132,16 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

@@ -129,6 +198,7 @@ html_theme_options = {"link_main_doc": False}
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}

numfig = False
suppress_warnings = ["autosectionlabel.*"]

html_context = {
    "project_path" : {project_path},
@@ -0,0 +1,162 @@
|
|||||||
|
vllm_benchmark:
|
||||||
|
unified_docker:
|
||||||
|
latest:
|
||||||
|
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
|
||||||
|
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
|
||||||
|
rocm_version: 6.4.1
|
||||||
|
vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
|
||||||
|
pytorch_version: 2.7.0+gitf717b2a
|
||||||
|
hipblaslt_version: 0.15
|
||||||
|
model_groups:
|
||||||
|
- group: Meta Llama
|
||||||
|
tag: llama
|
||||||
|
models:
|
||||||
|
- model: Llama 3.1 8B
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-8b
|
||||||
|
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 3.1 70B
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-70b
|
||||||
|
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 3.1 405B
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-405b
|
||||||
|
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 2 7B
|
||||||
|
mad_tag: pyt_vllm_llama-2-7b
|
||||||
|
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 2 70B
|
||||||
|
mad_tag: pyt_vllm_llama-2-70b
|
||||||
|
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 3.1 8B FP8
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||||
|
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- model: Llama 3.1 70B FP8
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||||
|
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- model: Llama 3.1 405B FP8
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||||
|
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- group: Mistral AI
|
||||||
|
tag: mistral
|
||||||
|
models:
|
||||||
|
- model: Mixtral MoE 8x7B
|
||||||
|
mad_tag: pyt_vllm_mixtral-8x7b
|
||||||
|
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||||
|
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||||
|
precision: float16
|
||||||
|
- model: Mixtral MoE 8x22B
|
||||||
|
mad_tag: pyt_vllm_mixtral-8x22b
|
||||||
|
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||||
|
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||||
|
precision: float16
|
||||||
|
- model: Mistral 7B
|
||||||
|
mad_tag: pyt_vllm_mistral-7b
|
||||||
|
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||||
|
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||||
|
precision: float16
|
||||||
|
- model: Mixtral MoE 8x7B FP8
|
||||||
|
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||||
|
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- model: Mixtral MoE 8x22B FP8
|
||||||
|
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||||
|
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- model: Mistral 7B FP8
|
||||||
|
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||||
|
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- group: Qwen
|
||||||
|
tag: qwen
|
||||||
|
models:
|
||||||
|
- model: Qwen2 7B
|
||||||
|
mad_tag: pyt_vllm_qwen2-7b
|
||||||
|
model_repo: Qwen/Qwen2-7B-Instruct
|
||||||
|
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||||
|
precision: float16
|
||||||
|
- model: Qwen2 72B
|
||||||
|
mad_tag: pyt_vllm_qwen2-72b
|
||||||
|
model_repo: Qwen/Qwen2-72B-Instruct
|
||||||
|
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||||
|
precision: float16
|
||||||
|
- model: QwQ-32B
|
||||||
|
mad_tag: pyt_vllm_qwq-32b
|
||||||
|
model_repo: Qwen/QwQ-32B
|
||||||
|
url: https://huggingface.co/Qwen/QwQ-32B
|
||||||
|
precision: float16
|
||||||
|
tunableop: true
|
||||||
|
- group: Databricks DBRX
|
||||||
|
tag: dbrx
|
||||||
|
models:
|
||||||
|
- model: DBRX Instruct
|
||||||
|
mad_tag: pyt_vllm_dbrx-instruct
|
||||||
|
model_repo: databricks/dbrx-instruct
|
||||||
|
url: https://huggingface.co/databricks/dbrx-instruct
|
||||||
|
precision: float16
|
||||||
|
- model: DBRX Instruct FP8
|
||||||
|
mad_tag: pyt_vllm_dbrx_fp8
|
||||||
|
model_repo: amd/dbrx-instruct-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- group: Google Gemma
|
||||||
|
tag: gemma
|
||||||
|
models:
|
||||||
|
- model: Gemma 2 27B
|
||||||
|
mad_tag: pyt_vllm_gemma-2-27b
|
||||||
|
model_repo: google/gemma-2-27b
|
||||||
|
url: https://huggingface.co/google/gemma-2-27b
|
||||||
|
precision: float16
|
||||||
|
- group: Cohere
|
||||||
|
tag: cohere
|
||||||
|
models:
|
||||||
|
- model: C4AI Command R+ 08-2024
|
||||||
|
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||||
|
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||||
|
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||||
|
precision: float16
|
||||||
|
- model: C4AI Command R+ 08-2024 FP8
|
||||||
|
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||||
|
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- group: DeepSeek
|
||||||
|
tag: deepseek
|
||||||
|
models:
|
||||||
|
- model: DeepSeek MoE 16B
|
||||||
|
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||||
|
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||||
|
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||||
|
precision: float16
|
||||||
|
- group: Microsoft Phi
|
||||||
|
tag: phi
|
||||||
|
models:
|
||||||
|
- model: Phi-4
|
||||||
|
mad_tag: pyt_vllm_phi-4
|
||||||
|
model_repo: microsoft/phi-4
|
||||||
|
url: https://huggingface.co/microsoft/phi-4
|
||||||
|
- group: TII Falcon
|
||||||
|
tag: falcon
|
||||||
|
models:
|
||||||
|
- model: Falcon 180B
|
||||||
|
mad_tag: pyt_vllm_falcon-180b
|
||||||
|
model_repo: tiiuae/falcon-180B
|
||||||
|
url: https://huggingface.co/tiiuae/falcon-180B
|
||||||
|
precision: float16
|
||||||
@@ -0,0 +1,163 @@
|
|||||||
|
vllm_benchmark:
|
||||||
|
unified_docker:
|
||||||
|
latest:
|
||||||
|
# TODO: update me
|
||||||
|
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
|
||||||
|
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
|
||||||
|
rocm_version: 6.4.1
|
||||||
|
vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
|
||||||
|
pytorch_version: 2.7.0+gitf717b2a
|
||||||
|
hipblaslt_version: 0.15
|
||||||
|
model_groups:
|
||||||
|
- group: Meta Llama
|
||||||
|
tag: llama
|
||||||
|
models:
|
||||||
|
- model: Llama 3.1 8B
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-8b
|
||||||
|
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 3.1 70B
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-70b
|
||||||
|
model_repo: meta-llama/Llama-3.1-70B-Instruct
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 3.1 405B
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-405b
|
||||||
|
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 2 7B
|
||||||
|
mad_tag: pyt_vllm_llama-2-7b
|
||||||
|
model_repo: meta-llama/Llama-2-7b-chat-hf
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 2 70B
|
||||||
|
mad_tag: pyt_vllm_llama-2-70b
|
||||||
|
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||||
|
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||||
|
precision: float16
|
||||||
|
- model: Llama 3.1 8B FP8
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||||
|
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- model: Llama 3.1 70B FP8
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-70b_fp8
|
||||||
|
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- model: Llama 3.1 405B FP8
|
||||||
|
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||||
|
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- group: Mistral AI
|
||||||
|
tag: mistral
|
||||||
|
models:
|
||||||
|
- model: Mixtral MoE 8x7B
|
||||||
|
mad_tag: pyt_vllm_mixtral-8x7b
|
||||||
|
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||||
|
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||||
|
precision: float16
|
||||||
|
- model: Mixtral MoE 8x22B
|
||||||
|
mad_tag: pyt_vllm_mixtral-8x22b
|
||||||
|
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||||
|
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||||
|
precision: float16
|
||||||
|
- model: Mistral 7B
|
||||||
|
mad_tag: pyt_vllm_mistral-7b
|
||||||
|
model_repo: mistralai/Mistral-7B-Instruct-v0.3
|
||||||
|
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
|
||||||
|
precision: float16
|
||||||
|
- model: Mixtral MoE 8x7B FP8
|
||||||
|
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||||
|
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- model: Mixtral MoE 8x22B FP8
|
||||||
|
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||||
|
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- model: Mistral 7B FP8
|
||||||
|
mad_tag: pyt_vllm_mistral-7b_fp8
|
||||||
|
model_repo: amd/Mistral-7B-v0.1-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- group: Qwen
|
||||||
|
tag: qwen
|
||||||
|
models:
|
||||||
|
- model: Qwen2 7B
|
||||||
|
mad_tag: pyt_vllm_qwen2-7b
|
||||||
|
model_repo: Qwen/Qwen2-7B-Instruct
|
||||||
|
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
|
||||||
|
precision: float16
|
||||||
|
- model: Qwen2 72B
|
||||||
|
mad_tag: pyt_vllm_qwen2-72b
|
||||||
|
model_repo: Qwen/Qwen2-72B-Instruct
|
||||||
|
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
|
||||||
|
precision: float16
|
||||||
|
- model: QwQ-32B
|
||||||
|
mad_tag: pyt_vllm_qwq-32b
|
||||||
|
model_repo: Qwen/QwQ-32B
|
||||||
|
url: https://huggingface.co/Qwen/QwQ-32B
|
||||||
|
precision: float16
|
||||||
|
tunableop: true
|
||||||
|
- group: Databricks DBRX
|
||||||
|
tag: dbrx
|
||||||
|
models:
|
||||||
|
- model: DBRX Instruct
|
||||||
|
mad_tag: pyt_vllm_dbrx-instruct
|
||||||
|
model_repo: databricks/dbrx-instruct
|
||||||
|
url: https://huggingface.co/databricks/dbrx-instruct
|
||||||
|
precision: float16
|
||||||
|
- model: DBRX Instruct FP8
|
||||||
|
mad_tag: pyt_vllm_dbrx_fp8
|
||||||
|
model_repo: amd/dbrx-instruct-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- group: Google Gemma
|
||||||
|
tag: gemma
|
||||||
|
models:
|
||||||
|
- model: Gemma 2 27B
|
||||||
|
mad_tag: pyt_vllm_gemma-2-27b
|
||||||
|
model_repo: google/gemma-2-27b
|
||||||
|
url: https://huggingface.co/google/gemma-2-27b
|
||||||
|
precision: float16
|
||||||
|
- group: Cohere
|
||||||
|
tag: cohere
|
||||||
|
models:
|
||||||
|
- model: C4AI Command R+ 08-2024
|
||||||
|
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
|
||||||
|
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
|
||||||
|
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
|
||||||
|
precision: float16
|
||||||
|
- model: C4AI Command R+ 08-2024 FP8
|
||||||
|
mad_tag: pyt_vllm_command-r-plus_fp8
|
||||||
|
model_repo: amd/c4ai-command-r-plus-FP8-KV
|
||||||
|
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
|
||||||
|
precision: float8
|
||||||
|
- group: DeepSeek
|
||||||
|
tag: deepseek
|
||||||
|
models:
|
||||||
|
- model: DeepSeek MoE 16B
|
||||||
|
mad_tag: pyt_vllm_deepseek-moe-16b-chat
|
||||||
|
model_repo: deepseek-ai/deepseek-moe-16b-chat
|
||||||
|
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
|
||||||
|
precision: float16
|
||||||
|
- group: Microsoft Phi
|
||||||
|
tag: phi
|
||||||
|
models:
|
||||||
|
- model: Phi-4
|
||||||
|
mad_tag: pyt_vllm_phi-4
|
||||||
|
model_repo: microsoft/phi-4
|
||||||
|
url: https://huggingface.co/microsoft/phi-4
|
||||||
|
- group: TII Falcon
|
||||||
|
tag: falcon
|
||||||
|
models:
|
||||||
|
- model: Falcon 180B
|
||||||
|
mad_tag: pyt_vllm_falcon-180b
|
||||||
|
model_repo: tiiuae/falcon-180B
|
||||||
|
url: https://huggingface.co/tiiuae/falcon-180B
|
||||||
|
precision: float16
|
||||||
@@ -31,3 +31,11 @@ pytorch_inference_benchmark:
        model_repo: genmo/mochi-1-preview
        url: https://huggingface.co/genmo/mochi-1-preview
        precision: float16
    - group: Wan2.1
      tag: wan
      models:
        - model: Wan2.1
          mad_tag: pyt_wan2.1_inference
          model_repo: Wan-AI/Wan2.1-T2V-14B
          url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
          precision: bfloat16
@@ -1,10 +1,11 @@
vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
      # TODO: update me
      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
      rocm_version: 6.4.1
      vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
      pytorch_version: 2.7.0+gitf717b2a
      hipblaslt_version: 0.15
      model_groups:
@@ -1,29 +1,60 @@
megatron-lm_benchmark:
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
dockers:
  - pull_tag: rocm/megatron-lm:v25.6_py312
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
    components:
      ROCm: 6.4.1
      PyTorch: 2.8.0a0+git7d205b2
      Python: 3.12
      Transformer Engine: 2.1.0.dev0+8c4a512
      hipBLASLt: 393e413
      Triton: 3.3.0
      RCCL: 2.23.4.7a84c5d
    doc_name: Ubuntu 24.04 + Python 3.12
  - pull_tag: rocm/megatron-lm:v25.6_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
    components:
      ROCm: 6.4.1
      PyTorch: 2.8.0a0+git7d205b2
      Python: "3.10"
      Transformer Engine: 2.1.0.dev0+8c4a512
      hipBLASLt: 393e413
      Triton: 3.3.0
      RCCL: 2.23.4.7a84c5d
    doc_name: Ubuntu 22.04 + Python 3.10
model_groups:
  - group: Meta Llama
    tag: llama
    models:
      - model: Llama 3.3 70B
        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
      - model: Llama 3.1 8B
        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
      - model: Llama 3.1 70B
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
      - model: Llama 3.1 70B (proxy)
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
      - model: Llama 2 7B
        mad_tag: pyt_megatron_lm_train_llama-2-7b
      - model: Llama 2 70B
        mad_tag: pyt_megatron_lm_train_llama-2-70b
  - group: DeepSeek
    tag: deepseek
    models:
      - model: DeepSeek-V3
      - model: DeepSeek-V3 (proxy)
        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
      - model: DeepSeek-V2-Lite
        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
  - group: Mistral AI
    tag: mistral
    models:
      - model: Mixtral 8x7B
        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
      - model: Mixtral 8x22B
      - model: Mixtral 8x22B (proxy)
        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
  - group: Qwen
    tag: qwen
    models:
      - model: Qwen 2.5 7B
        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
      - model: Qwen 2.5 72B
        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
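How these benchmark data files are consumed is not shown in this change. As an illustration only, a documentation build step could read the structure above with PyYAML roughly as follows; the file name and field access are assumptions based on the keys shown, not part of this commit.

.. code-block:: python

   # Illustrative only: read the benchmark data file above and print the
   # Docker images and model tags it describes. PyYAML is assumed; the real
   # documentation templating pipeline is not part of this change.
   import yaml

   with open("megatron-lm.yaml") as f:   # hypothetical local copy of the data file
       data = yaml.safe_load(f)

   for docker in data["dockers"]:
       print(docker["pull_tag"], "-", docker["doc_name"])
       for component, version in docker["components"].items():
           print(f"  {component}: {version}")

   for group in data["model_groups"]:
       for model in group["models"]:
           print(f'{group["group"]}: {model["model"]} -> {model["mad_tag"]}')
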
@@ -0,0 +1,29 @@
megatron-lm_benchmark:
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
        - model: Llama 3.3 70B
          mad_tag: pyt_megatron_lm_train_llama-3.3-70b
        - model: Llama 3.1 8B
          mad_tag: pyt_megatron_lm_train_llama-3.1-8b
        - model: Llama 3.1 70B
          mad_tag: pyt_megatron_lm_train_llama-3.1-70b
        - model: Llama 2 7B
          mad_tag: pyt_megatron_lm_train_llama-2-7b
        - model: Llama 2 70B
          mad_tag: pyt_megatron_lm_train_llama-2-70b
    - group: DeepSeek
      tag: deepseek
      models:
        - model: DeepSeek-V3
          mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
        - model: DeepSeek-V2-Lite
          mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
    - group: Mistral AI
      tag: mistral
      models:
        - model: Mixtral 8x7B
          mad_tag: pyt_megatron_lm_train_mixtral-8x7b
        - model: Mixtral 8x22B
          mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
@@ -7,21 +7,21 @@ AMD Instinct MI300X performance guides
**************************************

The following performance guides provide essential guidance on the necessary
steps to properly :doc:`configure your system for AMD Instinct™ MI300X
accelerators <../system-optimization/mi300x>`. They include detailed
instructions on system settings and application :doc:`workload tuning
<../rocm-for-ai/inference-optimization/workload>` to help you
leverage the maximum capabilities of these accelerators and achieve superior
performance.
steps to properly `configure your system for AMD Instinct™ MI300X accelerators
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
They include detailed instructions on system settings and application
:doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
help you leverage the maximum capabilities of these accelerators and achieve
superior performance.

* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
  covers essential system settings and system management practices to configure
  your AMD Instinct MI300X system for performance.

* :doc:`../rocm-for-ai/inference-optimization/workload` covers steps to
* :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
  optimize the performance of AMD Instinct MI300X series accelerators for HPC
  and deep learning operations.

* :doc:`../rocm-for-ai/inference/vllm-benchmark` introduces a preconfigured
* :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
  environment for LLM inference, designed to help you test performance with
  popular models on AMD Instinct MI300X series accelerators.

@@ -24,5 +24,3 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
- :doc:`Fine-tuning and inference <fine-tuning-and-inference>` using a
  :doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` or
  :doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` system.

@@ -6,7 +6,7 @@
Use ROCm for AI
**************************

ROCm™ is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.
ROCm is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.

You can use ROCm to perform distributed training, which enables you to train models across multiple GPUs or nodes simultaneously. Additionally, ROCm supports mixed-precision training, which can help reduce the memory and compute requirements of training workloads. For fine-tuning, ROCm provides access to various algorithms and optimization techniques. In terms of inference, ROCm provides several techniques that can help you optimize your models for deployment, such as quantization, GEMM tuning, and optimization with composable kernel.

@@ -151,8 +151,8 @@ desired effect. Continuous iteration helps refine the performance gains and
address any new bottlenecks that may emerge.

ROCm provides a prebuilt optimized Docker image that has everything required to implement
the tips in this section. It includes ROCm, vLLM, PyTorch, and tuning files in the CSV
format. For more information, see :doc:`../inference/vllm-benchmark`.
the LLM inference tips in this section. It includes ROCm, PyTorch, and vLLM.
For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.

.. _mi300x-profiling-tools:

@@ -343,9 +343,10 @@ The following performance tips are not *specific* to vLLM -- they are general
but relevant in this context. You can tune the following vLLM parameters to
achieve optimal request latency and throughput performance.

* As described in :ref:`mi300x-env-vars`, the environment
  variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM performance. Set it to
  ``export HIP_FORCE_DEV_KERNARG=1``.
* As described in `Environment variables (MI300X)
  <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
  the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
  performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.

* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
  to ``112`` to increase the number of channels on MI300X to potentially improve
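These variables are normally exported in the shell, as shown above. If you launch vLLM from a Python entry point instead, an equivalent (illustrative) approach is to set them in the process environment before the GPU runtime and communicator libraries initialize; the model name below is only an example.

.. code-block:: python

   # Illustrative sketch: set the two environment variables discussed above
   # before importing anything that initializes the HIP runtime or RCCL.
   import os

   os.environ["HIP_FORCE_DEV_KERNARG"] = "1"
   os.environ["NCCL_MIN_NCHANNELS"] = "112"

   from vllm import LLM, SamplingParams  # imported only after the variables are set

   llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # example model
   print(llm.generate(["Hello"], SamplingParams(max_tokens=8)))
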
@@ -410,9 +411,9 @@ for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
usage with ROCm.

ROCm provides a prebuilt optimized Docker image for validating the performance
of LLM inference with vLLM on the MI300X accelerator. The Docker image includes
ROCm, vLLM, PyTorch, and tuning files in the CSV format. For more information,
see :doc:`../inference/vllm-benchmark`.
of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
ROCm, vLLM, and PyTorch. For more information, see
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.

.. _mi300x-vllm-throughput-measurement:

@@ -1477,8 +1478,9 @@ following command: ``cat /proc/sys/kernel/numa_balancing`` and
checking whether the output is ``0``.

If the output is ``1``, you can disable NUMA auto-balancing by running the
following command: ``sudo sysctl kernel.numa_balancing=0``. For more
details, see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
following command: ``sudo sysctl kernel.numa_balancing=0``. For more details,
see `AMD Instinct MI300X system optimization
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_.

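The manual check above can also be scripted. The following minimal sketch only reports the current state; disabling the setting still requires the ``sysctl`` command shown above.

.. code-block:: python

   # Minimal sketch: report whether NUMA auto-balancing is enabled, mirroring
   # the manual check described above. This does not change the setting.
   from pathlib import Path

   state = Path("/proc/sys/kernel/numa_balancing").read_text().strip()
   if state == "1":
       print("NUMA auto-balancing is enabled; disable it with: sudo sysctl kernel.numa_balancing=0")
   else:
       print("NUMA auto-balancing is disabled.")
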
.. _mi300x-rccl-disable-acs:

@@ -59,7 +59,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.

 To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
 might hang until the periodic balancing is finalized. For more information,
-see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
+see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.

 .. code-block:: shell

@@ -80,11 +80,11 @@ MI300X accelerator with the prebuilt vLLM Docker image.
 Once setup is complete, you can choose between two options to reproduce the
 benchmark results:

-- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
+- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad-v043>`

-- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
+- :ref:`Standalone benchmarking <vllm-benchmark-standalone-v043>`

-.. _vllm-benchmark-mad:
+.. _vllm-benchmark-mad-v043:

 MAD-integrated benchmarking
 ===========================
@@ -112,7 +112,7 @@ model are collected in the following path: ``~/MAD/reports_float16/``

 Although the following eight models are pre-configured to collect latency and
 throughput performance data, users can also change the benchmarking parameters.
-Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
+Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone-v043>` section.

 Available models
 ----------------
@@ -136,7 +136,7 @@ Available models

 * ``pyt_vllm_jais-30b``

-.. _vllm-benchmark-standalone:
+.. _vllm-benchmark-standalone-v043:

 Standalone benchmarking
 =======================
@@ -167,14 +167,14 @@ Command
 ^^^^^^^^^^^^^^^^^^^^^^^^^

 To start the benchmark, use the following command with the appropriate options.
-See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+See :ref:`Options <vllm-benchmark-standalone-options-v043>` for the list of
 options and their descriptions.

 .. code-block:: shell

 ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype

-See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
+See the :ref:`examples <vllm-benchmark-run-benchmark-v043>` for more information.

 .. note::

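As an illustration of the command shown in this hunk, a hypothetical latency run on a single GPU might look like the following; the model repository, GPU count, and data type are placeholder values rather than recommendations from the guide:

.. code-block:: shell

   # hypothetical example: latency test for Llama 2 7B on one GPU with float16
   ./vllm_benchmark_report.sh -s latency -m meta-llama/Llama-2-7b-chat-hf -g 1 -d float16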
@@ -193,7 +193,7 @@ See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
 # pass your HF_TOKEN
 export HF_TOKEN=$your_personal_hf_token

-.. _vllm-benchmark-standalone-options:
+.. _vllm-benchmark-standalone-options-v043:

 Options
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -265,13 +265,13 @@ Options
 - ``float16``
 - Data type

-.. _vllm-benchmark-run-benchmark:
+.. _vllm-benchmark-run-benchmark-v043:

 Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
-See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+See :ref:`Options <vllm-benchmark-standalone-options-v043>` for the list of
 options and their descriptions.

 Latency benchmark example
@@ -322,22 +322,22 @@ Further reading
 ===============

 - For application performance optimization strategies for HPC and AI workloads,
-including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
+including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

 - To learn more about the options for latency and throughput benchmark scripts,
 see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
+MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

-- To learn how to run LLM models from Hugging Face or your own model, see
-:doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

-- To learn how to optimize inference on LLMs, see
-:doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
+- To learn how to fine-tune LLMs and optimize inference, see
+:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

-- For a list of other ready-made Docker images for ROCm, see the
-:doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
+- For a list of other ready-made Docker images for AI with ROCm, see
+`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================
@@ -82,7 +82,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.

 To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
 might hang until the periodic balancing is finalized. For more information,
-see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
+see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.

 .. code-block:: shell

@@ -103,11 +103,11 @@ MI300X accelerator with the prebuilt vLLM Docker image.
 Once setup is complete, you can choose between two options to reproduce the
 benchmark results:

-- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
+- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad-v064>`

-- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
+- :ref:`Standalone benchmarking <vllm-benchmark-standalone-v064>`

-.. _vllm-benchmark-mad:
+.. _vllm-benchmark-mad-v064:

 MAD-integrated benchmarking
 ===========================
@@ -135,7 +135,7 @@ model are collected in the following path: ``~/MAD/reports_float16/``.

 Although the following models are preconfigured to collect latency and
 throughput performance data, you can also change the benchmarking parameters.
-Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
+Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone-v064>` section.

 Available models
 ----------------
@@ -177,7 +177,7 @@ Available models

 * ``pyt_vllm_mixtral-8x22b_fp8``

-.. _vllm-benchmark-standalone:
+.. _vllm-benchmark-standalone-v064:

 Standalone benchmarking
 =======================
@@ -203,14 +203,14 @@ Command
 -------

 To start the benchmark, use the following command with the appropriate options.
-See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+See :ref:`Options <vllm-benchmark-standalone-v064-options>` for the list of
 options and their descriptions.

 .. code-block:: shell

 ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype

-See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
+See the :ref:`examples <vllm-benchmark-run-benchmark-v064>` for more information.

 .. note::

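Similarly, a hypothetical throughput run of the same script against one of the preconfigured models might look like this (again, the option values are placeholders, not recommendations):

.. code-block:: shell

   # hypothetical example: throughput test for Mixtral 8x7B on eight GPUs with float16
   ./vllm_benchmark_report.sh -s throughput -m mistralai/Mixtral-8x7B-Instruct-v0.1 -g 8 -d float16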
@@ -229,7 +229,7 @@ See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
 # pass your HF_TOKEN
 export HF_TOKEN=$your_personal_hf_token

-.. _vllm-benchmark-standalone-options:
+.. _vllm-benchmark-standalone-v064-options:

 Options
 -------
@@ -330,13 +330,13 @@ Options
 - ``float16`` or ``float8``
 - Data type

-.. _vllm-benchmark-run-benchmark:
+.. _vllm-benchmark-run-benchmark-v064:

 Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
-See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+See :ref:`Options <vllm-benchmark-standalone-v064-options>` for the list of
 options and their descriptions.

 Example 1: latency benchmark
@@ -392,25 +392,22 @@ Further reading
 ===============

 - For application performance optimization strategies for HPC and AI workloads,
-including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
+including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

 - To learn more about the options for latency and throughput benchmark scripts,
 see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
+MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

-- To learn how to run LLM models from Hugging Face or your own model, see
-:doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

-- To learn how to optimize inference on LLMs, see
-:doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
+- To learn how to fine-tune LLMs and optimize inference, see
+:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

-- For a list of other ready-made Docker images for ROCm, see the
-:doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
+- For a list of other ready-made Docker images for AI with ROCm, see
+`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

-- To compare with the previous version of the ROCm vLLM Docker image for performance validation, refer to
-`LLM inference performance validation on AMD Instinct MI300X (ROCm 6.2.0) <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_.
-
 Previous versions
 =================
@@ -31,8 +31,8 @@ accelerator and includes the following components:
 With this Docker image, you can quickly validate the expected inference
 performance numbers for the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models. For more information, see the lists of
-:ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-models>`
-and :ref:`standalone benchmarking <vllm-benchmark-standalone-options>`.
+:ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-v066-models>`
+and :ref:`standalone benchmarking <vllm-benchmark-standalone-v066-options>`.

 .. _vllm-benchmark-vllm:

@@ -55,7 +55,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.

 To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
 might hang until the periodic balancing is finalized. For more information,
-see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
+see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.

 .. code-block:: shell

@@ -76,11 +76,11 @@ MI300X accelerator with the prebuilt vLLM Docker image.
 Once the setup is complete, choose between two options to reproduce the
 benchmark results:

-- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
+- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad-v066>`

-- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
+- :ref:`Standalone benchmarking <vllm-benchmark-standalone-v066>`

-.. _vllm-benchmark-mad:
+.. _vllm-benchmark-mad-v066:

 MAD-integrated benchmarking
 ===========================
@@ -108,9 +108,9 @@ model are collected in the following path: ``~/MAD/reports_float16/``.

 Although the following models are preconfigured to collect latency and
 throughput performance data, you can also change the benchmarking parameters.
-Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
+Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone-v066>` section.

-.. _vllm-benchmark-mad-models:
+.. _vllm-benchmark-mad-v066-models:

 Available models
 ----------------
@@ -134,10 +134,10 @@ Available models
 * - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
 - ``pyt_vllm_llama-3.2-11b-vision-instruct``

-* - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
+* - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`__
 - ``pyt_vllm_llama-2-7b``

-* - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
+* - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`__
 - ``pyt_vllm_llama-2-70b``

 * - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
@@ -194,7 +194,7 @@ Available models
 * - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
 - ``pyt_vllm_command-r-plus_fp8``

-.. _vllm-benchmark-standalone:
+.. _vllm-benchmark-standalone-v066:

 Standalone benchmarking
 =======================
@@ -220,14 +220,14 @@ Command
 -------

 To start the benchmark, use the following command with the appropriate options.
-See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+See :ref:`Options <vllm-benchmark-standalone-v066-options>` for the list of
 options and their descriptions.

 .. code-block:: shell

 ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype

-See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
+See the :ref:`examples <vllm-benchmark-run-benchmark-v066>` for more information.

 .. note::

@@ -246,7 +246,7 @@ See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
 # pass your HF_TOKEN
 export HF_TOKEN=$your_personal_hf_token

-.. _vllm-benchmark-standalone-options:
+.. _vllm-benchmark-standalone-v066-options:

 Options and available models
 ----------------------------
@@ -289,11 +289,11 @@ Options and available models

 * -
 - ``meta-llama/Llama-2-7b-chat-hf``
-- `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
+- `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`__

 * -
 - ``meta-llama/Llama-2-70b-chat-hf``
-- `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
+- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`__

 * -
 - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
@@ -375,13 +375,13 @@ Options and available models
 - ``float16`` or ``float8``
 - Data type

-.. _vllm-benchmark-run-benchmark:
+.. _vllm-benchmark-run-benchmark-v066:

 Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
-See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+See :ref:`Options <vllm-benchmark-standalone-v066-options>` for the list of
 options and their descriptions.

 Example 1: latency benchmark
@@ -437,22 +437,22 @@ Further reading
 ===============

 - For application performance optimization strategies for HPC and AI workloads,
-including inference with vLLM, see :doc:`../inference-optimization/workload`.
+including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

 - To learn more about the options for latency and throughput benchmark scripts,
 see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-MI300X accelerators, see :doc:`../../system-optimization/mi300x`.
+MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

-- To learn how to run LLM models from Hugging Face or your own model, see
-:doc:`Running models from Hugging Face <hugging-face-models>`.
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

-- To learn how to optimize inference on LLMs, see
-:doc:`Inference optimization <../inference-optimization/index>`.
+- To learn how to fine-tune LLMs and optimize inference, see
+:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

-- To learn how to fine-tune LLMs, see
-:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
+- For a list of other ready-made Docker images for AI with ROCm, see
+`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================
@@ -36,10 +36,10 @@ vLLM inference performance testing
 * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-v073>` for
 MI300X series accelerators.

-.. _vllm-benchmark-available-models:
+.. _vllm-benchmark-available-models-v073:

 Available models
 ================
@@ -95,7 +95,7 @@ vLLM inference performance testing
 See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
 more information.

-.. _vllm-benchmark-performance-measurements:
+.. _vllm-benchmark-performance-measurements-v073:

 Performance measurements
 ========================
@@ -109,7 +109,7 @@ vLLM inference performance testing

 The performance data presented in
 `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`_.
+only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
 The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 Advanced features and known issues
@@ -130,7 +130,7 @@ vLLM inference performance testing

 To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
 might hang until the periodic balancing is finalized. For more information,
-see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
+see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.

 .. code-block:: shell

@@ -154,7 +154,7 @@ vLLM inference performance testing
 Once the setup is complete, choose between two options to reproduce the
 benchmark results:

-.. _vllm-benchmark-mad:
+.. _vllm-benchmark-mad-v073:

 {% for model_group in model_groups %}
 {% for model in model_group.models %}
@@ -175,7 +175,7 @@ vLLM inference performance testing
 pip install -r requirements.txt

 Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-using one GPU with the ``{{model.precision}}`` data type on the host machine.
+using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

 .. code-block:: shell

@@ -186,7 +186,7 @@ vLLM inference performance testing
 ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
 model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

-Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+Although the :ref:`available models <vllm-benchmark-available-models-v073>` are preconfigured
 to collect latency and throughput performance data, you can also change the benchmarking
 parameters. See the standalone benchmarking tab for more information.

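Once a MAD run finishes, the collected reports can be inspected from the host. This is a sketch only; the exact file names depend on the model and precision (``float16`` is used here as an example value for the templated report path named above):

.. code-block:: shell

   # list the latency and throughput reports produced by the run
   ls ~/MAD/reports_float16/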
@@ -264,7 +264,7 @@ vLLM inference performance testing

 * Latency benchmark

-Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
+Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the :literal:`{{model.precision}}` data type.

 .. code-block::

@@ -274,7 +274,7 @@ vLLM inference performance testing

 * Throughput benchmark

-Use this command to throughput the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
+Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with the :literal:`{{model.precision}}` data type.

 .. code-block:: shell

@@ -305,22 +305,22 @@ Further reading
 ===============

 - For application performance optimization strategies for HPC and AI workloads,
-including inference with vLLM, see :doc:`../inference-optimization/workload`.
+including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

 - To learn more about the options for latency and throughput benchmark scripts,
 see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

-- To learn how to run LLM models from Hugging Face or your own model, see
-:doc:`Running models from Hugging Face <hugging-face-models>`.
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

-- To learn how to optimize inference on LLMs, see
-:doc:`Inference optimization <../inference-optimization/index>`.
+- To learn how to fine-tune LLMs and optimize inference, see
+:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

-- To learn how to fine-tune LLMs, see
-:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
+- For a list of other ready-made Docker images for AI with ROCm, see
+`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================
@@ -1,3 +1,5 @@
+:orphan:
+
 .. meta::
 :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
 ROCm vLLM Docker image.
@@ -29,10 +31,10 @@ vLLM inference performance testing
 * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-v083>` for
 MI300X series accelerators.

-.. _vllm-benchmark-available-models:
+.. _vllm-benchmark-available-models-v083:

 Supported models
 ================
@@ -88,7 +90,7 @@ vLLM inference performance testing
 See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
 more information.

-.. _vllm-benchmark-performance-measurements:
+.. _vllm-benchmark-performance-measurements-v083:

 Performance measurements
 ========================
@@ -102,7 +104,7 @@ vLLM inference performance testing

 The performance data presented in
 `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`_.
+only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
 The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 Advanced features and known issues
@@ -170,7 +172,7 @@ vLLM inference performance testing
 pip install -r requirements.txt

 Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-using one GPU with the ``{{model.precision}}`` data type on the host machine.
+using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

 .. code-block:: shell

@@ -181,7 +183,7 @@ vLLM inference performance testing
 ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
 model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

-Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+Although the :ref:`available models <vllm-benchmark-available-models-v083>` are preconfigured
 to collect latency and throughput performance data, you can also change the benchmarking
 parameters. See the standalone benchmarking tab for more information.

@@ -278,7 +280,7 @@ vLLM inference performance testing

 * Latency benchmark

-Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

 .. code-block::

@@ -288,7 +290,7 @@ vLLM inference performance testing

 * Throughput benchmark

-Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

 .. code-block:: shell

@@ -319,22 +321,22 @@ Further reading
 ===============

 - For application performance optimization strategies for HPC and AI workloads,
-including inference with vLLM, see :doc:`../inference-optimization/workload`.
+including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

 - To learn more about the options for latency and throughput benchmark scripts,
 see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

-- To learn how to run LLM models from Hugging Face or your own model, see
-:doc:`Running models from Hugging Face <hugging-face-models>`.
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

-- To learn how to optimize inference on LLMs, see
-:doc:`Inference optimization <../inference-optimization/index>`.
+- To learn how to fine-tune LLMs and optimize inference, see
+:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

-- To learn how to fine-tune LLMs, see
-:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
+- For a list of other ready-made Docker images for AI with ROCm, see
+`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================
@@ -36,10 +36,10 @@ vLLM inference performance testing
 * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-v085-20250513>` for
 MI300X series accelerators.

-.. _vllm-benchmark-available-models:
+.. _vllm-benchmark-available-models-v085-20250513:

 Supported models
 ================
@@ -99,7 +99,7 @@ vLLM inference performance testing
 See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
 more information.

-.. _vllm-benchmark-performance-measurements:
+.. _vllm-benchmark-performance-measurements-v085-20250513:

 Performance measurements
 ========================
@@ -113,7 +113,7 @@ vLLM inference performance testing

 The performance data presented in
 `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`_.
+only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
 The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 Advanced features and known issues
@@ -181,7 +181,7 @@ vLLM inference performance testing
 pip install -r requirements.txt

 Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-using one GPU with the ``{{model.precision}}`` data type on the host machine.
+using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

 .. code-block:: shell

@@ -192,7 +192,7 @@ vLLM inference performance testing
 ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
 model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

-Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+Although the :ref:`available models <vllm-benchmark-available-models-v085-20250513>` are preconfigured
 to collect latency and throughput performance data, you can also change the benchmarking
 parameters. See the standalone benchmarking tab for more information.

@@ -289,7 +289,7 @@ vLLM inference performance testing

 * Latency benchmark

-Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

 .. code-block::

@@ -299,7 +299,7 @@ vLLM inference performance testing

 * Throughput benchmark

-Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

 .. code-block:: shell

@@ -333,19 +333,19 @@ Further reading
 see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

 - For application performance optimization strategies for HPC and AI workloads,
-including inference with vLLM, see :doc:`../../../inference-optimization/workload`.
+including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

-- To learn how to run LLM models from Hugging Face or your own model, see
-:doc:`Running models from Hugging Face <../../hugging-face-models>`.
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

-- To learn how to optimize inference on LLMs, see
-:doc:`Inference optimization <../../../inference-optimization/index>`.
+- To learn how to fine-tune LLMs and optimize inference, see
+:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

-- To learn how to fine-tune LLMs, see
-:doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.
+- For a list of other ready-made Docker images for AI with ROCm, see
+`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================
@@ -36,10 +36,10 @@ vLLM inference performance testing
 * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-v085-20250521>` for
 MI300X series accelerators.

-.. _vllm-benchmark-available-models:
+.. _vllm-benchmark-available-models-v085-20250521:

 Supported models
 ================
@@ -99,7 +99,7 @@ vLLM inference performance testing
 See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
 more information.

-.. _vllm-benchmark-performance-measurements:
+.. _vllm-benchmark-performance-measurements-v085-20250521:

 Performance measurements
 ========================
@@ -181,7 +181,7 @@ vLLM inference performance testing
 pip install -r requirements.txt

 Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-using one GPU with the ``{{model.precision}}`` data type on the host machine.
+using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

 .. code-block:: shell

@@ -192,7 +192,7 @@ vLLM inference performance testing
 ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
 model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

-Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+Although the :ref:`available models <vllm-benchmark-available-models-v085-20250521>` are preconfigured
 to collect latency and throughput performance data, you can also change the benchmarking
 parameters. See the standalone benchmarking tab for more information.

@@ -289,7 +289,7 @@ vLLM inference performance testing

 * Latency benchmark

-Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

 .. code-block::

@@ -299,7 +299,7 @@ vLLM inference performance testing

 * Throughput benchmark

-Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
+Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

 .. code-block:: shell

@@ -333,22 +333,23 @@ Further reading
 see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

 - For application performance optimization strategies for HPC and AI workloads,
-including inference with vLLM, see :doc:`../../inference-optimization/workload`.
+including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

-- To learn how to run LLM models from Hugging Face or your own model, see
-:doc:`Running models from Hugging Face <../hugging-face-models>`.
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.

-- To learn how to optimize inference on LLMs, see
-:doc:`Inference optimization <../../inference-optimization/index>`.
+- To learn how to fine-tune LLMs and optimize inference, see
+:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.

-- To learn how to fine-tune LLMs, see
-:doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
+- For a list of other ready-made Docker images for AI with ROCm, see
+`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================

 See :doc:`vllm-history` to find documentation for previous releases
 of the ``ROCm/vllm`` Docker image.
+
@@ -0,0 +1,353 @@
+:orphan:
+
+.. meta::
+:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+ROCm vLLM Docker image.
+:keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+This documentation does not reflect the latest version of ROCm vLLM
+inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.0.1_20250605-benchmark-models.yaml
+
+{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+{% set model_groups = data.vllm_benchmark.model_groups %}
+
+The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+a prebuilt, optimized environment for validating large language model (LLM)
+inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+accelerators and includes the following components:
+
+* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
+
+* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
+
+* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
+
+* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
+
+With this Docker image, you can quickly test the :ref:`expected
+inference performance numbers <vllm-benchmark-performance-measurements-v0901-20250605>` for
+MI300X series accelerators.
+
+.. _vllm-benchmark-available-models-v0901-20250605:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking
+with vLLM and ROCm. Some instructions, commands, and recommendations in this
+documentation might vary by model -- select one to get started.
+
+.. raw:: html
+
+<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+<div class="row">
+<div class="col-2 me-2 model-param-head">Model group</div>
+<div class="row col-10">
+{% for model_group in model_groups %}
+<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+{% endfor %}
+</div>
+</div>
+
+<div class="row mt-1">
+<div class="col-2 me-2 model-param-head">Model</div>
+<div class="row col-10">
+{% for model_group in model_groups %}
+{% set models = model_group.models %}
+{% for model in models %}
+{% if models|length % 3 == 0 %}
+<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+{% else %}
+<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+{% endif %}
+{% endfor %}
+{% endfor %}
+</div>
+</div>
+</div>
+
+.. _vllm-benchmark-vllm:
+
+{% for model_group in model_groups %}
+{% for model in model_group.models %}
+
+.. container:: model-doc {{model.mad_tag}}
+
+.. note::
+
+See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+Some models require access authorization prior to use via an external license agreement through a third party.
+
+{% endfor %}
+{% endfor %}
+
+.. note::
+
+vLLM is a toolkit and library for LLM inference and serving. AMD implements
+high-performance custom kernels and modules in vLLM to enhance performance.
+See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+more information.
+
+.. _vllm-benchmark-performance-measurements-v0901-20250605:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+page provides reference throughput and latency measurements for inferencing popular AI models.
+
+.. important::
+
+The performance data presented in
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+only reflects the latest version of this inference benchmarking environment.
+The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+Advanced features and known issues
+==================================
+
+For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+might hang until the periodic balancing is finalized. For more information,
+see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
+
+.. code-block:: shell
+
+# disable automatic NUMA balancing
+sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+# check if NUMA balancing is disabled (returns 0 if disabled)
+cat /proc/sys/kernel/numa_balancing
|
||||||
|
0
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
Pull the Docker image
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
|
benchmark results:
|
||||||
|
|
||||||
|
.. _vllm-benchmark-mad:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||||
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||||
|
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
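As a quick sketch (assuming the default report path above), you can list the collected reports from the host once the run finishes:

.. code-block:: shell

   # Illustrative only: list the latency and throughput reports gathered by MAD.
   ls ~/MAD/reports_{{model.precision}}/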
|
||||||
|
|
||||||
|
Although the :ref:`available models <vllm-benchmark-available-models-v0901-20250605>` are preconfigured
|
||||||
|
to collect latency and throughput performance data, you can also change the benchmarking
|
||||||
|
parameters. See the standalone benchmarking tab for more information.
|
||||||
|
|
||||||
|
{% if model.tunableop %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||||
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
||||||
|
(see
|
||||||
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
|
||||||
|
enable it, edit the default run behavior in the ``models.json``
|
||||||
|
configuration before running inference -- update the model's run
|
||||||
|
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
|
||||||
|
|
||||||
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
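A minimal sketch of that edit, assuming the model's run ``args`` are stored as a plain string in ``models.json``:

.. code-block:: shell

   # Illustrative only: flip the TunableOp flag in the model's run args.
   sed -i 's/--tunableop off/--tunableop on/' models.json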
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
Run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||||
|
as shown in the following snippet.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/vllm
|
||||||
|
|
||||||
|
To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``$test_option``
|
||||||
|
- latency
|
||||||
|
- Measure decoding token latency
|
||||||
|
|
||||||
|
* -
|
||||||
|
- throughput
|
||||||
|
- Measure token generation throughput
|
||||||
|
|
||||||
|
* -
|
||||||
|
- all
|
||||||
|
- Measure both throughput and latency
|
||||||
|
|
||||||
|
* - ``$num_gpu``
|
||||||
|
- 1 or 8
|
||||||
|
- Number of GPUs
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``float16`` or ``float8``
|
||||||
|
- Data type
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
|
Face token to the gated models.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
OSError: You are trying to access a gated repo.
|
||||||
|
|
||||||
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
|
||||||
|
|
||||||
|
Here are some examples of running the benchmark with various options.
|
||||||
|
|
||||||
|
* Latency benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
||||||
|
|
||||||
|
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||||
|
|
||||||
|
* Throughput benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
||||||
|
|
||||||
|
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
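* Combined latency and throughput benchmark

  Use this command to run both measurements in a single pass (a sketch based on the ``all`` option from the table above).

  .. code-block:: shell

     ./vllm_benchmark_report.sh -s all -m {{model.model_repo}} -g 8 -d {{model.precision}}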
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<style>
|
||||||
|
mjx-container[jax="CHTML"][display="true"] {
|
||||||
|
text-align: left;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Throughput is calculated as:
|
||||||
|
|
||||||
|
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||||
|
|
||||||
|
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
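As an illustrative example (numbers chosen only for clarity), 1000 requests with an input length of 128 tokens, an output length of 128 tokens, and an elapsed time of 100 seconds would give :math:`throughput\_tot = 1000 \times (128 + 128) / 100 = 2560` tokens/s and :math:`throughput\_gen = 1000 \times 128 / 100 = 1280` tokens/s.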
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about the options for latency and throughput benchmark scripts,
|
||||||
|
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
|
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||||
|
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||||
|
|
||||||
|
- To learn how to fine-tune LLMs and optimize inference, see
|
||||||
|
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`vllm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/vllm`` Docker image.
|
||||||
@@ -0,0 +1,353 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
|
||||||
|
ROCm vLLM Docker image.
|
||||||
|
:keywords: model, MAD, automation, dashboarding, validate
|
||||||
|
|
||||||
|
**********************************
|
||||||
|
vLLM inference performance testing
|
||||||
|
**********************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of ROCm vLLM
|
||||||
|
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-unified-docker:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||||
|
a prebuilt, optimized environment for validating large language model (LLM)
|
||||||
|
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||||
|
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||||
|
accelerators and includes the following components:
|
||||||
|
|
||||||
|
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
|
||||||
|
|
||||||
|
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
|
||||||
|
|
||||||
|
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
|
||||||
|
|
||||||
|
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
|
||||||
|
|
||||||
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
|
inference performance numbers <vllm-benchmark-performance-measurements-20250702>` for
|
||||||
|
MI300X series accelerators.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-available-models-20250702:
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
The following models are supported for inference performance benchmarking
|
||||||
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
|
documentation might vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. _vllm-benchmark-vllm:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||||
|
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||||
|
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||||
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
|
more information.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-performance-measurements-20250702:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
page provides reference throughput and latency measurements for inferencing popular AI models.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
only reflects the latest version of this inference benchmarking environment.
|
||||||
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
Advanced features and known issues
|
||||||
|
==================================
|
||||||
|
|
||||||
|
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||||
|
see the developer's guide at `<https://github.com/ROCm/vllm/tree/5486e7bc8523be0324ccd68f221959445b56cc2a/docs/dev-docker>`__.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
||||||
|
might hang until the periodic balancing is finalized. For more information,
|
||||||
|
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
# disable automatic NUMA balancing
|
||||||
|
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||||
|
# check if NUMA balancing is disabled (returns 0 if disabled)
|
||||||
|
cat /proc/sys/kernel/numa_balancing
|
||||||
|
0
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
Pull the Docker image
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
|
benchmark results:
|
||||||
|
|
||||||
|
.. _vllm-benchmark-mad:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||||
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||||
|
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
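As a quick sketch (assuming the default report path above), you can list the collected reports from the host once the run finishes:

.. code-block:: shell

   # Illustrative only: list the latency and throughput reports gathered by MAD.
   ls ~/MAD/reports_{{model.precision}}/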
|
||||||
|
|
||||||
|
Although the :ref:`available models <vllm-benchmark-available-models-20250702>` are preconfigured
|
||||||
|
to collect latency and throughput performance data, you can also change the benchmarking
|
||||||
|
parameters. See the standalone benchmarking tab for more information.
|
||||||
|
|
||||||
|
{% if model.tunableop %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||||
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
||||||
|
(see
|
||||||
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
|
||||||
|
enable it, edit the default run behavior in the ``models.json``
|
||||||
|
configuration before running inference -- update the model's run
|
||||||
|
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
|
||||||
|
|
||||||
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
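A minimal sketch of that edit, assuming the model's run ``args`` are stored as a plain string in ``models.json``:

.. code-block:: shell

   # Illustrative only: flip the TunableOp flag in the model's run args.
   sed -i 's/--tunableop off/--tunableop on/' models.json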
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
Run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||||
|
as shown in the following snippet.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/vllm
|
||||||
|
|
||||||
|
To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``$test_option``
|
||||||
|
- latency
|
||||||
|
- Measure decoding token latency
|
||||||
|
|
||||||
|
* -
|
||||||
|
- throughput
|
||||||
|
- Measure token generation throughput
|
||||||
|
|
||||||
|
* -
|
||||||
|
- all
|
||||||
|
- Measure both throughput and latency
|
||||||
|
|
||||||
|
* - ``$num_gpu``
|
||||||
|
- 1 or 8
|
||||||
|
- Number of GPUs
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``float16`` or ``float8``
|
||||||
|
- Data type
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
|
Face token to the gated models.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
OSError: You are trying to access a gated repo.
|
||||||
|
|
||||||
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
|
||||||
|
|
||||||
|
Here are some examples of running the benchmark with various options.
|
||||||
|
|
||||||
|
* Latency benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
||||||
|
|
||||||
|
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||||
|
|
||||||
|
* Throughput benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
||||||
|
|
||||||
|
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
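* Combined latency and throughput benchmark

  Use this command to run both measurements in a single pass (a sketch based on the ``all`` option from the table above).

  .. code-block:: shell

     ./vllm_benchmark_report.sh -s all -m {{model.model_repo}} -g 8 -d {{model.precision}}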
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<style>
|
||||||
|
mjx-container[jax="CHTML"][display="true"] {
|
||||||
|
text-align: left;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Throughput is calculated as:
|
||||||
|
|
||||||
|
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||||
|
|
||||||
|
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
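As an illustrative example (numbers chosen only for clarity), 1000 requests with an input length of 128 tokens, an output length of 128 tokens, and an elapsed time of 100 seconds would give :math:`throughput\_tot = 1000 \times (128 + 128) / 100 = 2560` tokens/s and :math:`throughput\_gen = 1000 \times 128 / 100 = 1280` tokens/s.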
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about the options for latency and throughput benchmark scripts,
|
||||||
|
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
|
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||||
|
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||||
|
|
||||||
|
- To learn how to fine-tune LLMs and optimize inference, see
|
||||||
|
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`vllm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/vllm`` Docker image.
|
||||||
@@ -7,69 +7,103 @@ vLLM inference performance testing version history
|
|||||||
This table lists previous versions of the ROCm vLLM inference Docker image for
|
This table lists previous versions of the ROCm vLLM inference Docker image for
|
||||||
inference performance testing. For detailed information about available models
|
inference performance testing. For detailed information about available models
|
||||||
for benchmarking, see the version-specific documentation. You can find tagged
|
for benchmarking, see the version-specific documentation. You can find tagged
|
||||||
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`_.
|
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
:stub-columns: 1
|
|
||||||
|
|
||||||
* - ROCm version
|
* - Docker image tag
|
||||||
- vLLM version
|
- Components
|
||||||
- PyTorch version
|
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - 6.4.0
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
||||||
- 0.9.0.1
|
(latest)
|
||||||
- 2.7.0
|
-
|
||||||
|
* ROCm 6.4.1
|
||||||
|
* vLLM 0.9.1
|
||||||
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../vllm>`
|
* :doc:`Documentation <../vllm>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
||||||
|
|
||||||
* - 6.3.1
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
||||||
- 0.8.5 (0.8.6.dev)
|
-
|
||||||
- 2.7.0
|
* ROCm 6.4.1
|
||||||
|
* vLLM 0.9.1
|
||||||
|
* PyTorch 2.7.0
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <vllm-0.9.1-20250702>`
|
||||||
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab>`__
|
||||||
|
|
||||||
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605``
|
||||||
|
-
|
||||||
|
* ROCm 6.4.1
|
||||||
|
* vLLM 0.9.0.1
|
||||||
|
* PyTorch 2.7.0
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <vllm-0.9.0.1-20250605>`
|
||||||
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c>`__
|
||||||
|
|
||||||
|
* - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521``
|
||||||
|
-
|
||||||
|
* ROCm 6.3.1
|
||||||
|
* vLLM 0.8.5 (0.8.6.dev)
|
||||||
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <vllm-0.8.5-20250521>`
|
* :doc:`Documentation <vllm-0.8.5-20250521>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`__
|
||||||
|
|
||||||
* - 6.3.1
|
* - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513``
|
||||||
- 0.8.5
|
-
|
||||||
- 2.7.0
|
* ROCm 6.3.1
|
||||||
|
* vLLM 0.8.5
|
||||||
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <vllm-0.8.5-20250513>`
|
* :doc:`Documentation <vllm-0.8.5-20250513>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`__
|
||||||
|
|
||||||
* - 6.3.1
|
* - ``rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415``
|
||||||
- 0.8.3
|
-
|
||||||
- 2.7.0
|
* ROCm 6.3.1
|
||||||
|
* vLLM 0.8.3
|
||||||
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <vllm-0.8.3-20250415>`
|
* :doc:`Documentation <vllm-0.8.3-20250415>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`__
|
||||||
|
|
||||||
* - 6.3.1
|
* - ``rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325``
|
||||||
- 0.7.3
|
-
|
||||||
- 2.7.0
|
* ROCm 6.3.1
|
||||||
|
* vLLM 0.7.3
|
||||||
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <vllm-0.7.3-20250325>`
|
* :doc:`Documentation <vllm-0.7.3-20250325>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`__
|
||||||
|
|
||||||
* - 6.3.1
|
* - ``rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6``
|
||||||
- 0.6.6
|
-
|
||||||
- 2.7.0
|
* ROCm 6.3.1
|
||||||
|
* vLLM 0.6.6
|
||||||
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <vllm-0.6.6>`
|
* :doc:`Documentation <vllm-0.6.6>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`__
|
||||||
|
|
||||||
* - 6.2.1
|
* - ``rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4``
|
||||||
- 0.6.4
|
-
|
||||||
- 2.5.0
|
* ROCm 6.2.1
|
||||||
|
* vLLM 0.6.4
|
||||||
|
* PyTorch 2.5.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <vllm-0.6.4>`
|
* :doc:`Documentation <vllm-0.6.4>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`__
|
||||||
|
|
||||||
* - 6.2.0
|
* - ``rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50``
|
||||||
- 0.4.3
|
-
|
||||||
- 2.4.0
|
* ROCm 6.2.0
|
||||||
|
* vLLM 0.4.3
|
||||||
|
* PyTorch 2.4.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <vllm-0.4.3>`
|
* :doc:`Documentation <vllm-0.4.3>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`__
|
||||||
|
|
||||||
|
|||||||
@@ -32,10 +32,10 @@ PyTorch inference performance testing
|
|||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col-2 me-2 model-param-head">Model group</div>
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -93,7 +93,7 @@ PyTorch inference performance testing
|
|||||||
|
|
||||||
.. container:: model-doc pyt_chai1_inference
|
.. container:: model-doc pyt_chai1_inference
|
||||||
|
|
||||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`_ from Docker Hub.
|
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue/images/sha256-b736a4239ab38a9d0e448af6d4adca83b117debed00bfbe33846f99c4540f79b>`__ from Docker Hub.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
@@ -103,9 +103,9 @@ PyTorch inference performance testing
|
|||||||
|
|
||||||
The Chai-1 benchmark uses a specifically selected Docker image with ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
The Chai-1 benchmark uses a specifically selected Docker image with ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||||
|
|
||||||
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference
|
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference
|
||||||
|
|
||||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
|
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
|
|||||||
@@ -20,23 +20,55 @@ vLLM inference performance testing
|
|||||||
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||||
accelerators and includes the following components:
|
accelerators and includes the following components:
|
||||||
|
|
||||||
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
|
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||||
|
- {{ unified_docker.rocm_version }}
|
||||||
|
|
||||||
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
|
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
||||||
|
- {{ unified_docker.vllm_version }}
|
||||||
|
|
||||||
With this Docker image, you can quickly test the :ref:`expected
|
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
||||||
inference performance numbers <vllm-benchmark-performance-measurements>` for
|
- {{ unified_docker.pytorch_version }}
|
||||||
MI300X series accelerators.
|
|
||||||
|
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||||
|
- {{ unified_docker.hipblaslt_version }}
|
||||||
|
|
||||||
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
|
inference performance numbers <vllm-benchmark-performance-measurements>` for
|
||||||
|
MI300X series accelerators.
|
||||||
|
|
||||||
|
What's new
|
||||||
|
==========
|
||||||
|
|
||||||
|
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||||
|
|
||||||
|
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Resolved the Llama 3.1 405B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Fixed a ``+rms_norm`` custom kernel issue.
|
||||||
|
|
||||||
|
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable it; the supported modes are ``FP``, ``INT8``, ``INT6``, and ``INT4``. See the sketch after this list.
|
||||||
|
|
||||||
|
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
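As a minimal sketch (assuming you set the variable in the shell used to launch vLLM or the benchmark script inside the container), quick reduce can be enabled like this:

.. code-block:: shell

   # Illustrative only: enable quick reduce with FP quantization for this session.
   # INT8, INT6, and INT4 are the other supported modes.
   export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP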
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
.. _vllm-benchmark-available-models:
|
.. _vllm-benchmark-available-models:
|
||||||
|
|
||||||
Supported models
|
|
||||||
================
|
|
||||||
|
|
||||||
The following models are supported for inference performance benchmarking
|
The following models are supported for inference performance benchmarking
|
||||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
documentation might vary by model -- select one to get started.
|
documentation might vary by model -- select one to get started.
|
||||||
@@ -44,18 +76,18 @@ vLLM inference performance testing
|
|||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col-2 me-2 model-param-head">Model group</div>
|
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="row mt-1">
|
<div class="row mt-1">
|
||||||
<div class="col-2 me-2 model-param-head">Model</div>
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% set models = model_group.models %}
|
{% set models = model_group.models %}
|
||||||
{% for model in models %}
|
{% for model in models %}
|
||||||
@@ -66,8 +98,8 @@ vLLM inference performance testing
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
.. _vllm-benchmark-vllm:
|
.. _vllm-benchmark-vllm:
|
||||||
@@ -85,56 +117,48 @@ vLLM inference performance testing
|
|||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
more information.
|
more information.
|
||||||
|
|
||||||
.. _vllm-benchmark-performance-measurements:
|
.. _vllm-benchmark-performance-measurements:
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
|
|
||||||
To evaluate performance, the
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
page provides reference throughput and latency measurements for inferencing popular AI models.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
page provides reference throughput and latency measurements for inferencing popular AI models.
|
only reflects the latest version of this inference benchmarking environment.
|
||||||
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
.. important::
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
The performance data presented in
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
correctly and performing optimally.
|
||||||
only reflects the latest version of this inference benchmarking environment.
|
|
||||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
|
||||||
|
|
||||||
Advanced features and known issues
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
==================================
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before running the benchmarks.
|
||||||
|
|
||||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
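As a quick sanity check (a sketch only; the full procedure is in the linked guide), you can confirm that automatic NUMA balancing is disabled before benchmarking:

.. code-block:: shell

   # Returns 0 when automatic NUMA balancing is disabled.
   cat /proc/sys/kernel/numa_balancing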
|
||||||
|
|
||||||
System validation
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
=================
|
|
||||||
|
|
||||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
correctly and performing optimally.
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
|
|
||||||
might hang until the periodic balancing is finalized. For more information,
|
|
||||||
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
# disable automatic NUMA balancing
|
|
||||||
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
|
||||||
# check if NUMA balancing is disabled (returns 0 if disabled)
|
|
||||||
cat /proc/sys/kernel/numa_balancing
|
|
||||||
0
|
|
||||||
|
|
||||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
|
||||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
|
||||||
system's configuration.
|
|
||||||
|
|
||||||
Pull the Docker image
|
Pull the Docker image
|
||||||
=====================
|
=====================
|
||||||
@@ -163,22 +187,26 @@ vLLM inference performance testing
|
|||||||
|
|
||||||
.. tab-item:: MAD-integrated benchmarking
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
directory and install the required packages on the host machine.
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
git clone https://github.com/ROCm/MAD
|
git clone https://github.com/ROCm/MAD
|
||||||
cd MAD
|
cd MAD
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||||
using one GPU with the ``{{model.precision}}`` data type on the host machine.
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
|
python3 tools/run_models.py \
|
||||||
|
--tags {{model.mad_tag}} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
MAD launches a Docker container with the name
|
MAD launches a Docker container with the name
|
||||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||||
@@ -209,93 +237,125 @@ vLLM inference performance testing
|
|||||||
|
|
||||||
.. tab-item:: Standalone benchmarking
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
Run the vLLM benchmark tool independently by starting the
|
.. rubric:: Download the Docker image and required scripts
|
||||||
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
|
||||||
as shown in the following snippet.
|
|
||||||
|
|
||||||
.. code-block::
|
1. Run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||||
|
as shown in the following snippet.
|
||||||
|
|
||||||
docker pull {{ unified_docker.pull_tag }}
|
.. code-block:: shell
|
||||||
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
|
|
||||||
|
|
||||||
In the Docker container, clone the ROCm MAD repository and navigate to the
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
docker run -it \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--device=/dev/dri \
|
||||||
|
--group-add video \
|
||||||
|
--shm-size 16G \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--security-opt apparmor=unconfined \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
-v $(pwd):/workspace \
|
||||||
|
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||||
|
--name test \
|
||||||
|
{{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
.. code-block::
|
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||||
|
|
||||||
git clone https://github.com/ROCm/MAD
|
.. code-block:: shell
|
||||||
cd MAD/scripts/vllm
|
|
||||||
|
|
||||||
To start the benchmark, use the following command with the appropriate options.
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/vllm
|
||||||
|
|
||||||
.. code-block::
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
|
.. dropdown:: Benchmark options
|
||||||
|
:open:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
:align: center
|
:align: center
|
||||||
|
|
||||||
* - Name
|
* - Name
|
||||||
- Options
|
- Options
|
||||||
- Description
|
- Description
|
||||||
|
|
||||||
* - ``$test_option``
|
* - ``$test_option``
|
||||||
- latency
|
- latency
|
||||||
- Measure decoding token latency
|
- Measure decoding token latency
|
||||||
|
|
||||||
* -
|
* -
|
||||||
- throughput
|
- throughput
|
||||||
- Measure token generation throughput
|
- Measure token generation throughput
|
||||||
|
|
||||||
* -
|
* -
|
||||||
- all
|
- all
|
||||||
- Measure both throughput and latency
|
- Measure both throughput and latency
|
||||||
|
|
||||||
* - ``$num_gpu``
|
* - ``$num_gpu``
|
||||||
- 1 or 8
|
- 1 or 8
|
||||||
- Number of GPUs
|
- Number of GPUs
|
||||||
|
|
||||||
* - ``$datatype``
|
* - ``$datatype``
|
||||||
- ``float16`` or ``float8``
|
- ``float16`` or ``float8``
|
||||||
- Data type
|
- Data type
|
||||||
|
|
||||||
.. note::
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
Command:
|
||||||
already configured. You don't need to specify them with this script.
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
If you encounter the following error, pass your access-authorized Hugging
|
|
||||||
Face token to the gated models.
|
|
||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
OSError: You are trying to access a gated repo.
|
./vllm_benchmark_report.sh \
|
||||||
|
-s $test_option \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g $num_gpu \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
# pass your HF_TOKEN
|
.. note::
|
||||||
export HF_TOKEN=$your_personal_hf_token
|
|
||||||
|
|
||||||
Here are some examples of running the benchmark with various options.
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
||||||
|
|
||||||
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
|
Face token to the gated models.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
OSError: You are trying to access a gated repo.
|
||||||
|
|
||||||
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
|
||||||
|
|
||||||
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
* Latency benchmark
|
* Latency benchmark
|
||||||
|
|
||||||
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
|
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
./vllm_benchmark_report.sh \
|
||||||
|
-s latency \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||||
|
|
||||||
* Throughput benchmark
|
* Throughput benchmark
|
||||||
|
|
||||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
|
./vllm_benchmark_report.sh \
|
||||||
|
-s throughput \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
||||||
|
|
||||||
@@ -318,6 +378,41 @@ vLLM inference performance testing
|
|||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
|
Advanced usage
|
||||||
|
==============
|
||||||
|
|
||||||
|
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||||
|
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||||
|
|
||||||
|
Reproducing the Docker image
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||||
|
|
||||||
|
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/vllm.git
|
||||||
|
|
||||||
|
2. Check out the specific release commit.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd vllm
|
||||||
|
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
||||||
|
|
||||||
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||||
|
|
||||||
|
Known issues and workarounds
|
||||||
|
============================
|
||||||
|
|
||||||
|
AITER does not support FP8 KV cache yet.
|
||||||
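If an FP8 run hits this limitation, one possible workaround (an assumption based on the benchmark options above, not an official recommendation) is to keep the run in the 16-bit data type:

.. code-block:: shell

   # float16 keeps the KV cache out of the unsupported FP8 path
   ./vllm_benchmark_report.sh -s all -m {{model.model_repo}} -g 8 -d float16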
|
|
||||||
Further reading
|
Further reading
|
||||||
===============
|
===============
|
||||||
|
|
||||||
@@ -325,22 +420,22 @@ Further reading
|
|||||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||||
|
|
||||||
- To learn more about system settings and management practices to configure your system for
|
- To learn more about system settings and management practices to configure your system for
|
||||||
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
|
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
|
||||||
|
|
||||||
- For application performance optimization strategies for HPC and AI workloads,
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
including inference with vLLM, see :doc:`../../inference-optimization/workload`.
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
- To learn how to run LLM models from Hugging Face or your own model, see
|
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||||
:doc:`Running models from Hugging Face <../hugging-face-models>`.
|
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||||
|
|
||||||
- To learn how to optimize inference on LLMs, see
|
- To learn how to fine-tune LLMs and optimize inference, see
|
||||||
:doc:`Inference optimization <../../inference-optimization/index>`.
|
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||||
|
|
||||||
- To learn how to fine-tune LLMs, see
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
:doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
Previous versions
|
Previous versions
|
||||||
=================
|
=================
|
||||||
|
|
||||||
See :doc:`previous-versions/vllm-history` to find documentation for previous releases
|
See :doc:`previous-versions/vllm-history` to find documentation for previous releases
|
||||||
of the ``ROCm/vllm`` Docker image.
|
of the ``ROCm/vllm`` Docker image.
|
||||||
|
|||||||
@@ -14,14 +14,14 @@ Throughout the following topics, this section provides a comprehensive guide to
|
|||||||
The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
|
The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
|
||||||
training, fine-tuning, and inference. It leverages popular machine learning frameworks on AMD GPUs.
|
training, fine-tuning, and inference. It leverages popular machine learning frameworks on AMD GPUs.
|
||||||
|
|
||||||
- :doc:`Installing ROCm and machine learning frameworks <install>`
|
- :doc:`Installing ROCm and machine learning frameworks <../install>`
|
||||||
|
|
||||||
- :doc:`Running models from Hugging Face <hugging-face-models>`
|
- :doc:`Running models from Hugging Face <hugging-face-models>`
|
||||||
|
|
||||||
- :doc:`LLM inference frameworks <llm-inference-frameworks>`
|
- :doc:`LLM inference frameworks <llm-inference-frameworks>`
|
||||||
|
|
||||||
- :doc:`vLLM inference performance testing <vllm-benchmark>`
|
- :doc:`vLLM inference performance testing <benchmark-docker/vllm>`
|
||||||
|
|
||||||
- :doc:`PyTorch inference performance testing <pytorch-inference-benchmark>`
|
- :doc:`PyTorch inference performance testing <benchmark-docker/pytorch-inference>`
|
||||||
|
|
||||||
- :doc:`Deploying your model <deploy-your-model>`
|
- :doc:`Deploying your model <deploy-your-model>`
|
||||||
|
|||||||
@@ -141,7 +141,7 @@ Installing vLLM
|
|||||||
|
|
||||||
ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
|
ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
|
||||||
on the MI300X accelerator. The Docker image includes ROCm, vLLM, and PyTorch.
|
on the MI300X accelerator. The Docker image includes ROCm, vLLM, and PyTorch.
|
||||||
For more information, see :doc:`vllm-benchmark`.
|
For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
|
||||||
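As a quick sketch, pulling the image looks like the following; the tag here is a placeholder, and the exact tag is listed in the linked benchmarking documentation.

.. code-block:: shell

   # Replace <tag> with the image tag given in the vLLM benchmarking documentation
   docker pull rocm/vllm:<tag>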
|
|
||||||
.. _fine-tuning-llms-tgi:
|
.. _fine-tuning-llms-tgi:
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,8 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install
|
|||||||
|
|
||||||
* :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`
|
* :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`
|
||||||
|
|
||||||
|
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/install-methods/amdgpu-installer-index>`
|
||||||
|
|
||||||
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`
|
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`
|
||||||
|
|
||||||
.. grid:: 1
|
.. grid:: 1
|
||||||
|
|||||||
@@ -15,57 +15,51 @@ purpose-built to support models like Llama, DeepSeek, and Mixtral,
|
|||||||
enabling developers to train next-generation AI models more
|
enabling developers to train next-generation AI models more
|
||||||
efficiently.
|
efficiently.
|
||||||
|
|
||||||
AMD provides a ready-to-use Docker image for MI300X series accelerators containing
|
AMD provides ready-to-use Docker images for MI300X series accelerators containing
|
||||||
essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
||||||
utilities. It contains the following software components to accelerate training
|
utilities. Each image contains the following software components to accelerate training
|
||||||
workloads:
|
workloads:
|
||||||
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| Software component | Version |
|
|
||||||
+==========================+================================+
|
|
||||||
| ROCm | 6.3.4 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| PyTorch | 2.8.0a0+gite2f9759 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| Python | 3.12 or 3.10 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| Transformer Engine | 1.13.0+bb061ade |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| Flash Attention | 3.0.0 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| hipBLASLt | 0.13.0-4f18bf6 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| Triton | 3.3.0 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| RCCL | 2.22.3 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
|
|
||||||
Megatron-LM provides the following key features to train large language models efficiently:
|
|
||||||
|
|
||||||
- Transformer Engine (TE)
|
|
||||||
|
|
||||||
- APEX
|
|
||||||
|
|
||||||
- GEMM tuning
|
|
||||||
|
|
||||||
- Torch.compile
|
|
||||||
|
|
||||||
- 3D parallelism: TP + SP + CP
|
|
||||||
|
|
||||||
- Distributed optimizer
|
|
||||||
|
|
||||||
- Flash Attention (FA) 3
|
|
||||||
|
|
||||||
- Fused kernels
|
|
||||||
|
|
||||||
- Pre-training
|
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% if dockers|length > 1 %}
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
{% for docker in data.dockers %}
|
||||||
|
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||||
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
{% elif dockers|length == 1 %}
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. _amd-megatron-lm-model-support:
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
|
|
||||||
@@ -73,8 +67,7 @@ The following models are pre-optimized for performance on AMD Instinct MI300X se
|
|||||||
Some instructions, commands, and training recommendations in this documentation might
|
Some instructions, commands, and training recommendations in this documentation might
|
||||||
vary by model -- select one to get started.
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
{% set model_groups = data["megatron-lm_benchmark"].model_groups %}
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
@@ -82,7 +75,7 @@ The following models are pre-optimized for performance on AMD Instinct MI300X se
|
|||||||
<div class="col-2 me-2 model-param-head">Model</div>
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -115,14 +108,14 @@ Performance measurements
|
|||||||
========================
|
========================
|
||||||
|
|
||||||
To evaluate performance, the
|
To evaluate performance, the
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`__
|
||||||
page provides reference throughput and latency measurements for training
|
page provides reference throughput and latency measurements for training
|
||||||
popular AI models.
|
popular AI models.
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
|
|
||||||
The performance data presented in
|
The performance data presented in
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`__
|
||||||
only reflects the latest version of this training benchmarking environment.
|
only reflects the latest version of this training benchmarking environment.
|
||||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
@@ -155,27 +148,77 @@ image.
|
|||||||
Download the Docker image
|
Download the Docker image
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
1. Use the following command to pull the Docker image from Docker Hub.
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||||
|
|
||||||
.. tab-set::
|
{% set dockers = data.dockers %}
|
||||||
|
1. Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
.. tab-item:: Ubuntu 24.04 + Python 3.12
|
{% if dockers|length > 1 %}
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
.. code-block:: shell
|
{% for docker in data.dockers %}
|
||||||
|
.. tab-item:: {{ docker.doc_name }}
|
||||||
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
docker pull rocm/megatron-lm:v25.5_py312
|
.. code-block:: shell
|
||||||
|
|
||||||
.. tab-item:: Ubuntu 22.04 + Python 3.10
|
docker pull {{ docker.pull_tag }}
|
||||||
|
|
||||||
.. code-block:: shell
|
{% endfor %}
|
||||||
|
{% elif dockers|length == 1 %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
docker pull rocm/megatron-lm:v25.5_py310
|
docker pull {{ docker.pull_tag }}
|
||||||
|
|
||||||
2. Launch the Docker container.
|
{% endif %}
|
||||||
|
2. Launch the Docker container.
|
||||||
|
|
||||||
.. code-block:: shell
|
{% if dockers|length > 1 %}
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.5
|
{% for docker in data.dockers %}
|
||||||
|
.. tab-item:: {{ docker.doc_name }}
|
||||||
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it \
|
||||||
|
--device /dev/dri \
|
||||||
|
--device /dev/kfd \
|
||||||
|
--device /dev/infiniband \
|
||||||
|
--network host --ipc host \
|
||||||
|
--group-add video \
|
||||||
|
--cap-add SYS_PTRACE \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--privileged \
|
||||||
|
-v $HOME:$HOME \
|
||||||
|
-v $HOME/.ssh:/root/.ssh \
|
||||||
|
--shm-size 128G \
|
||||||
|
--name megatron_training_env \
|
||||||
|
{{ docker.pull_tag }}
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% elif dockers|length == 1 %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it \
|
||||||
|
--device /dev/dri \
|
||||||
|
--device /dev/kfd \
|
||||||
|
--device /dev/infiniband \
|
||||||
|
--network host --ipc host \
|
||||||
|
--group-add video \
|
||||||
|
--cap-add SYS_PTRACE \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--privileged \
|
||||||
|
-v $HOME:$HOME \
|
||||||
|
-v $HOME/.ssh:/root/.ssh \
|
||||||
|
--shm-size 128G \
|
||||||
|
--name megatron_training_env \
|
||||||
|
{{ docker.pull_tag }}
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||||
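A minimal sketch using the standard Docker CLI and the container name from the ``docker run`` step above:

.. code-block:: shell

   docker start megatron_training_env          # restart the container if it was stopped
   docker exec -it megatron_training_env bash  # open an interactive shell inside it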
|
|
||||||
@@ -333,6 +376,22 @@ If the tokenizer is not found, it'll be downloaded if publicly available.
|
|||||||
|
|
||||||
TOKENIZER_MODEL=tokenizer/tokenizer.model
|
TOKENIZER_MODEL=tokenizer/tokenizer.model
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b
|
||||||
|
|
||||||
|
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TOKENIZER_MODEL="Qwen/Qwen2.5-7B"
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b
|
||||||
|
|
||||||
|
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TOKENIZER_MODEL="Qwen/Qwen2.5-72B"
|
||||||
|
|
||||||
Dataset options
|
Dataset options
|
||||||
---------------
|
---------------
|
||||||
|
|
||||||
@@ -358,7 +417,7 @@ You can use either mock data or real data for training.
|
|||||||
Download the dataset
|
Download the dataset
|
||||||
^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b pyt_megatron_lm_train_llama-3.1-70b-proxy
|
||||||
|
|
||||||
For Llama models, use the `prepare_dataset.sh
|
For Llama models, use the `prepare_dataset.sh
|
||||||
<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`_ script
|
<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`_ script
|
||||||
@@ -397,8 +456,8 @@ Download the dataset
|
|||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
|
cd ..
|
||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
|
bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3
|
||||||
|
|
||||||
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||||
|
|
||||||
@@ -422,8 +481,8 @@ Download the dataset
|
|||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
|
cd ..
|
||||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
|
bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3
|
||||||
|
|
||||||
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||||
|
|
||||||
@@ -433,8 +492,6 @@ Download the dataset
|
|||||||
|
|
||||||
DATA_DIR="<path-to>/deepseek-datasets" # Change to where your dataset is stored
|
DATA_DIR="<path-to>/deepseek-datasets" # Change to where your dataset is stored
|
||||||
|
|
||||||
Ensure that the files are accessible inside the Docker container.
|
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
If you don't already have the dataset, download the Mixtral dataset using the following
|
If you don't already have the dataset, download the Mixtral dataset using the following
|
||||||
@@ -457,6 +514,27 @@ Download the dataset
|
|||||||
|
|
||||||
Ensure that the files are accessible inside the Docker container.
|
Ensure that the files are accessible inside the Docker container.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b pyt_megatron_lm_train_qwen2.5-72b
|
||||||
|
|
||||||
|
If you don't already have the dataset, download the Qwen dataset using the following
|
||||||
|
commands:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
mkdir -p temp/qwen-datasets
|
||||||
|
cd temp/qwen-datasets
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.bin
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.idx
|
||||||
|
|
||||||
|
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
MOCK_DATA=0 # Train on real data
|
||||||
|
|
||||||
|
DATA_DIR="<path-to>/qwen-datasets" # Change to where your dataset is stored
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container.
|
||||||
|
|
||||||
Multi-node configuration
|
Multi-node configuration
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
@@ -497,27 +575,17 @@ also be passed as command line arguments. Refer to the following example configu
|
|||||||
# Specify which RDMA interfaces to use for communication
|
# Specify which RDMA interfaces to use for communication
|
||||||
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||||
|
|
||||||
Getting started
|
|
||||||
===============
|
|
||||||
|
|
||||||
The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
|
|
||||||
system performance, conduct training benchmarks, and achieve superior
|
|
||||||
performance for models like Llama, DeepSeek, and Mixtral. This container should not be
|
|
||||||
expected to provide generalized performance across all training workloads. You
|
|
||||||
can expect the container to perform in the model configurations described in
|
|
||||||
the following section, but other configurations are not validated by AMD.
|
|
||||||
|
|
||||||
.. _amd-megatron-lm-run-training:
|
.. _amd-megatron-lm-run-training:
|
||||||
|
|
||||||
Run training
|
Run training
|
||||||
------------
|
============
|
||||||
|
|
||||||
Use the following example commands to set up the environment, configure
|
Use the following example commands to set up the environment, configure
|
||||||
:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
|
:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
|
||||||
MI300X series accelerators with the AMD Megatron-LM environment.
|
MI300X series accelerators with the AMD Megatron-LM environment.
|
||||||
|
|
||||||
Single node training
|
Single node training
|
||||||
^^^^^^^^^^^^^^^^^^^^
|
--------------------
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
@@ -526,7 +594,20 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 RECOMPUTE=1 SEQ_LENGTH=8192 MBS=2 BS=16 TE_FP8=0 TP=1 PP=1 FSDP=1 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
|
TOKENIZER_MODEL=meta-llama/Llama-3.3-70B-Instruct \
|
||||||
|
CKPT_FORMAT=torch_dist \
|
||||||
|
TEE_OUTPUT=1 \
|
||||||
|
RECOMPUTE=1 \
|
||||||
|
SEQ_LENGTH=8192 \
|
||||||
|
MBS=2 \
|
||||||
|
BS=16 \
|
||||||
|
TE_FP8=0 \
|
||||||
|
TP=1 \
|
||||||
|
PP=1 \
|
||||||
|
FSDP=1 \
|
||||||
|
MODEL_SIZE=70 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
@@ -535,8 +616,6 @@ Single node training
|
|||||||
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
||||||
or FP16.
|
or FP16.
|
||||||
|
|
||||||
Currently, FSDP is only compatible with BF16 precision.
|
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the
|
To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the
|
||||||
@@ -544,13 +623,29 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=2 \
|
||||||
|
BS=128 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=1 \
|
||||||
|
SEQ_LENGTH=8192 \
|
||||||
|
MODEL_SIZE=8 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
For Llama 3.1 8B BF16, use the following command:
|
For Llama 3.1 8B BF16, use the following command:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=2 \
|
||||||
|
BS=128 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=0 \
|
||||||
|
SEQ_LENGTH=8192 \
|
||||||
|
MODEL_SIZE=8 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
@@ -559,7 +654,18 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
|
CKPT_FORMAT=torch_dist \
|
||||||
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=3 \
|
||||||
|
BS=24 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=0 \
|
||||||
|
FSDP=1 \
|
||||||
|
RECOMPUTE=1 \
|
||||||
|
SEQ_LENGTH=8192 \
|
||||||
|
MODEL_SIZE=70 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
@@ -568,7 +674,36 @@ Single node training
|
|||||||
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
||||||
or FP16.
|
or FP16.
|
||||||
|
|
||||||
Currently, FSDP is only compatible with BF16 precision.
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b-proxy
|
||||||
|
|
||||||
|
To run the training on a single node for Llama 3.1 70B with proxy, use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
CKPT_FORMAT=torch_dist \
|
||||||
|
TEE_OUTPUT=1 \
|
||||||
|
RECOMPUTE=1 \
|
||||||
|
MBS=3 \
|
||||||
|
BS=24 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=1 \
|
||||||
|
SEQ_LENGTH=8192 \
|
||||||
|
MODEL_SIZE=70 \
|
||||||
|
FSDP=1 \
|
||||||
|
TOTAL_ITERS=10 \
|
||||||
|
NUM_LAYERS=40 \
|
||||||
|
bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
It is suggested to use ``TP=1`` when FSDP is enabled for higher
|
||||||
|
throughput. FSDP-v2 is not supported with pipeline parallelism, expert
|
||||||
|
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
||||||
|
or FP16.
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b
|
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b
|
||||||
|
|
||||||
@@ -577,13 +712,29 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=4 \
|
||||||
|
BS=256 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=1 \
|
||||||
|
SEQ_LENGTH=4096 \
|
||||||
|
MODEL_SIZE=7 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
For Llama 2 7B BF16, use the following command:
|
For Llama 2 7B BF16, use the following command:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=4 \
|
||||||
|
BS=256 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=0 \
|
||||||
|
SEQ_LENGTH=4096 \
|
||||||
|
MODEL_SIZE=7 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_llama-2-70b
|
.. container:: model-doc pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
@@ -592,7 +743,18 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 MBS=7 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
|
CKPT_FORMAT=torch_dist \
|
||||||
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=7 \
|
||||||
|
BS=56 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=0 \
|
||||||
|
FSDP=1 \
|
||||||
|
RECOMPUTE=1 \
|
||||||
|
SEQ_LENGTH=4096 \
|
||||||
|
MODEL_SIZE=70 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
@@ -601,8 +763,6 @@ Single node training
|
|||||||
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
||||||
or FP16.
|
or FP16.
|
||||||
|
|
||||||
Currently, FSDP is only compatible with BF16 precision.
|
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
|
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
|
||||||
@@ -610,7 +770,8 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
FORCE_BANLANCE=true \
|
export NVTE_FUSED_ATTN_CK=0
|
||||||
|
FORCE_BALANCE=true \
|
||||||
RUN_ENV=cluster \
|
RUN_ENV=cluster \
|
||||||
MODEL_SIZE=671B \
|
MODEL_SIZE=671B \
|
||||||
TRAIN_ITERS=50 \
|
TRAIN_ITERS=50 \
|
||||||
@@ -632,7 +793,15 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh
|
export NVTE_FUSED_ATTN_CK=0
|
||||||
|
GEMM_TUNING=1 \
|
||||||
|
PR=bf16 \
|
||||||
|
MBS=4 \
|
||||||
|
AC=none \
|
||||||
|
SEQ_LEN=4096 \
|
||||||
|
PAD_LEN=4096 \
|
||||||
|
TRAIN_ITERS=50 \
|
||||||
|
bash examples/deepseek_v2/train_deepseekv2.sh
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b
|
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b
|
||||||
|
|
||||||
@@ -641,7 +810,24 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
RECOMPUTE_NUM_LAYERS=0 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=none PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=4096 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x7B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh
|
TOKENIZER_MODEL=<path/to/tokenizer/model> \
|
||||||
|
RECOMPUTE_NUM_LAYERS=0 \
|
||||||
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=1 \
|
||||||
|
GBS=16 \
|
||||||
|
TP_SIZE=1 \
|
||||||
|
PP_SIZE=1 \
|
||||||
|
AC=none \
|
||||||
|
PR=bf16 \
|
||||||
|
EP_SIZE=8 \
|
||||||
|
ETP_SIZE=1 \
|
||||||
|
SEQLEN=4096 \
|
||||||
|
FORCE_BALANCE=true \
|
||||||
|
MOCK_DATA=1 \
|
||||||
|
RUN_ENV=cluster \
|
||||||
|
MODEL_SIZE=8x7B \
|
||||||
|
TRAIN_ITERS=50 \
|
||||||
|
bash examples/mixtral/train_mixtral_moe.sh
|
||||||
|
|
||||||
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy
|
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
@@ -650,10 +836,85 @@ Single node training
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
RECOMPUTE_NUM_LAYERS=4 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=full NUM_LAYERS=4 PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=8192 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x22B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh
|
TOKENIZER_MODEL=<path/to/tokenizer/model> \
|
||||||
|
RECOMPUTE_NUM_LAYERS=4 \
|
||||||
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=1 \
|
||||||
|
GBS=16 \
|
||||||
|
TP_SIZE=1 \
|
||||||
|
PP_SIZE=1 \
|
||||||
|
AC=full \
|
||||||
|
NUM_LAYERS=4 \
|
||||||
|
PR=bf16 \
|
||||||
|
EP_SIZE=8 \
|
||||||
|
ETP_SIZE=1 \
|
||||||
|
SEQLEN=8192 \
|
||||||
|
FORCE_BALANCE=true \
|
||||||
|
MOCK_DATA=1 \
|
||||||
|
RUN_ENV=cluster \
|
||||||
|
MODEL_SIZE=8x22B \
|
||||||
|
TRAIN_ITERS=50 \
|
||||||
|
bash examples/mixtral/train_mixtral_moe.sh
|
||||||
|
|
||||||
Multi-node training
|
.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b
|
||||||
^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
To run training on a single node for Qwen 2.5 7B BF16, use the following
|
||||||
|
command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
bash examples/qwen/train_qwen2.sh \
|
||||||
|
TP=1 \
|
||||||
|
CP=1 \
|
||||||
|
PP=1 \
|
||||||
|
MBS=10 \
|
||||||
|
BS=640 \
|
||||||
|
TE_FP8=0 \
|
||||||
|
MODEL_SIZE=7 \
|
||||||
|
SEQ_LENGTH=2048 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
MOCK_DATA=1 \
|
||||||
|
TOKENIZER_MODEL=Qwen/Qwen2.5-7B
|
||||||
|
|
||||||
|
For FP8, use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
bash examples/qwen/train_qwen2.sh \
|
||||||
|
TP=1 \
|
||||||
|
CP=1 \
|
||||||
|
PP=1 \
|
||||||
|
MBS=10 \
|
||||||
|
BS=640 \
|
||||||
|
TE_FP8=1 \
|
||||||
|
MODEL_SIZE=7 \
|
||||||
|
SEQ_LENGTH=2048 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
MOCK_DATA=1 \
|
||||||
|
TOKENIZER_MODEL=Qwen/Qwen2.5-7B
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b
|
||||||
|
|
||||||
|
To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
bash examples/qwen/train_qwen2.sh \
|
||||||
|
FSDP=1 \
|
||||||
|
CP=1 \
|
||||||
|
PP=1 \
|
||||||
|
MBS=3 \
|
||||||
|
BS=24 \
|
||||||
|
TE_FP8=0 \
|
||||||
|
MODEL_SIZE=72 \
|
||||||
|
SEQ_LENGTH=2048 \
|
||||||
|
TOTAL_ITERS=50 \
|
||||||
|
MOCK_DATA=1 \
|
||||||
|
TOKENIZER_MODEL=Qwen/Qwen2.5-72B \
|
||||||
|
RECOMPUTE_ACTIVATIONS=full \
|
||||||
|
CKPT_FORMAT=torch_dist
|
||||||
|
|
||||||
|
Multi-node training examples
|
||||||
|
----------------------------
|
||||||
|
|
||||||
To run training on multiple nodes, launch the Docker container on each node.
|
To run training on multiple nodes, launch the Docker container on each node.
|
||||||
For example, for Llama 3 using a two node setup (``NODE0`` as the master node),
|
For example, for Llama 3 using a two node setup (``NODE0`` as the master node),
|
||||||
@@ -663,13 +924,33 @@ use these commands.
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=2 \
|
||||||
|
BS=256 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=1 \
|
||||||
|
SEQ_LENGTH=8192 \
|
||||||
|
MODEL_SIZE=8 \
|
||||||
|
MASTER_ADDR=IP_NODE0 \
|
||||||
|
NNODES=2 \
|
||||||
|
NODE_RANK=0 \
|
||||||
|
bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
* On the worker node ``NODE1``:
|
* On the worker node ``NODE1``:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
|
TEE_OUTPUT=1 \
|
||||||
|
MBS=2 \
|
||||||
|
BS=256 \
|
||||||
|
TP=1 \
|
||||||
|
TE_FP8=1 \
|
||||||
|
SEQ_LENGTH=8192 \
|
||||||
|
MODEL_SIZE=8 \
|
||||||
|
MASTER_ADDR=IP_NODE0 \
|
||||||
|
NNODES=2 \
|
||||||
|
NODE_RANK=1 \
|
||||||
|
bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is
|
Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is
|
||||||
provided in
|
provided in
|
||||||
|
|||||||
@@ -12,23 +12,23 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
|
|||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
:stub-columns: 1
|
|
||||||
|
|
||||||
* - Image version
|
* - Image version
|
||||||
- ROCm version
|
- Components
|
||||||
- JAX version
|
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - 25.5
|
* - 25.5 (latest)
|
||||||
- 6.3.4
|
-
|
||||||
- 0.4.35
|
* ROCm 6.3.4
|
||||||
|
* JAX 0.4.35
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../jax-maxtext>`
|
* :doc:`Documentation <../jax-maxtext>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__
|
||||||
|
|
||||||
* - 25.4
|
* - 25.4
|
||||||
- 6.3.0
|
-
|
||||||
- 0.4.31
|
* ROCm 6.3.0
|
||||||
|
* JAX 0.4.31
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <jax-maxtext-v25.4>`
|
* :doc:`Documentation <jax-maxtext-v25.4>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`__
|
||||||
|
|||||||
@@ -96,14 +96,14 @@ This Docker image is optimized for specific model configurations outlined
|
|||||||
as follows. Performance can vary for other training workloads, as AMD
|
as follows. Performance can vary for other training workloads, as AMD
|
||||||
doesn’t validate configurations and run conditions outside those described.
|
doesn’t validate configurations and run conditions outside those described.
|
||||||
|
|
||||||
.. _amd-maxtext-multi-node-setup:
|
.. _amd-maxtext-multi-node-setup-v254:
|
||||||
|
|
||||||
Multi-node setup
|
Multi-node setup
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
For multi-node environments, ensure you have all the necessary packages for
|
For multi-node environments, ensure you have all the necessary packages for
|
||||||
your network device, such as RDMA. If you're not using a multi-node setup
|
your network device, such as RDMA. If you're not using a multi-node setup
|
||||||
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v254`.
|
||||||
|
|
||||||
1. Install the following packages to build and install the RDMA driver.
|
1. Install the following packages to build and install the RDMA driver.
|
||||||
|
|
||||||
@@ -168,7 +168,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
|||||||
|
|
||||||
e. RDMA interface
|
e. RDMA interface
|
||||||
|
|
||||||
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
|
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v254>` are installed on all nodes.
|
||||||
Then, set the RDMA interfaces to use for communication.
|
Then, set the RDMA interfaces to use for communication.
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
@@ -178,7 +178,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
|||||||
# If using Mellanox NIC
|
# If using Mellanox NIC
|
||||||
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
|
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
|
||||||
|
|
||||||
.. _amd-maxtext-download-docker:
|
.. _amd-maxtext-download-docker-v254:
|
||||||
|
|
||||||
Download the Docker image
|
Download the Docker image
|
||||||
-------------------------
|
-------------------------
|
||||||
@@ -195,7 +195,7 @@ Download the Docker image
|
|||||||
|
|
||||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.4
|
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.4
|
||||||
|
|
||||||
.. _amd-maxtext-get-started:
|
.. _amd-maxtext-get-started-v254:
|
||||||
|
|
||||||
Getting started
|
Getting started
|
||||||
===============
|
===============
|
||||||
|
|||||||
@@ -7,41 +7,53 @@ Megatron-LM training performance testing version history
|
|||||||
This table lists previous versions of the ROCm Megatron-LM training Docker image for
|
This table lists previous versions of the ROCm Megatron-LM training Docker image for
|
||||||
training performance testing. For detailed information about available models
|
training performance testing. For detailed information about available models
|
||||||
for benchmarking, see the version-specific documentation. You can find tagged
|
for benchmarking, see the version-specific documentation. You can find tagged
|
||||||
previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/megatron-lm/tags>`_.
|
previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/megatron-lm/tags>`__.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
:stub-columns: 1
|
|
||||||
|
|
||||||
* - Image version
|
* - Image version
|
||||||
- ROCm version
|
- Components
|
||||||
- PyTorch version
|
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - v25.5
|
* - v25.6 (latest)
|
||||||
- 6.3.4
|
|
||||||
- 2.8.0a0+gite2f9759
|
|
||||||
-
|
-
|
||||||
* `Documentation <../megatron-lm>`_
|
* ROCm 6.4.1
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`_
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../megatron-lm>`
|
||||||
|
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
||||||
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
||||||
|
|
||||||
|
* - v25.5
|
||||||
|
-
|
||||||
|
* ROCm 6.3.4
|
||||||
|
* PyTorch 2.8.0a0+gite2f9759
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <megatron-lm-v25.5>`
|
||||||
|
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`__
|
||||||
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py310/images/sha256-743fbf1ceff7a44c4452f938d783a7abf143737d1c15b2b95f6f8a62e0fd048b>`__
|
||||||
|
|
||||||
* - v25.4
|
* - v25.4
|
||||||
- 6.3.0
|
-
|
||||||
- 2.7.0a0+git637433
|
* ROCm 6.3.0
|
||||||
|
* PyTorch 2.7.0a0+git637433
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <megatron-lm-v25.4>`
|
* :doc:`Documentation <megatron-lm-v25.4>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`__
|
||||||
|
|
||||||
* - v25.3
|
* - v25.3
|
||||||
- 6.3.0
|
-
|
||||||
- 2.7.0a0+git637433
|
* ROCm 6.3.0
|
||||||
|
* PyTorch 2.7.0a0+git637433
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <megatron-lm-v25.3>`
|
* :doc:`Documentation <megatron-lm-v25.3>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`__
|
||||||
|
|
||||||
* - v24.12-dev
|
* - v24.12-dev
|
||||||
- 6.1.0
|
-
|
||||||
- 2.4.0
|
* ROCm 6.1.0
|
||||||
|
* PyTorch 2.4.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <megatron-lm-v24.12-dev>`
|
* :doc:`Documentation <megatron-lm-v24.12-dev>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`__
|
||||||
|
|||||||
@@ -102,7 +102,8 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
|
|||||||
|
|
||||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||||
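To confirm the setting took effect, re-read the same file; it should now report ``0``:

.. code-block:: shell

   cat /proc/sys/kernel/numa_balancing   # expected output: 0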
|
|
||||||
See :ref:`mi300x-disable-numa` for more information.
|
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
|
||||||
|
for more information.
|
||||||
|
|
||||||
Hardware verification with ROCm
|
Hardware verification with ROCm
|
||||||
-------------------------------
|
-------------------------------
|
||||||
@@ -118,7 +119,7 @@ Run the command:
|
|||||||
|
|
||||||
rocm-smi --setperfdeterminism 1900
|
rocm-smi --setperfdeterminism 1900
|
||||||
|
|
||||||
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
|
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.
|
||||||
|
|
||||||
RCCL Bandwidth Test
|
RCCL Bandwidth Test
|
||||||
-------------------
|
-------------------
|
||||||
@@ -171,7 +172,7 @@ Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
|
|||||||
|
|
||||||
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
|
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
|
||||||
|
|
||||||
.. image:: ../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
|
.. image:: /data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
|
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
|
||||||
@@ -181,7 +182,7 @@ recommended. So, a run on 8 GPUs looks something like:
|
|||||||
|
|
||||||
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
|
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
|
||||||
|
|
||||||
.. image:: ../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
|
.. image:: /data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
|
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
|
||||||
@@ -202,10 +203,10 @@ Use the following script to run the RCCL test for four MI300X GPU nodes. Modify
|
|||||||
-x NCCL_DEBUG=version \
|
-x NCCL_DEBUG=version \
|
||||||
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
|
$HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
|
||||||
|
|
||||||
.. image:: ../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
|
.. image:: /data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
.. _mi300x-amd-megatron-lm-training:
|
.. _mi300x-amd-megatron-lm-training-v2412:
|
||||||
|
|
||||||
Start training on MI300X accelerators
|
Start training on MI300X accelerators
|
||||||
=====================================
|
=====================================
|
||||||
@@ -217,7 +218,7 @@ Use the following instructions to set up the environment, configure the script t
|
|||||||
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
||||||
image.
|
image.
|
||||||
|
|
||||||
.. _amd-megatron-lm-requirements:
|
.. _amd-megatron-lm-requirements-v2412:
|
||||||
|
|
||||||
Download the Docker image and required packages
|
Download the Docker image and required packages
|
||||||
-----------------------------------------------
|
-----------------------------------------------
|
||||||
@@ -271,10 +272,10 @@ end-of-document token, remove sentence splitting, and use the tokenizer type.
|
|||||||
In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
|
In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
|
||||||
``my-gpt2_text_document.idx``.
|
``my-gpt2_text_document.idx``.
|
||||||
|
|
||||||
.. image:: ../../data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
|
.. image:: /data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
.. _amd-megatron-lm-environment-setup:
|
.. _amd-megatron-lm-environment-setup-v2412:
|
||||||
|
|
||||||
Environment setup
|
Environment setup
|
||||||
-----------------
|
-----------------
|
||||||
@@ -374,19 +375,19 @@ Run benchmark tests
|
|||||||
|
|
||||||
NODE_RANK="${NODE_RANK:-0}"
|
NODE_RANK="${NODE_RANK:-0}"
|
||||||
|
|
||||||
* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
|
* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars-v2412>`).
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
{variables} bash examples/llama/train_llama2.sh
|
{variables} bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
|
* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars-v2412>`).
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
{variables} bash examples/llama/train_llama3.sh
|
{variables} bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
.. _amd-megatron-lm-benchmark-test-vars:
|
.. _amd-megatron-lm-benchmark-test-vars-v2412:
|
||||||
|
|
||||||
The benchmark tests support the same set of variables:
|
The benchmark tests support the same set of variables:
|
||||||
|
|
||||||
@@ -465,11 +466,11 @@ Benchmarking examples
|
|||||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup-v2412>`.
|
||||||
|
|
||||||
See the sample output:
|
See the sample output:
|
||||||
|
|
||||||
.. image:: ../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
.. tab-item:: Multi node training
|
.. tab-item:: Multi node training
|
||||||
@@ -494,18 +495,18 @@ Benchmarking examples
|
|||||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup-v2412>`.
|
||||||
|
|
||||||
Sample output for 2-node training:
|
Sample output for 2-node training:
|
||||||
|
|
||||||
Master node:
|
Master node:
|
||||||
|
|
||||||
.. image:: ../../data/how-to/rocm-for-ai/2-node-training-master.png
|
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
Worker node:
|
Worker node:
|
||||||
|
|
||||||
.. image:: ../../data/how-to/rocm-for-ai/2-node-training-worker.png
|
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
Previous versions
|
Previous versions
|
||||||
|
|||||||
@@ -111,9 +111,10 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
|
|||||||
|
|
||||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||||
|
|
||||||
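To confirm the change, read the setting back -- a value of ``0`` means NUMA auto-balancing is now disabled:

.. code-block:: shell

   cat /proc/sys/kernel/numa_balancing
   # Expected output: 0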
See :ref:`mi300x-disable-numa` for more information.
|
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
|
||||||
|
for more information.
|
||||||
|
|
||||||
.. _mi300x-amd-megatron-lm-training:
|
.. _mi300x-amd-megatron-lm-training-v253:
|
||||||
|
|
||||||
Environment setup
|
Environment setup
|
||||||
=================
|
=================
|
||||||
@@ -125,7 +126,7 @@ Use the following instructions to set up the environment, configure the script t
|
|||||||
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
|
||||||
image.
|
image.
|
||||||
|
|
||||||
.. _amd-megatron-lm-requirements:
|
.. _amd-megatron-lm-requirements-v253:
|
||||||
|
|
||||||
Download the Docker image
|
Download the Docker image
|
||||||
-------------------------
|
-------------------------
|
||||||
@@ -151,7 +152,7 @@ Download the Docker image
|
|||||||
|
|
||||||
The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
|
The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
|
||||||
|
|
||||||
.. _amd-megatron-lm-environment-setup:
|
.. _amd-megatron-lm-environment-setup-v253:
|
||||||
|
|
||||||
Configuration scripts
|
Configuration scripts
|
||||||
---------------------
|
---------------------
|
||||||
@@ -395,7 +396,7 @@ accelerators with the AMD Megatron-LM Docker image.
|
|||||||
Key options
|
Key options
|
||||||
-----------
|
-----------
|
||||||
|
|
||||||
.. _amd-megatron-lm-benchmark-test-vars:
|
.. _amd-megatron-lm-benchmark-test-vars-v253:
|
||||||
|
|
||||||
The benchmark tests support the following sets of variables:
|
The benchmark tests support the following sets of variables:
|
||||||
|
|
||||||
@@ -485,11 +486,11 @@ Benchmarking examples
|
|||||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup-v253>`.
|
||||||
|
|
||||||
See the sample output:
|
See the sample output:
|
||||||
|
|
||||||
.. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
.. tab-item:: Multi-node training
|
.. tab-item:: Multi-node training
|
||||||
@@ -514,18 +515,18 @@ Benchmarking examples
|
|||||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup-v253>`.
|
||||||
|
|
||||||
Sample output for 2-node training:
|
Sample output for 2-node training:
|
||||||
|
|
||||||
Master node:
|
Master node:
|
||||||
|
|
||||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
|
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
Worker node:
|
Worker node:
|
||||||
|
|
||||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
|
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
Previous versions
|
Previous versions
|
||||||
|
|||||||
@@ -90,21 +90,21 @@ The following models are pre-optimized for performance on AMD Instinct MI300X se
|
|||||||
Some models, such as Llama, require an external license agreement through
|
Some models, such as Llama, require an external license agreement through
|
||||||
a third party (for example, Meta).
|
a third party (for example, Meta).
|
||||||
|
|
||||||
.. _amd-megatron-lm-performance-measurements:
|
.. _amd-megatron-lm-performance-measurements-v254:
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
|
|
||||||
To evaluate performance, the
|
To evaluate performance, the
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`__
|
||||||
page provides reference throughput and latency measurements for training
|
page provides reference throughput and latency measurements for training
|
||||||
popular AI models.
|
popular AI models.
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
|
|
||||||
The performance data presented in
|
The performance data presented in
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`__
|
||||||
only reflects the :doc:`latest version of this training benchmarking environment <../megatron-lm>`_.
|
only reflects the :doc:`latest version of this training benchmarking environment <../megatron-lm>`.
|
||||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
System validation
|
System validation
|
||||||
@@ -115,7 +115,7 @@ auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
|
|||||||
and optimization steps <train-a-model-system-validation>` to set up your system
|
and optimization steps <train-a-model-system-validation>` to set up your system
|
||||||
before starting training.
|
before starting training.
|
||||||
|
|
||||||
.. _mi300x-amd-megatron-lm-training:
|
.. _mi300x-amd-megatron-lm-training-v254:
|
||||||
|
|
||||||
Environment setup
|
Environment setup
|
||||||
=================
|
=================
|
||||||
@@ -127,7 +127,7 @@ Use the following instructions to set up the environment, configure the script t
|
|||||||
reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
|
reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
|
||||||
image.
|
image.
|
||||||
|
|
||||||
.. _amd-megatron-lm-requirements:
|
.. _amd-megatron-lm-requirements-v254:
|
||||||
|
|
||||||
Download the Docker image
|
Download the Docker image
|
||||||
-------------------------
|
-------------------------
|
||||||
@@ -154,7 +154,7 @@ Download the Docker image
|
|||||||
The Docker container includes a pre-installed, verified version of the ROCm Megatron-LM development branch `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__
|
The Docker container includes a pre-installed, verified version of the ROCm Megatron-LM development branch `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__
|
||||||
(commit `fd6f01 <https://github.com/ROCm/Megatron-LM/tree/fd6f0d11d7f9480ace32f22eb7e4dab5314fa350>`_).
|
(commit `fd6f01 <https://github.com/ROCm/Megatron-LM/tree/fd6f0d11d7f9480ace32f22eb7e4dab5314fa350>`_).
|
||||||
|
|
||||||
.. _amd-megatron-lm-environment-setup:
|
.. _amd-megatron-lm-environment-setup-v254:
|
||||||
|
|
||||||
Configuration scripts
|
Configuration scripts
|
||||||
---------------------
|
---------------------
|
||||||
@@ -468,7 +468,7 @@ accelerators with the AMD Megatron-LM Docker image.
|
|||||||
Key options
|
Key options
|
||||||
-----------
|
-----------
|
||||||
|
|
||||||
.. _amd-megatron-lm-benchmark-test-vars:
|
.. _amd-megatron-lm-benchmark-test-vars-v254:
|
||||||
|
|
||||||
The benchmark tests support the following sets of variables:
|
The benchmark tests support the following sets of variables:
|
||||||
|
|
||||||
@@ -568,11 +568,11 @@ Benchmarking examples
|
|||||||
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup-v254>`.
|
||||||
|
|
||||||
See the sample output:
|
See the sample output:
|
||||||
|
|
||||||
.. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
.. tab-item:: Multi-node training
|
.. tab-item:: Multi-node training
|
||||||
@@ -597,18 +597,18 @@ Benchmarking examples
|
|||||||
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1
|
||||||
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
|
You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup-v254>`.
|
||||||
|
|
||||||
Sample output for 2-node training:
|
Sample output for 2-node training:
|
||||||
|
|
||||||
Master node:
|
Master node:
|
||||||
|
|
||||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
|
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
Worker node:
|
Worker node:
|
||||||
|
|
||||||
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
|
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
|
||||||
:width: 800
|
:width: 800
|
||||||
|
|
||||||
Previous versions
|
Previous versions
|
||||||
|
|||||||
@@ -0,0 +1,775 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
|
|
||||||
|
******************************************
|
||||||
|
Training a model with Megatron-LM for ROCm
|
||||||
|
******************************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of ROCm Megatron-LM
|
||||||
|
training performance documentation. See :doc:`../megatron-lm` for the latest version.
|
||||||
|
|
||||||
|
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
||||||
|
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
||||||
|
training of large-scale language models on AMD GPUs. By leveraging AMD
|
||||||
|
Instinct™ MI300X series accelerators, Megatron-LM delivers enhanced
|
||||||
|
scalability, performance, and resource utilization for AI workloads. It is
|
||||||
|
purpose-built to support models like Llama, DeepSeek, and Mixtral,
|
||||||
|
enabling developers to train next-generation AI models more
|
||||||
|
efficiently.
|
||||||
|
|
||||||
|
AMD provides a ready-to-use Docker image for MI300X series accelerators containing
|
||||||
|
essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
||||||
|
utilities. It contains the following software components to accelerate training
|
||||||
|
workloads:
|
||||||
|
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Software component | Version |
|
||||||
|
+==========================+================================+
|
||||||
|
| ROCm | 6.3.4 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| PyTorch | 2.8.0a0+gite2f9759 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Python | 3.12 or 3.10 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Transformer Engine | 1.13.0+bb061ade |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Flash Attention | 3.0.0 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| hipBLASLt | 0.13.0-4f18bf6 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Triton | 3.3.0 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| RCCL | 2.22.3 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
|
||||||
|
Megatron-LM provides the following key features to train large language models efficiently:
|
||||||
|
|
||||||
|
- Transformer Engine (TE)
|
||||||
|
|
||||||
|
- APEX
|
||||||
|
|
||||||
|
- GEMM tuning
|
||||||
|
|
||||||
|
- Torch.compile
|
||||||
|
|
||||||
|
- 3D parallelism: TP + SP + CP
|
||||||
|
|
||||||
|
- Distributed optimizer
|
||||||
|
|
||||||
|
- Flash Attention (FA) 3
|
||||||
|
|
||||||
|
- Fused kernels
|
||||||
|
|
||||||
|
- Pre-training
|
||||||
|
|
||||||
|
.. _amd-megatron-lm-model-support-v255:
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
|
||||||
|
Some instructions, commands, and training recommendations in this documentation might
|
||||||
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
|
{% set model_groups = data["megatron-lm_benchmark"].model_groups %}
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some models, such as Llama, require an external license agreement through
|
||||||
|
a third party (for example, Meta).
|
||||||
|
|
||||||
|
.. _amd-megatron-lm-performance-measurements-v255:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`__
|
||||||
|
page provides reference throughput and latency measurements for training
|
||||||
|
popular AI models.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`__
|
||||||
|
only reflects the latest version of this training benchmarking environment.
|
||||||
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. _mi300x-amd-megatron-lm-training-v255:
|
||||||
|
|
||||||
|
Environment setup
|
||||||
|
=================
|
||||||
|
|
||||||
|
Use the following instructions to set up the environment, configure the script to train models, and
|
||||||
|
reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
|
||||||
|
image.
|
||||||
|
|
||||||
|
.. _amd-megatron-lm-requirements-v255:
|
||||||
|
|
||||||
|
Download the Docker image
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
1. Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: Ubuntu 24.04 + Python 3.12
|
||||||
|
:sync: py312
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull rocm/megatron-lm:v25.5_py312
|
||||||
|
|
||||||
|
.. tab-item:: Ubuntu 22.04 + Python 3.10
|
||||||
|
:sync: py310
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull rocm/megatron-lm:v25.5_py310
|
||||||
|
|
||||||
|
2. Launch the Docker container.
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: Ubuntu 24.04 + Python 3.12
|
||||||
|
:sync: py312
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py312
|
||||||
|
|
||||||
|
|
||||||
|
.. tab-item:: Ubuntu 22.04 + Python 3.10
|
||||||
|
:sync: py310
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py310
|
||||||
|
|
||||||
|
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker start megatron_training_env
|
||||||
|
docker exec -it megatron_training_env bash
|
||||||
|
|
||||||
|
The Docker container includes a pre-installed, verified version of the ROCm
|
||||||
|
Megatron-LM development branch
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
|
||||||
|
training scripts.
|
||||||
|
|
||||||
|
.. _amd-megatron-lm-environment-setup-v255:
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
|
||||||
|
directory of
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||||
|
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
|
||||||
|
directory of
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||||
|
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
|
Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
|
||||||
|
directory of
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
|
||||||
|
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||||
|
|
||||||
|
Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
|
||||||
|
directory of
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
|
||||||
|
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
|
Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
|
||||||
|
directory of
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
|
||||||
|
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v255>` for more information on configuration options.
|
||||||
|
|
||||||
|
Network interface
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
Update the network interface in the script to match your system's network interface. To
|
||||||
|
find your network interface, run the following (outside of any Docker container):
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
ip a
|
||||||
|
|
||||||
|
Look for an active interface that has an IP address in the same subnet as
|
||||||
|
your other nodes. Then, update the following variables in the script, for
|
||||||
|
example:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||||
|
|
||||||
|
export GLOO_SOCKET_IFNAME=ens50f0np0
|
||||||
|
|
||||||
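If you prefer to detect the interface programmatically, one possible approach -- a sketch that assumes the node has a single default IPv4 route -- is to derive the name from ``ip route`` and export both variables from it:

.. code-block:: bash

   # Assumption: the interface carrying the default IPv4 route is the one to use
   IFACE=$(ip -o -4 route show to default | awk '{print $5}' | head -n 1)
   export NCCL_SOCKET_IFNAME="$IFACE"
   export GLOO_SOCKET_IFNAME="$IFACE"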
|
.. _amd-megatron-lm-tokenizer-v255:
|
||||||
|
|
||||||
|
Tokenizer
|
||||||
|
---------
|
||||||
|
|
||||||
|
You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` variable as shown in the following examples.
|
||||||
|
If the tokenizer is not found locally, it is downloaded automatically when publicly available.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
|
If you do not have the Llama 3.3 tokenizer locally, you need to use your
|
||||||
|
personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer.
|
||||||
|
See `Llama-3.3-70B-Instruct
|
||||||
|
<https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_. After you are
|
||||||
|
authorized, use your ``HF_TOKEN`` to download the tokenizer and set the
|
||||||
|
variable ``TOKENIZER_MODEL`` to the tokenizer path.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export HF_TOKEN=<Your personal Hugging Face access token>
|
||||||
|
|
||||||
|
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct"
|
||||||
|
|
||||||
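Alternatively, if the ``huggingface_hub`` command-line tool is available in your environment (it is not guaranteed to be preinstalled in the container), one possible way to pre-download just the tokenizer files and point the script at a local path is:

.. code-block:: shell

   # Assumes huggingface-cli is installed and HF_TOKEN grants access to the gated repository
   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct \
       --include "tokenizer*" "special_tokens_map.json" \
       --local-dir ./llama-3.3-70b-tokenizer
   TOKENIZER_MODEL=./llama-3.3-70b-tokenizer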
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
|
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TOKENIZER_MODEL="meta-llama/Llama-3.1-8B"
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TOKENIZER_MODEL="meta-llama/Llama-3.1-70B"
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
|
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3"
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||||
|
|
||||||
|
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite"
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
|
Download the Mixtral tokenizer.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
mkdir tokenizer
|
||||||
|
cd tokenizer
|
||||||
|
export HF_TOKEN=<Your personal Hugging Face access token>
|
||||||
|
wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model
|
||||||
|
|
||||||
|
Use the ``HuggingFaceTokenizer`` and set ``TOKENIZER_MODEL`` to the path of the downloaded tokenizer.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TOKENIZER_MODEL=tokenizer/tokenizer.model
|
||||||
|
|
||||||
|
Dataset options
|
||||||
|
---------------
|
||||||
|
|
||||||
|
You can use either mock data or real data for training.
|
||||||
|
|
||||||
|
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
|
||||||
|
value is ``1`` (enabled).
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
MOCK_DATA=1
|
||||||
|
|
||||||
|
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
MOCK_DATA=0
|
||||||
|
|
||||||
|
DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container -- a quick check follows this list.
|
||||||
|
|
||||||
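As a quick sanity check, you can list the files from inside the container to confirm they are visible. The path below matches the example above; adjust it to your own location:

.. code-block:: shell

   # Both the .bin and .idx files should appear
   ls /data/bookcorpus_text_sentence*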
|
Download the dataset
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
For Llama models, use the `prepare_dataset.sh
|
||||||
|
<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`_ script
|
||||||
|
to prepare your dataset.
|
||||||
|
To download the dataset, set the ``DATASET`` variable to the dataset you'd
|
||||||
|
like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and
|
||||||
|
``DATASET=bookcorpus``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset
|
||||||
|
DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset
|
||||||
|
|
||||||
|
``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
|
||||||
|
Remember to either pre-download the tokenizer or set up Hugging Face access
|
||||||
|
when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v255>` section.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When training, set ``DATA_PATH`` to the file name prefix of the generated ``.bin`` and ``.idx`` files,
|
||||||
|
as in the following example:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
|
If you don't already have the dataset, download the DeepSeek dataset using the following
|
||||||
|
commands:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
mkdir deepseek-datasets
|
||||||
|
cd deepseek-datasets
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
|
||||||
|
|
||||||
|
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
MOCK_DATA=0 # Train on real data
|
||||||
|
|
||||||
|
DATA_DIR="<path-to>/deepseek-datasets" # Change to where your dataset is stored
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||||
|
|
||||||
|
If you don't already have the dataset, download the DeepSeek dataset using the following
|
||||||
|
commands:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
mkdir deepseek-datasets
|
||||||
|
cd deepseek-datasets
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
|
||||||
|
|
||||||
|
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
MOCK_DATA=0 # Train on real data
|
||||||
|
|
||||||
|
DATA_DIR="<path-to>/deepseek-datasets" # Change to where your dataset is stored
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
|
If you don't already have the dataset, download the Mixtral dataset using the following
|
||||||
|
commands:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
mkdir mixtral-datasets
|
||||||
|
cd mixtral-datasets
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin
|
||||||
|
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx
|
||||||
|
|
||||||
|
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
MOCK_DATA=0 # Train on real data
|
||||||
|
|
||||||
|
DATA_DIR="<path-to>/mixtral-datasets" # Change to where your dataset is stored
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container.
|
||||||
|
|
||||||
|
Multi-node configuration
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
If you're running multi-node training, update the following environment variables. They can
|
||||||
|
also be passed as command line arguments. Refer to the following example configurations.
|
||||||
|
|
||||||
|
* Change ``localhost`` to the master node's hostname:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
MASTER_ADDR="${MASTER_ADDR:-localhost}"
|
||||||
|
|
||||||
|
* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES="${NNODES:-1}"
|
||||||
|
|
||||||
|
* Set the rank of each node (0 for master, 1 for the first worker node, and so on):
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NODE_RANK="${NODE_RANK:-0}"
|
||||||
|
|
||||||
|
* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
|
||||||
|
NFS directory) for multi-node runs:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
|
||||||
|
|
||||||
|
* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
|
||||||
|
inside a Docker container, either install the drivers inside the Docker container or pass the network
|
||||||
|
drivers from the host while creating the Docker container.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
# Specify which RDMA interfaces to use for communication
|
||||||
|
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||||
|
|
||||||
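To see which RDMA devices are actually present before setting ``NCCL_IB_HCA``, you can list them first. This assumes the ``libibverbs`` utilities are installed on the node (or inside the container):

.. code-block:: shell

   # Lists RDMA device names (for example, rdma0 ... rdma7) usable in NCCL_IB_HCA
   ibv_devices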
|
Getting started
|
||||||
|
===============
|
||||||
|
|
||||||
|
The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
|
||||||
|
system performance, conduct training benchmarks, and achieve superior
|
||||||
|
performance for models like Llama, DeepSeek, and Mixtral. This container should not be
|
||||||
|
expected to provide generalized performance across all training workloads. You
|
||||||
|
can expect the container to perform well in the model configurations described in
|
||||||
|
the following section, but other configurations are not validated by AMD.
|
||||||
|
|
||||||
|
.. _amd-megatron-lm-run-training-v255:
|
||||||
|
|
||||||
|
Run training
|
||||||
|
------------
|
||||||
|
|
||||||
|
Use the following example commands to set up the environment, configure
|
||||||
|
:ref:`key options <amd-megatron-lm-benchmark-test-vars-v255>`, and run training on
|
||||||
|
MI300X series accelerators with the AMD Megatron-LM environment.
|
||||||
|
|
||||||
|
Single node training
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
|
To run the training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
|
||||||
|
For example, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 RECOMPUTE=1 SEQ_LENGTH=8192 MBS=2 BS=16 TE_FP8=0 TP=1 PP=1 FSDP=1 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
It is suggested to use ``TP=1`` when FSDP is enabled for higher
|
||||||
|
throughput. FSDP-v2 is not supported with pipeline parallelism, expert
|
||||||
|
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
||||||
|
or FP16.
|
||||||
|
|
||||||
|
Currently, FSDP is only compatible with BF16 precision.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
|
To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the
|
||||||
|
following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
|
For Llama 3.1 8B BF16, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
To run the training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
|
||||||
|
For example, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
It is suggested to use ``TP=1`` when FSDP is enabled for higher
|
||||||
|
throughput. FSDP-v2 is not supported with pipeline parallelism, expert
|
||||||
|
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
||||||
|
or FP16.
|
||||||
|
|
||||||
|
Currently, FSDP is only compatible with BF16 precision.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b
|
||||||
|
|
||||||
|
To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the
|
||||||
|
following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
|
For Llama 2 7B BF16, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
To run the training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
|
||||||
|
For example, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 MBS=7 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
It is suggested to use ``TP=1`` when FSDP is enabled for higher
|
||||||
|
throughput. FSDP-v2 is not supported with pipeline parallelism, expert
|
||||||
|
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
||||||
|
or FP16.
|
||||||
|
|
||||||
|
Currently, FSDP is only compatible with BF16 precision.
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
|
||||||
|
navigate to the Megatron-LM folder and use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
FORCE_BANLANCE=true \
|
||||||
|
RUN_ENV=cluster \
|
||||||
|
MODEL_SIZE=671B \
|
||||||
|
TRAIN_ITERS=50 \
|
||||||
|
SEQ_LEN=4096 \
|
||||||
|
NUM_LAYERS=3 \
|
||||||
|
MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \
|
||||||
|
PR=bf16 \
|
||||||
|
TP=1 PP=1 ETP=1 EP=8 \
|
||||||
|
GEMM_TUNING=1 \
|
||||||
|
NVTE_CK_USES_BWD_V3=1 \
|
||||||
|
USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \
|
||||||
|
GPT_LAYER_IN_TE=true \
|
||||||
|
bash examples/deepseek_v3/train_deepseekv3.sh
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
|
||||||
|
navigate to the Megatron-LM folder and use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||||
|
navigate to the Megatron-LM folder and use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
RECOMPUTE_NUM_LAYERS=0 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=none PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=4096 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x7B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
|
||||||
|
navigate to the Megatron-LM folder and use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
RECOMPUTE_NUM_LAYERS=4 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=full NUM_LAYERS=4 PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=8192 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x22B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh
|
||||||
|
|
||||||
|
Multi-node training
|
||||||
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
To run training on multiple nodes, launch the Docker container on each node.
|
||||||
|
For example, for Llama 3 in a two-node setup (with ``NODE0`` as the master node),
|
||||||
|
use these commands.
|
||||||
|
|
||||||
|
* On the master node ``NODE0``:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
|
* On the worker node ``NODE1``:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
|
||||||
|
|
||||||
|
Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is
|
||||||
|
provided in
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to
|
||||||
|
enable training at scale under a SLURM environment. For example, to run
|
||||||
|
training on 16 nodes, try the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh
|
||||||
|
|
||||||
|
.. _amd-megatron-lm-benchmark-test-vars-v255:
|
||||||
|
|
||||||
|
Key options
|
||||||
|
-----------
|
||||||
|
|
||||||
|
The benchmark tests support the following sets of variables; a combined usage example follows the list.
|
||||||
|
|
||||||
|
``TEE_OUTPUT``
|
||||||
|
``1`` to enable training logs or ``0`` to disable.
|
||||||
|
|
||||||
|
``TE_FP8``
|
||||||
|
``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.
|
||||||
|
|
||||||
|
``GEMM_TUNING``
|
||||||
|
``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
|
||||||
|
|
||||||
|
``USE_FLASH_ATTN``
|
||||||
|
``1`` to enable Flash Attention.
|
||||||
|
|
||||||
|
``FSDP``
|
||||||
|
``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
|
||||||
|
``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.
|
||||||
|
|
||||||
|
``ENABLE_PROFILING``
|
||||||
|
``1`` to enable PyTorch profiling for performance analysis.
|
||||||
|
|
||||||
|
``transformer-impl``
|
||||||
|
``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
|
||||||
|
|
||||||
|
``MODEL_SIZE``
|
||||||
|
``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2, for example.
|
||||||
|
|
||||||
|
``TOTAL_ITERS``
|
||||||
|
The total number of iterations -- ``10`` by default.
|
||||||
|
|
||||||
|
``MOCK_DATA``
|
||||||
|
``1`` to use mock data or ``0`` to use real data you provide.
|
||||||
|
|
||||||
|
``MBS``
|
||||||
|
Micro batch size.
|
||||||
|
|
||||||
|
``BS``
|
||||||
|
Global batch size.
|
||||||
|
|
||||||
|
``TP`` / ``TP_SIZE``
|
||||||
|
Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on.
|
||||||
|
|
||||||
|
``EP`` / ``EP_SIZE``
|
||||||
|
Expert parallel for MoE models.
|
||||||
|
|
||||||
|
``SEQ_LENGTH``
|
||||||
|
Input sequence length.
|
||||||
|
|
||||||
|
``PR``
|
||||||
|
Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
|
||||||
|
|
||||||
|
``AC``
|
||||||
|
Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.
|
||||||
|
|
||||||
|
``NUM_LAYERS``
|
||||||
|
Use a reduced number of layers to run a proxy model.
|
||||||
|
|
||||||
|
``RECOMPUTE_NUM_LAYERS``
|
||||||
|
Number of layers used for checkpointing recompute.
|
||||||
|
|
||||||
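As an illustration of how these options combine, the following sketch runs a single-node Llama 3.1 8B BF16 benchmark with training logs and GEMM tuning enabled. The values mirror the earlier single-node example and should be adjusted for your hardware:

.. code-block:: shell

   TEE_OUTPUT=1 GEMM_TUNING=1 TE_FP8=0 MBS=2 BS=128 TP=1 \
   SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 \
   bash examples/llama/train_llama3.sh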
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`megatron-lm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/megatron-lm`` Docker image.
|
||||||
@@ -11,37 +11,39 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
|
|||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
:stub-columns: 1
|
|
||||||
|
|
||||||
* - Image version
|
* - Image version
|
||||||
- ROCm version
|
- Components
|
||||||
- PyTorch version
|
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - v25.6
|
* - v25.6
|
||||||
- 6.3.4
|
-
|
||||||
- 2.8.0a0+git7d205b2
|
* ROCm 6.3.4
|
||||||
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../pytorch-training>`
|
* :doc:`Documentation <../pytorch-training>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__
|
||||||
|
|
||||||
* - v25.5
|
* - v25.5
|
||||||
- 6.3.4
|
-
|
||||||
- 2.7.0a0+git637433
|
* ROCm 6.3.4
|
||||||
|
* PyTorch 2.7.0a0+git637433
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <pytorch-training-v25.5>`
|
* :doc:`Documentation <pytorch-training-v25.5>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`__
|
||||||
|
|
||||||
* - v25.4
|
* - v25.4
|
||||||
- 6.3.0
|
-
|
||||||
- 2.7.0a0+git637433
|
* ROCm 6.3.0
|
||||||
|
* PyTorch 2.7.0a0+git637433
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <pytorch-training-v25.4>`
|
* :doc:`Documentation <pytorch-training-v25.4>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`__
|
||||||
|
|
||||||
* - v25.3
|
* - v25.3
|
||||||
- 6.3.0
|
-
|
||||||
- 2.7.0a0+git637433
|
* ROCm 6.3.0
|
||||||
|
* PyTorch 2.7.0a0+git637433
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <pytorch-training-v25.3>`
|
* :doc:`Documentation <pytorch-training-v25.3>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`_
|
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`__
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ software components to accelerate training workloads:
|
|||||||
| Triton | 3.1 |
|
| Triton | 3.1 |
|
||||||
+--------------------------+--------------------------------+
|
+--------------------------+--------------------------------+
|
||||||
|
|
||||||
.. _amd-pytorch-training-model-support:
|
.. _amd-pytorch-training-model-support-v253:
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
@@ -80,7 +80,8 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
|
|||||||
|
|
||||||
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
|
||||||
|
|
||||||
See :ref:`mi300x-disable-numa` for more information.
|
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
|
||||||
|
for more information.
|
||||||
|
|
||||||
Environment setup
|
Environment setup
|
||||||
=================
|
=================
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ software components to accelerate training workloads:
|
|||||||
| Triton | 3.1 |
|
| Triton | 3.1 |
|
||||||
+--------------------------+--------------------------------+
|
+--------------------------+--------------------------------+
|
||||||
|
|
||||||
.. _amd-pytorch-training-model-support:
|
.. _amd-pytorch-training-model-support-v254:
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
@@ -61,7 +61,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
|
|||||||
Some models, such as Llama 3, require an external license agreement through
|
Some models, such as Llama 3, require an external license agreement through
|
||||||
a third party (for example, Meta).
|
a third party (for example, Meta).
|
||||||
|
|
||||||
.. _amd-pytorch-training-performance-measurements:
|
.. _amd-pytorch-training-performance-measurements-v254:
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
.. meta::
|
.. meta::
|
||||||
:description: How to train a model using PyTorch for ROCm.
|
:description: How to train a model using PyTorch for ROCm.
|
||||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||||
@@ -38,7 +40,7 @@ software components to accelerate training workloads:
|
|||||||
| Triton | 3.2.0 |
|
| Triton | 3.2.0 |
|
||||||
+--------------------------+--------------------------------+
|
+--------------------------+--------------------------------+
|
||||||
|
|
||||||
.. _amd-pytorch-training-model-support:
|
.. _amd-pytorch-training-model-support-v255:
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
@@ -62,7 +64,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
|
|||||||
Some models, such as Llama 3, require an external license agreement through
|
Some models, such as Llama 3, require an external license agreement through
|
||||||
a third party (for example, Meta).
|
a third party (for example, Meta).
|
||||||
|
|
||||||
.. _amd-pytorch-training-performance-measurements:
|
.. _amd-pytorch-training-performance-measurements-v255:
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
@@ -4,6 +4,7 @@ myst:
   "description": "Learn about system settings and performance tuning for RDNA2-based GPUs."
   "keywords": "RDNA2, workstation, desktop, BIOS, installation, Radeon, pro, v620, w6000"
 ---
+:orphan:

 # AMD RDNA2 system optimization

@@ -1,3 +1,5 @@
+:orphan:
+
 .. meta::
   :description: How to configure MI300X accelerators to fully leverage their capabilities and achieve optimal performance.
   :keywords: ROCm, AI, machine learning, MI300X, LLM, usage, tutorial, optimization, tuning
@@ -12,8 +14,7 @@ accelerators. They include detailed instructions on system settings and
 application tuning suggestions to help you fully leverage the capabilities of
 these accelerators, thereby achieving optimal performance.

-* :doc:`../../rocm-for-ai/inference/vllm-benchmark`
-* :doc:`../../rocm-for-ai/inference-optimization/workload`
+* :doc:`/how-to/rocm-for-ai/inference-optimization/workload`
 * `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_


@@ -215,9 +215,9 @@ sphinx==8.1.3
 # sphinx-copybutton
 # sphinx-design
 # sphinx-external-toc
+# sphinx-last-updated-by-git
 # sphinx-notfound-page
 # sphinx-reredirects
-# sphinx-sitemap
 # sphinxcontrib-datatemplates
 # sphinxcontrib-runcmd
 sphinx-book-theme==1.1.4
@@ -228,11 +228,13 @@ sphinx-design==0.6.1
 # via rocm-docs-core
 sphinx-external-toc==1.0.1
 # via rocm-docs-core
+sphinx-last-updated-by-git==0.3.8
+# via sphinx-sitemap
 sphinx-notfound-page==1.1.0
 # via rocm-docs-core
 sphinx-reredirects==0.1.6
 # via -r requirements.in
-sphinx-sitemap==2.6.0
+sphinx-sitemap==2.7.2
 # via -r requirements.in
 sphinxcontrib-applehelp==2.0.0
 # via sphinx
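
Note: these pins are pip-compile output, so each "# via" comment records which package pulls in the dependency; sphinx-last-updated-by-git enters the lockfile because the sphinx-sitemap 2.7.2 bump requires it. A minimal sketch of how such pins are typically consumed and regenerated; the docs/sphinx paths are assumptions for illustration, not taken from this diff:

    # Install the pinned documentation toolchain (path assumed).
    pip install -r docs/sphinx/requirements.txt

    # Regenerate the pins from the unpinned inputs with pip-tools (paths assumed).
    pip install pip-tools
    pip-compile docs/sphinx/requirements.in --output-file docs/sphinx/requirements.txt
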
@@ -98,7 +98,7 @@ System Management
 .. csv-table::
   :header: "Component", "Description"

-  ":doc:`AMD SMI <amdsmi:index>`", "C library for Linux that provides a user space interface for applications to monitor and control AMD devices"
+  ":doc:`AMD SMI <amdsmi:index>`", "System management interface to control AMD GPU settings, monitor performance, and retrieve device and process information"
   ":doc:`ROCm Data Center Tool <rdc:index>`", "Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments"
   ":doc:`rocminfo <rocminfo:index>`", "Reports system information"
   ":doc:`ROCm SMI <rocm_smi_lib:index>`", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"
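
Note: only the AMD SMI description changes above; the tools themselves are untouched. For orientation, a hedged sketch of how the listed components are commonly invoked from the command line (subcommand names can vary between ROCm releases):

    # Report system and GPU agent information.
    rocminfo

    # Summarize GPU state with the ROCm SMI CLI.
    rocm-smi

    # Enumerate devices and watch utilization with the AMD SMI CLI.
    amd-smi list
    amd-smi monitor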