Compare commits
35 Commits

Author SHA1 Message Date
Matt Williams
b3211cc6fa Updating broken link (#5258) 2025-09-05 16:02:13 -04:00
Peter Park
5853468fca Update PyTorch training benchmark docker doc to 25.7 (#5255) (#5260)
* Update PyTorch training benchmark docker doc to 25.7

* update .wordlist.txt

* update conf.py

* update data sheet

* fix sphinx warnings
2025-09-05 12:14:09 -04:00
Matt Williams
245c95690f Merge pull request #5228 from ROCm/cherry-pick-link-fix
Fix hyperlink syntax
2025-08-26 11:42:57 -04:00
Dominic Widdows
39c1b926f6 Fix hyperlink syntax 2025-08-26 11:35:30 -04:00
Jeffrey Novotny
3c3847f9f7 Merge pull request #5224 from amd-jnovotny/dlf-matt-docs643
Deep learning frameworks edits for scale (#5189)
2025-08-22 11:56:27 -04:00
Matt Williams
249bd177ec Deep learning frameworks edits for scale (#5189)
* Deep learning frameworks edits for scale

Based on https://ontrack-internal.amd.com/browse/ROCDOC-1809

* update table

table

* leo comments

* formatting

* format

* update table based on feedback

* header

* Update machine learning page

* headers

* Apply suggestions from code review

Co-authored-by: anisha-amd <anisha.sankar@amd.com>

* Update .wordlist.txt

* formatting

* Update docs/how-to/deep-learning-rocm.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

---------

Co-authored-by: Matt Williams <Matt.Williams+amdeng@amd.com>
Co-authored-by: anisha-amd <anisha.sankar@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
(cherry picked from commit 1d42f7cc62)
2025-08-22 11:52:32 -04:00
Peter Park
b2ee8d4b2e docs: Add Primus (Megatron) training Docker documentation (#5218) (#5222)
(cherry picked from commit 98029db4ee)
2025-08-22 09:02:40 -04:00
Peter Park
3f834cf520 Fix documented VRAM for Radeon AI Pro R9700 (#5203) (#5204)
(cherry picked from commit c154b7e0a3)
2025-08-18 10:20:22 -04:00
Peter Park
70ba866c5b vLLM inference benchmark doc: add missing data field (#5199) (#5200)
(cherry picked from commit 55d0a88ec5)
2025-08-15 13:52:51 -04:00
Peter Park
320ec4669a [docs/6.4.3] Update vLLM benchmark doc for 20250812 Docker release (#5196) (#5198)
(cherry picked from commit 7ee22790ce)
2025-08-14 15:51:58 -04:00
anisha-amd
c9bd93b537 [Docs] 6.4.3: compatibility matrix frameworks support update (#5186) 2025-08-12 14:25:45 -04:00
Peter Park
a060550bcd Add Hunyuan Video to PyTorch inference benchmark models doc (#5094)
(cherry picked from commit 80f7dc79b9)
2025-08-12 11:59:51 -04:00
Parag Bhandari
c92cbaee66 Merge branch 'roc-6.4.x' into docs/6.4.3 2025-08-08 08:49:53 -04:00
Parag Bhandari
c84afacc8d Merge branch 'develop' into roc-6.4.x 2025-08-08 08:49:39 -04:00
Parag Bhandari
843fd1b3fb Merge branch 'roc-6.4.x' into docs/6.4.3 2025-08-08 07:21:31 -04:00
Parag Bhandari
82221c4e2d Merge branch 'develop' into roc-6.4.x 2025-08-08 07:18:53 -04:00
pbhandar-amd
d0ebe126e7 Sync develop into docs/6.4.3 2025-08-07 09:07:38 -04:00
pbhandar-amd
74610893a9 Merge pull request #5154 from ROCm/amd/pbhandar/merge_from_develop
Merge develop into docs/6.4.3
2025-08-06 11:25:50 -04:00
Parag Bhandari
afe3e21cad Merge branch 'develop' into docs/6.4.3 2025-08-05 16:24:07 -04:00
Alex Xu
ae2440772f upgrade rocm-docs-core to 1.22.0 2025-07-31 16:41:40 -04:00
anisha-amd
61f970a24d Cherry pick into 6.4.3 - Docs: Adding frameworks compatibility for Megablocks and Taichi (#5138) 2025-07-31 14:02:09 -04:00
Alex Xu
85a1682573 Merge branch 'develop' into roc-6.4.x 2025-07-21 17:22:49 -04:00
Alex Xu
87c6e320b4 Merge branch 'develop' into roc-6.4.x 2025-07-21 15:52:30 -04:00
ammallya
b50948fe6b Fix for rocrsamples and rocr_debug_agent (#4863)
* Fix for rocrsamples

* Fix for rocr_debug_agent
2025-05-30 16:27:29 -07:00
ammallya
91407405a9 Changed naming convention for hip (#4837)
* Changed naming convention for hip

* Changed naming convention for hip
2025-05-29 10:19:28 -07:00
ammallya
8f23f63a6b Fix for tests (#4818)
* Fix for RBT

* Fix for roctst and kfd test
2025-05-27 17:38:48 -07:00
Alex Xu
11747aaadc Merge branch 'develop' into roc-6.4.x 2025-05-21 15:04:02 -04:00
Alex Xu
1088beefe5 Merge branch 'develop' into roc-6.4.x 2025-05-21 12:27:13 -04:00
Alex Xu
b7988925a5 Merge branch 'develop' into roc-6.4.x 2025-05-21 12:25:30 -04:00
chiranjeevipattigidi
89dafa6232 Update packages - remove broken packages (#4758)
* Update envsetup.sh HIP_ON_ROCclr_ROOT path to hip and remove

aqlprofiletest

* Update packages - remove broken packages
2025-05-21 09:06:39 -07:00
chiranjeevipattigidi
8054852dad Update envsetup.sh HIP_ON_ROCclr_ROOT path to hip and remove (#4755)
aqlprofiletest
2025-05-20 07:59:07 -07:00
ammallya
542d7813ce Removing aqlprofiletest 2025-04-14 15:26:24 -07:00
ammallya
bc1ffe4fcb bypass tests 2025-04-14 13:41:34 -07:00
ammallya
09997c68bb Removing kfd test 2025-04-14 12:55:13 -07:00
ammallya
42bc3501ac Merge pull request #4623 from ammallya/roc-6.4.x
Rebasing branch 6.4.x
2025-04-14 11:42:06 -07:00
56 changed files with 1510 additions and 1154 deletions

View File

@@ -1,29 +1,10 @@
parameters:
- name: componentName
type: string
default: hip_clr_combined
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -54,24 +35,93 @@ parameters:
type: object
default:
- llvm-project
- ROCR-Runtime
# hip and clr are tightly-coupled
# run this same template for both repos
# any changes for clr should just trigger HIP pipeline
# similarly for hipother repo, for Nvidia backend
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, platform: amd }
- { os: ubuntu2204, packageManager: apt, platform: nvidia }
- { os: almalinux8, packageManager: dnf, platform: amd }
- { os: almalinux8, packageManager: dnf, platform: nvidia }
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
# HIP with AMD backend
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_${{ job.os }}_${{ job.platform }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
- job: hip_clr_combined_${{ job.os }}_amd
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
# checkout triggering repo (either HIP or clr)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# if this is triggered by HIP repo, matching repo is clr
# if this is triggered by clr repo, matching repo is HIP
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: matching_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: hipother_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependenciesAMD }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
# compile clr
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: clr
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-DHIP_PLATFORM=amd
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-DCLR_BUILD_HIP=ON
-DCLR_BUILD_OCL=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
artifactName: amd
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: amd
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: amd
# HIP with Nvidia backend
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hip_clr_combined_${{ job.os }}_nvidia
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
@@ -90,45 +140,49 @@ jobs:
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
# full checkout of rocm-systems superrepo, we need clr, hip, and hipother
# checkout triggering repo (either HIP or clr)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
# if this is triggered by HIP repo, matching repo is clr
# if this is triggered by clr repo, matching repo is HIP
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: matching_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: hipother_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependenciesNvidia }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
${{ if eq(job.platform, 'amd') }}:
dependencyList: ${{ parameters.rocmDependenciesAMD }}
${{ elseif eq(job.platform, 'nvidia') }}:
dependencyList: ${{ parameters.rocmDependenciesNvidia }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- script: 'ls -1R $(Agent.BuildDirectory)/rocm'
displayName: 'Artifact listing'
# compile clr
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: clr
cmakeBuildDir: $(Agent.BuildDirectory)/s/projects/clr/build
cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/clr
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-DHIP_PLATFORM=nvidia
-DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-DHIP_COMMON_DIR=$(Agent.BuildDirectory)/s/projects/hip
-DHIPNV_DIR=$(Agent.BuildDirectory)/s/projects/hipother/hipnv
-DHIP_PLATFORM=${{ job.platform }}
-DCLR_BUILD_HIP=ON
-DCLR_BUILD_OCL=ON
-DCLR_BUILD_OCL=OFF
-DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
artifactName: ${{ job.platform }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: ${{ job.platform }}
artifactName: nvidia
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: nvidia
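For context on the consolidated job matrix above: Azure DevOps compile-time expressions fan a platform key out into per-platform jobs and steps before the pipeline runs. A minimal sketch of that pattern, with hypothetical job and step names:

parameters:
- name: jobMatrix
  type: object
  default:
    buildJobs:
    - { os: ubuntu2204, platform: amd }
    - { os: ubuntu2204, platform: nvidia }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  # Expands to one job per matrix entry, e.g. example_ubuntu2204_amd.
  - job: example_${{ job.os }}_${{ job.platform }}
    steps:
    - ${{ if eq(job.platform, 'amd') }}:
      - script: echo "use the AMD dependency list"
    - ${{ elseif eq(job.platform, 'nvidia') }}:
      - script: echo "use the Nvidia dependency list"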

View File

@@ -123,7 +123,7 @@ jobs:
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
pool: ${{ variables.HIGH_BUILD_POOL }}
workspace:
clean: all
steps:
@@ -131,7 +131,6 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -150,7 +149,6 @@ jobs:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: Bash@3
displayName: Build and install other dependencies
retryCountOnTaskFailure: 3
inputs:
targetType: inline
workingDirectory: $(Agent.BuildDirectory)/s
@@ -212,7 +210,6 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -231,7 +228,6 @@ jobs:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: Bash@3
displayName: Build and install other dependencies
retryCountOnTaskFailure: 3
inputs:
targetType: inline
workingDirectory: $(Agent.BuildDirectory)/s

View File

@@ -171,7 +171,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
retryCountOnTaskFailure: 3
inputs:
itemPattern: '**/*${{ job.os }}*.whl'
targetPath: $(Agent.BuildDirectory)

View File

@@ -35,13 +35,9 @@ parameters:
- ccache
- gfortran
- git
- libboost-filesystem-dev
- libboost-program-options-dev
- libdrm-dev
- liblapack-dev
- libmsgpack-dev
- libnuma-dev
- libopenblas-dev
- ninja-build
- python3-pip
- python3-venv
@@ -50,12 +46,6 @@ parameters:
default:
- joblib
- "packaging>=22.0"
- pyyaml
- msgpack
- simplejson
- ujson
- orjson
- yappi
- --upgrade
- name: rocmDependencies
type: object
@@ -91,12 +81,12 @@ parameters:
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx90a }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1100 }
#- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { pool: rocm-ci_ultra_build_pool, os: almalinux8, packageManager: dnf, target: gfx942 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx90a }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1201 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1100 }
#- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1030 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
@@ -179,7 +169,7 @@ jobs:
cd $(Agent.BuildDirectory)/temp-deps
# position-independent LAPACK is required for almalinux8 builds
cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
make -j
make
sudo make install
- script: |
mkdir -p $(CCACHE_DIR)
@@ -205,11 +195,7 @@ jobs:
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-DCMAKE_C_COMPILER_LAUNCHER=ccache
-DAMDGPU_TARGETS=${{ job.target }}
-DGPU_TARGETS=${{ job.target }}
-DBUILD_CLIENTS_TESTS=ON
-DHIPBLASLT_ENABLE_ROCROLLER=ON
-DHIPBLASLT_ENABLE_FETCH=ON
-DHIPBLASLT_ENABLE_BLIS=OFF
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
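The position-independent LAPACK comment in the hunk above reflects the usual constraint that static archives later linked into shared libraries must be built with -fPIC. A hedged sketch of such a dependency step, using the paths shown in the diff:

- task: Bash@3
  displayName: Build position-independent dependencies (sketch)
  inputs:
    targetType: inline
    script: |
      # A static LAPACK that is linked into shared ROCm libraries needs -fPIC,
      # hence CMAKE_POSITION_INDEPENDENT_CODE=ON.
      mkdir -p $(Agent.BuildDirectory)/temp-deps
      cd $(Agent.BuildDirectory)/temp-deps
      cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON \
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
            $(Agent.BuildDirectory)/s/deps
      make
      sudo make install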

View File

@@ -69,7 +69,7 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
#- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }

View File

@@ -113,8 +113,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# ignore sparse checkout for monorepo case, we want access to hipblaslt directory
# sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -131,10 +130,7 @@ jobs:
displayName: Create temp folder for external dependencies
# hipSPARSELt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipSPARSELt/blob/develop/deps/CMakeLists.txt
- ${{ if ne(parameters.sparseCheckoutDir, '') }}:
script: cmake $(Pipeline.Workspace)/s/projects/hipsparselt/deps
${{ else }}:
script: cmake $(Pipeline.Workspace)/s/deps
- script: cmake $(Pipeline.Workspace)/s/deps
displayName: Configure hipSPARSELt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: make
@@ -158,11 +154,7 @@ jobs:
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_USE_LOCAL_TENSILE=OFF
-GNinja
${{ if ne(parameters.sparseCheckoutDir, '') }}:
cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
cmakeBuildDir: $(Build.SourcesDirectory)/projects/hipsparselt
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
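The conditional removed in the hunk above used compile-time template expressions to pick the deps path depending on whether a sparse checkout directory was set. A generic sketch of that pattern, with the paths from the removed lines:

- task: Bash@3
  displayName: Configure hipSPARSELt external dependencies (sketch)
  inputs:
    targetType: inline
    workingDirectory: $(Pipeline.Workspace)/deps
    # Template expressions resolve before the run starts, so exactly one
    # of the two script lines is inserted into the final step.
    ${{ if ne(parameters.sparseCheckoutDir, '') }}:
      script: cmake $(Pipeline.Workspace)/s/projects/hipsparselt/deps
    ${{ else }}:
      script: cmake $(Pipeline.Workspace)/s/deps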

View File

@@ -30,7 +30,7 @@ parameters:
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
# - { os: ubuntu2404, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
jobs:

View File

@@ -76,7 +76,7 @@ jobs:
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
pool: ${{ variables.HIGH_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -84,12 +84,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
#- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
#- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
@@ -115,13 +115,6 @@ parameters:
# buildDependsOn:
# - rocBLAS_build
# - rocPRIM_build
# temporary rocblas->hipblas downstream path while the SOLVERs are disabled
- hipBLAS:
name: hipBLAS
sparseCheckoutDir: projects/hipblas
skipUnifiedBuild: 'false'
buildDependsOn:
- rocBLAS_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:

View File

@@ -190,7 +190,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
retryCountOnTaskFailure: 3
inputs:
itemPattern: '**/*.whl'
targetPath: $(Agent.BuildDirectory)

View File

@@ -74,12 +74,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
#- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
#- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }

View File

@@ -73,7 +73,7 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
#- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }

View File

@@ -70,7 +70,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
pool: ${{ variables.HIGH_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -36,10 +36,8 @@ parameters:
- gfortran
- git
- libdrm-dev
- liblapack-dev
- libmsgpack-dev
- libnuma-dev
- libopenblas-dev
- ninja-build
- python3-pip
- python3-venv
@@ -48,8 +46,6 @@ parameters:
default:
- joblib
- "packaging>=22.0"
- pytest
- pytest-cmake
- --upgrade
- name: rocmDependencies
type: object
@@ -102,12 +98,12 @@ jobs:
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -138,26 +134,12 @@ jobs:
rocm-libraries | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
rocm-libraries | ${{ job.os }} | ${{ job.target }}
rocm-libraries | ${{ job.os }}
- task: Bash@3
displayName: Add paths for CMake and Python site-packages binaries
inputs:
targetType: inline
script: |
USER_BASE=$(python3 -m site --user-base)
echo "##vso[task.prependpath]$USER_BASE/bin"
echo "##vso[task.setvariable variable=PytestCmakePath]$USER_BASE/share/Pytest/cmake"
displayName: Set cmake configure paths
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-D CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor;$(PytestCmakePath)
-D CMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
-D CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-D CMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-D CMAKE_CXX_COMPILER_LAUNCHER=ccache
-D CMAKE_C_COMPILER_LAUNCHER=ccache
-G Ninja
-DROCM_LIBRARIES_SUPERBUILD=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
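The site-packages step in the hunk above discovers pip's per-user install prefix at runtime and hands it to later tasks via logging commands. A minimal sketch of that lookup, as it appears in the diff:

- task: Bash@3
  displayName: Add paths for CMake and Python site-packages binaries
  inputs:
    targetType: inline
    script: |
      # python3 -m site --user-base prints pip's per-user prefix (e.g. ~/.local);
      # its bin directory is prepended to PATH for all subsequent tasks, and the
      # Pytest CMake package path is exported as a pipeline variable.
      USER_BASE=$(python3 -m site --user-base)
      echo "##vso[task.prependpath]$USER_BASE/bin"
      echo "##vso[task.setvariable variable=PytestCmakePath]$USER_BASE/share/Pytest/cmake"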

View File

@@ -1,29 +1,10 @@
parameters:
- name: componentName
type: string
default: rocprofiler-compute
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -97,10 +78,6 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocprofiler_compute_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -117,19 +94,15 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -138,83 +111,78 @@ jobs:
# pipModules: ${{ parameters.pipModules }}
# gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocprofiler_compute_test_${{ job.target }}
timeoutInMinutes: 120
dependsOn: rocprofiler_compute_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: PYTHON_VERSION
value: 3.10
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- task: Bash@3
displayName: Add ROCm binaries to PATH
inputs:
targetType: inline
script: |
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_HIP_ARCHITECTURES=${{ job.target }}
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DENABLE_TESTS=ON
-DINSTALL_TESTS=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocprofiler_compute_test_${{ job.target }}
timeoutInMinutes: 120
dependsOn: rocprofiler_compute_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: PYTHON_VERSION
value: 3.10
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- task: Bash@3
displayName: Add ROCm binaries to PATH
inputs:
targetType: inline
script: |
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_HIP_ARCHITECTURES=${{ job.target }}
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DENABLE_TESTS=ON
-DINSTALL_TESTS=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofiler-compute
testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
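As a worked example of the test-gating condition above: for a matrix entry with target gfx942 and aggregatePipeline false, the compile-time expressions resolve to concrete variable names before the run starts, roughly as sketched here:

# Hypothetical expansion for job.target == gfx942:
condition:
  and(succeeded(),
    eq(variables['ENABLE_GFX942_TESTS'], 'true'),
    not(containsValue(split(variables['DISABLED_GFX942_TESTS'], ','), variables['Build.DefinitionName'])),
    eq(False, False)
  )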

View File

@@ -8,22 +8,6 @@ parameters:
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -86,10 +70,6 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -114,7 +94,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
@@ -129,8 +108,6 @@ jobs:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
@@ -138,7 +115,6 @@ jobs:
extraBuildFlags: >-
-DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DENABLE_LDCONFIG=OFF
-DUSE_PROF_API=1
@@ -146,13 +122,10 @@ jobs:
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -166,68 +139,63 @@ jobs:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
- name: LD_LIBRARY_PATH
value: $(Agent.BuildDirectory)/rocm/lib/rocprofiler:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1/test:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofilerV1
testDir: $(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1
testExecutable: ./run.sh
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofilerV2
testDir: $(Agent.BuildDirectory)/rocm
testExecutable: share/rocprofiler/tests/runUnitTests
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
- name: LD_LIBRARY_PATH
value: $(Agent.BuildDirectory)/rocm/lib/rocprofiler:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1/test:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofilerV1
testDir: $(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1
testExecutable: ./run.sh
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofilerV2
testDir: $(Agent.BuildDirectory)/rocm
testExecutable: share/rocprofiler/tests/runUnitTests
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}

View File

@@ -8,22 +8,6 @@ parameters:
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -81,10 +65,6 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -107,7 +87,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -115,8 +94,6 @@ jobs:
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
# the linker flags will not affect ubuntu2204 builds as the paths do not exist
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
@@ -132,13 +109,10 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -149,57 +123,53 @@ jobs:
# gpuTarget: ${{ job.target }}
# registerROCmPackages: true
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
testParameters: ''
testDir: $(Agent.BuildDirectory)
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: roctracer
testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
testParameters: ''
testDir: $(Agent.BuildDirectory)
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
registerROCmPackages: true

View File

@@ -1,67 +0,0 @@
parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: fmtlibVersion
type: string
default: ''
- name: aptPackages
type: object
default:
- cmake
- git
- ninja-build
- libfmt-dev
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt}
- { os: almalinux8, packageManager: dnf}
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: fmtlib_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: Clone fmtlib ${{ parameters.fmtlibVersion }}
inputs:
targetType: inline
script: git clone https://github.com/fmtlib/fmt.git -b ${{ parameters.fmtlibVersion }}
workingDirectory: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
cmakeBuildDir: $(Agent.BuildDirectory)/fmt/build
cmakeSourceDir: $(Agent.BuildDirectory)/fmt
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DFMT_SYSTEM_HEADERS=ON
-DFMT_INSTALL=ON
-DFMT_TEST=OFF
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}

View File

@@ -1,71 +0,0 @@
parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: spdlogVersion
type: string
default: ''
- name: aptPackages
type: object
default:
- cmake
- git
- ninja-build
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt}
- { os: almalinux8, packageManager: dnf}
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: spdlog_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- fmtlib
- task: Bash@3
displayName: Clone spdlog ${{ parameters.spdlogVersion }}
inputs:
targetType: inline
script: git clone https://github.com/gabime/spdlog.git -b ${{ parameters.spdlogVersion }}
workingDirectory: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
cmakeBuildDir: $(Agent.BuildDirectory)/spdlog/build
cmakeSourceDir: $(Agent.BuildDirectory)/spdlog
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/vendor
-DCMAKE_BUILD_TYPE=Release
-DSPDLOG_USE_STD_FORMAT=OFF
-DSPDLOG_FMT_EXTERNAL_HO=ON
-DSPDLOG_INSTALL=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}

View File

@@ -397,7 +397,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
retryCountOnTaskFailure: 3
inputs:
itemPattern: '**/*.whl'
targetPath: $(Agent.BuildDirectory)

View File

@@ -93,7 +93,7 @@ schedules:
jobs:
- ${{ each job in parameters.jobList }}:
- job: nightly_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
timeoutInMinutes: 90
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -226,7 +226,6 @@ jobs:
cat Dockerfile
- task: Docker@2
displayName: Build and upload Docker image
retryCountOnTaskFailure: 3
inputs:
containerRegistry: ContainerService3
repository: 'nightly-${{ job.os }}-${{ job.target }}'

View File

@@ -1,23 +0,0 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml
parameters:
- name: fmtlibVersion
type: string
default: "11.1.3"
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_DEPENDENCIES_PATH }}/fmtlib.yml
parameters:
fmtlibVersion: ${{ parameters.fmtlibVersion }}

View File

@@ -1,23 +0,0 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml
parameters:
- name: spdlogVersion
type: string
default: "v1.15.1"
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_DEPENDENCIES_PATH }}/spdlog.yml
parameters:
spdlogVersion: ${{ parameters.spdlogVersion }}

View File

@@ -24,12 +24,8 @@ parameters:
steps:
- task: DownloadPipelineArtifact@2
displayName: Download ${{ parameters.componentName }}
retryCountOnTaskFailure: 3
inputs:
${{ if eq(parameters.componentName, 'clr') }}:
itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*amd*' # filter out nvidia clr artifacts
${{ else }}:
itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
targetPath: '$(Pipeline.Workspace)/d'
allowPartiallySucceededBuilds: true
${{ if parameters.aggregatePipeline }}:

View File

@@ -10,7 +10,6 @@ steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
- task: Bash@3
displayName: 'Register AMDGPU & ROCm repos (apt)'
retryCountOnTaskFailure: 3
inputs:
targetType: inline
script: |
@@ -21,8 +20,7 @@ steps:
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt update
- task: Bash@3
displayName: 'APT update and install packages'
retryCountOnTaskFailure: 3
displayName: 'sudo apt-get update'
inputs:
targetType: inline
script: |
@@ -30,6 +28,15 @@ steps:
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update && \
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install && \
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
- task: Bash@3
displayName: 'sudo apt-get fix'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
- ${{ if gt(length(parameters.aptPackages), 0) }}:
- task: Bash@3
displayName: 'sudo apt-get install ...'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
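As a worked, hypothetical example of the gated install step above: with aptPackages set to [cmake, git, ninja-build], length() is greater than zero, so the step is inserted and join renders the list into the command line:

- task: Bash@3
  displayName: 'sudo apt-get install ...'
  inputs:
    targetType: inline
    # ${{ join(' ', parameters.aptPackages) }} rendered at expansion time:
    script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install cmake git ninja-build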

View File

@@ -5,28 +5,51 @@ parameters:
steps:
- task: Bash@3
displayName: Download and install aqlprofile
retryCountOnTaskFailure: 3
displayName: Get aqlprofile package name
inputs:
targetType: inline
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
${{ if eq(parameters.os, 'almalinux8') }}:
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1)
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
- task: Bash@3
displayName: 'Download aqlprofile'
inputs:
targetType: inline
workingDirectory: '$(Pipeline.Workspace)'
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
${{ if eq(parameters.os, 'almalinux8') }}:
script: wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$(packageName)
- task: Bash@3
displayName: 'Extract aqlprofile'
inputs:
targetType: inline
workingDirectory: '$(Pipeline.Workspace)'
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R $(packageName) hsa-amd-aqlprofile
${{ if eq(parameters.os, 'almalinux8') }}:
script: |
mkdir hsa-amd-aqlprofile
sudo dnf -y install rpm-build cpio
rpm2cpio $(packageName) | (cd hsa-amd-aqlprofile && cpio -idmv)
- task: Bash@3
displayName: 'Copy aqlprofile files'
inputs:
targetType: inline
workingDirectory: $(Agent.BuildDirectory)
script: |
set -e
if [ "${{ parameters.os }}" = "ubuntu2204" ]; then
packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb") && \
wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$packageName && \
mkdir -p hsa-amd-aqlprofile && \
dpkg-deb -R $packageName hsa-amd-aqlprofile
elif [ "${{ parameters.os }}" = "almalinux8" ]; then
sudo dnf -y install rpm-build cpio && \
packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1) && \
wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$packageName && \
mkdir -p hsa-amd-aqlprofile && \
rpm2cpio $packageName | (cd hsa-amd-aqlprofile && cpio -idmv)
else
echo "Unsupported OS: ${{ parameters.os }}"
exit 1
fi && \
mkdir -p $(Agent.BuildDirectory)/rocm && \
cp -R hsa-amd-aqlprofile/opt/rocm-*/* $(Agent.BuildDirectory)/rocm && \
rm -rf hsa-amd-aqlprofile $packageName
mkdir -p $(Agent.BuildDirectory)/rocm
cp -R hsa-amd-aqlprofile/opt/rocm-*/* $(Agent.BuildDirectory)/rocm
workingDirectory: '$(Pipeline.Workspace)'
- task: Bash@3
displayName: 'Clean up aqlprofile'
inputs:
targetType: inline
script: rm -rf hsa-amd-aqlprofile $(packageName)
workingDirectory: '$(Pipeline.Workspace)'
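The split into separate tasks above relies on the ##vso[task.setvariable] logging command to pass the package name between tasks in the same job. A minimal sketch of that handoff, with a hypothetical value:

- task: Bash@3
  displayName: Set a pipeline variable
  inputs:
    targetType: inline
    script: |
      # Later tasks in the same job can read this as $(packageName).
      packageName="hsa-amd-aqlprofile_example.deb"   # hypothetical value
      echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
- task: Bash@3
  displayName: Use the variable in a later task
  inputs:
    targetType: inline
    script: echo "downloading $(packageName)"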

View File

@@ -54,13 +54,11 @@ parameters:
libfftw3-dev: fftw-devel
libfmt-dev: fmt-devel
libgmp-dev: gmp-devel
liblapack-dev: lapack-devel
liblzma-dev: xz-devel
libmpfr-dev: mpfr-devel
libmsgpack-dev: msgpack-devel
libncurses5-dev: ncurses-devel
libnuma-dev: numactl-devel
libopenblas-dev: openblas-devel
libopenmpi-dev: openmpi-devel
libpci-dev: libpciaccess-devel
libssl-dev: openssl-devel
@@ -89,7 +87,6 @@ steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
- task: Bash@3
displayName: 'Register AMDGPU & ROCm repos (dnf)'
retryCountOnTaskFailure: 3
inputs:
targetType: inline
script: |
@@ -110,13 +107,12 @@ steps:
sudo dnf makecache
- task: Bash@3
displayName: 'Install base dnf packages'
retryCountOnTaskFailure: 3
inputs:
targetType: inline
script: |
sudo dnf config-manager --set-enabled powertools
# rpm fusion free repo for some dependencies
sudo dnf config-manager --set-enabled powertools && \
sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm && \
sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm
sudo dnf -y install ${{ join(' ', parameters.basePackages) }}
- task: Bash@3
displayName: 'Check gcc environment'
@@ -130,7 +126,6 @@ steps:
g++ -print-file-name=libstdc++.so
- task: Bash@3
displayName: 'Set python 3.11 as default'
retryCountOnTaskFailure: 3
inputs:
targetType: inline
script: |
@@ -145,20 +140,18 @@ steps:
- ${{ if eq(pkg, 'ninja-build') }}:
- task: Bash@3
displayName: 'Install ninja 1.11.1'
retryCountOnTaskFailure: 3
inputs:
targetType: inline
script: |
sudo dnf -y install unzip && \
curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip && \
unzip ninja-linux.zip && \
sudo mv ninja /usr/local/bin/ninja && \
sudo chmod +x /usr/local/bin/ninja && \
curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
sudo dnf -y install unzip
unzip ninja-linux.zip
sudo mv ninja /usr/local/bin/ninja
sudo chmod +x /usr/local/bin/ninja
echo "##vso[task.prependpath]/usr/local/bin"
- ${{ if ne(parameters.aptToDnfMap[pkg], '') }}:
- task: Bash@3
displayName: 'dnf install ${{ parameters.aptToDnfMap[pkg] }}'
retryCountOnTaskFailure: 3
inputs:
targetType: inline
script: |

View File

@@ -27,7 +27,6 @@ steps:
- ${{ if gt(length(parameters.pipModules), 0) }}:
- task: Bash@3
displayName: 'pip install ...'
retryCountOnTaskFailure: 3
inputs:
targetType: inline
script: python3 -m pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}

View File

@@ -47,8 +47,8 @@ parameters:
developBranch: aomp-dev
hasGpuTarget: false
clr:
pipelineId: 335
developBranch: develop
pipelineId: 145
developBranch: amd-staging
hasGpuTarget: false
composable_kernel:
pipelineId: 86
@@ -59,8 +59,8 @@ parameters:
developBranch: rocm
hasGpuTarget: false
HIP:
pipelineId: 335
developBranch: develop
pipelineId: 93
developBranch: amd-staging
hasGpuTarget: false
hip-tests:
pipelineId: 233
@@ -203,8 +203,8 @@ parameters:
developBranch: develop
hasGpuTarget: true
rocprofiler:
pipelineId: 329
developBranch: develop
pipelineId: 143
developBranch: amd-staging
hasGpuTarget: true
rocprofiler-compute:
pipelineId: 257

View File

@@ -8,18 +8,15 @@ parameters:
type: object
default:
boost: 250
fmtlib: 341
grpc: 72
gtest: 73
half560: 68
lapack: 69
spdlog: 340
steps:
- ${{ each dependency in parameters.dependencyList }}:
- task: DownloadPipelineArtifact@2
displayName: Download ${{ dependency }}
retryCountOnTaskFailure: 3
inputs:
project: ROCm-CI
buildType: specific

View File

@@ -33,7 +33,6 @@ parameters:
steps:
- task: DownloadPipelineArtifact@2
displayName: Download ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
retryCountOnTaskFailure: 3
inputs:
${{ if eq(parameters.buildType, 'specific') }}:
buildType: specific

View File

@@ -7,7 +7,6 @@ steps:
- task: Bash@3
name: downloadCKBuild
displayName: Download specific CK build
retryCountOnTaskFailure: 3
env:
CXX: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
CC: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
@@ -70,29 +69,20 @@ steps:
RETRIES=0
MAX_RETRIES=5
SUCCESS=false
while [ $RETRIES -lt $MAX_RETRIES ]; do
wget -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip && \
unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory) && \
mkdir -p $(Agent.BuildDirectory)/rocm && \
tar -zxvf $(System.ArtifactsDirectory)/composable_kernel*/*.tar.gz -C $(Agent.BuildDirectory)/rocm && \
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/composable_kernel*
if [ $? -eq 0 ]; then
SUCCESS=true
echo "Successfully downloaded CK."
break
else
RETRIES=$((RETRIES + 1))
echo "Failed to download CK on attempt $RETRIES/$MAX_RETRIES, retrying..."
sleep 1
until wget -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip; do
RETRIES=$((RETRIES+1))
if [[ $RETRIES -ge $MAX_RETRIES ]]; then
echo "Failed to download CK artifact after $MAX_RETRIES attempts."
exit 1
fi
echo "Download failed, retrying ($RETRIES/$MAX_RETRIES)..."
sleep 5
done
if [ "$SUCCESS" = false ]; then
echo "ERROR: failed to download CK after $MAX_RETRIES attempts."
exit 1
fi
unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory)
mkdir -p $(Agent.BuildDirectory)/rocm
tar -zxvf $(System.ArtifactsDirectory)/composable_kernel*/*.tar.gz -C $(Agent.BuildDirectory)/rocm
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/composable_kernel*
if [[ $EXIT_CODE -ne 0 ]]; then
BUILD_COMMIT=$(curl -s $AZ_API/build/builds/$CK_BUILD_ID | jq '.sourceVersion' | tr -d '"')

View File

@@ -28,13 +28,13 @@ variables:
- name: GFX90A_TEST_POOL
value: gfx90a_test_pool
- name: LATEST_RELEASE_VERSION
value: 6.4.3
value: 6.4.2
- name: REPO_RADEON_VERSION
value: 6.4.3
value: 6.4.2
- name: NEXT_RELEASE_VERSION
value: 7.0.0
- name: LATEST_RELEASE_TAG
value: rocm-6.4.3
value: rocm-6.4.2
- name: DOCKER_SKIP_GFX
value: gfx90a
- name: COMPOSABLE_KERNEL_PIPELINE_ID

View File

@@ -918,6 +918,7 @@ toolchain
toolchains
toolset
toolsets
torchtitan
torchvision
tqdm
tracebacks

View File

@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-6.4.3"
<default revision="refs/tags/rocm-6.4.2"
remote="rocm-org"
sync-c="true"
sync-j="4" />

View File

@@ -29,7 +29,6 @@ additional licenses. Please review individual repositories for more information.
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
| [AQLprofile] | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE) |
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
@@ -47,6 +46,7 @@ additional licenses. Please review individual repositories for more information.
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
| hsa-amd-aqlprofile | [AMD Software EULA](https://www.amd.com/en/legal/eula/amd-software-eula.html) |
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
| [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
@@ -132,10 +132,12 @@ companies.
### Package licensing
:::{attention}
ROCprof Trace Decoder and AOCC CPU optimizations are provided in binary form, subject to the license agreement enclosed on [GitHub](https://github.com/ROCm/rocprof-trace-decoder/blob/amd-mainline/LICENSE) for ROCprof Trace Decoder, and [Developer Central](https://www.amd.com/en/developer/aocc.html) for AOCC. By using, installing,
copying or distributing ROCprof Trace Decoder or AOCC CPU Optimizations, you agree to
AQL Profiler and AOCC CPU optimization are both provided in binary form, each
subject to the license agreement enclosed in the directory for the binary available
in `/opt/rocm/share/doc/hsa-amd-aqlprofile/EULA`. By using, installing,
copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
the terms and conditions of this license agreement. If you do not agree to the
terms of this agreement, do not install, copy or use ROCprof Trace Decoder or the
terms of this agreement, do not install, copy or use the AQL Profiler and/or the
AOCC CPU Optimizations.
:::

View File

@@ -9,21 +9,17 @@ import shutil
import sys
from pathlib import Path
gh_release_path = os.path.join("..", "RELEASE.md")
gh_changelog_path = os.path.join("..", "CHANGELOG.md")
sphinx_release_path = os.path.join("about", "release-notes.md")
sphinx_changelog_path = os.path.join("release", "changelog.md")
shutil.copy2(gh_release_path, sphinx_release_path)
shutil.copy2(gh_changelog_path, sphinx_changelog_path)
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
shutil.copy2("../CHANGELOG.md", "./release/changelog.md")
# Mark the consolidated changelog as orphan to prevent Sphinx from warning about missing toctree entries
with open(sphinx_changelog_path, "r+", encoding="utf-8") as file:
with open("./release/changelog.md", "r+") as file:
content = file.read()
file.seek(0)
file.write(":orphan:\n" + content)
# Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
with open(sphinx_changelog_path, "r", encoding="utf-8") as file:
with open("./release/changelog.md", "r") as file:
lines = file.readlines()
modified_lines = []
@@ -61,14 +57,11 @@ with open(sphinx_changelog_path, "r", encoding="utf-8") as file:
file.close()
with open(sphinx_changelog_path, "w", encoding="utf-8") as file:
with open("./release/changelog.md", 'w') as file:
file.writelines(modified_lines)
matrix_path = os.path.join("compatibility", "compatibility-matrix-historical-6.0.csv")
rtd_path = os.path.join("..", "_readthedocs", "html", "downloads")
if not os.path.exists(rtd_path):
os.makedirs(rtd_path)
shutil.copy2(matrix_path, rtd_path)
os.system("mkdir -p ../_readthedocs/html/downloads")
os.system("cp compatibility/compatibility-matrix-historical-6.0.csv ../_readthedocs/html/downloads/compatibility-matrix-historical-6.0.csv")
latex_engine = "xelatex"
latex_elements = {
@@ -124,11 +117,15 @@ article_pages = [
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},

View File

@@ -28,31 +28,13 @@ See the [Python requirements file](https://github.com/ROCm/ROCm/blob/develop/doc
Use the Python Virtual Environment (`venv`) and run the following commands from the project root:
::::{tab-set}
:::{tab-item} Linux and WSL
:sync: linux
```sh
python3 -mvenv .venv
.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
```
:::
:::{tab-item} Windows
:sync: windows
```powershell
python -mvenv .venv
.venv\Scripts\python.exe -m pip install -r docs/sphinx/requirements.txt
.venv\Scripts\python.exe -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
```
:::
::::
Navigate to `_build/html/index.html` and open this file in a web browser.
## Visual Studio Code

View File

@@ -0,0 +1,120 @@
unified_docker:
latest:
pull_tag: rocm/pytorch-training:v25.6
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
rocm_version: 6.4.1
pytorch_version: 2.8.0a0+git7d205b2
python_version: 3.10.17
transformer_engine_version: 1.14.0+2f85f5f2
flash_attention_version: 3.0.0.post1
hipblaslt_version: 0.15.0-8c6919d
triton_version: 3.3.0
model_groups:
- group: Pre-training
tag: pre-training
models:
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [pretrain]
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [pretrain]
- group: Fine-tuning
tag: fine-tuning
models:
- model: Llama 4 Scout 17B-16E
mad_tag: pyt_train_llama-4-scout-17b-16e
model_repo: Llama-4-17B_16E
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.3 70B
mad_tag: pyt_train_llama-3.3-70b
model_repo: Llama-3.3-70B
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.2 1B
mad_tag: pyt_train_llama-3.2-1b
model_repo: Llama-3.2-1B
url: https://huggingface.co/meta-llama/Llama-3.2-1B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 3B
mad_tag: pyt_train_llama-3.2-3b
model_repo: Llama-3.2-3B
url: https://huggingface.co/meta-llama/Llama-3.2-3B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 Vision 11B
mad_tag: pyt_train_llama-3.2-vision-11b
model_repo: Llama-3.2-Vision-11B
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.2 Vision 90B
mad_tag: pyt_train_llama-3.2-vision-90b
model_repo: Llama-3.2-Vision-90B
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.1 405B
mad_tag: pyt_train_llama-3.1-405b
model_repo: Llama-3.1-405B
url: https://huggingface.co/meta-llama/Llama-3.1-405B
precision: BF16
training_modes: [finetune_qlora, HF_finetune_lora]
- model: Llama 3 8B
mad_tag: pyt_train_llama-3-8b
model_repo: Llama-3-8B
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3 70B
mad_tag: pyt_train_llama-3-70b
model_repo: Llama-3-70B
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 7B
mad_tag: pyt_train_llama-2-7b
model_repo: Llama-2-7B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 2 13B
mad_tag: pyt_train_llama-2-13b
model_repo: Llama-2-13B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 70B
mad_tag: pyt_train_llama-2-70b
model_repo: Llama-2-70B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]

View File

@@ -1,38 +1,17 @@
unified_docker:
latest:
pull_tag: rocm/pytorch-training:v25.6
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
rocm_version: 6.4.1
pytorch_version: 2.8.0a0+git7d205b2
python_version: 3.10.17
transformer_engine_version: 1.14.0+2f85f5f2
flash_attention_version: 3.0.0.post1
hipblaslt_version: 0.15.0-8c6919d
triton_version: 3.3.0
dockers:
- pull_tag: rocm/pytorch-training:v25.7
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
components:
ROCm: 6.4.2
PyTorch: 2.8.0a0+gitd06a406
Python: 3.10.18
Transformer Engine: 2.2.0.dev0+94e53dd8
Flash Attention: 3.0.0.post1
hipBLASLt: 1.1.0-4b9a52edfc
Triton: 3.3.0
model_groups:
- group: Pre-training
tag: pre-training
models:
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [pretrain]
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [pretrain]
- group: Fine-tuning
tag: fine-tuning
- group: Meta Llama
tag: llama
models:
- model: Llama 4 Scout 17B-16E
mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
training_modes: [pretrain, finetune_fw, finetune_lora]
- model: Llama 3.1 405B
mad_tag: pyt_train_llama-3.1-405b
model_repo: Llama-3.1-405B
url: https://huggingface.co/meta-llama/Llama-3.1-405B
precision: BF16
training_modes: [finetune_qlora, HF_finetune_lora]
training_modes: [finetune_qlora]
- model: Llama 3 8B
mad_tag: pyt_train_llama-3-8b
model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
model_repo: Llama-2-70B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
training_modes: [finetune_lora, finetune_qlora]
- group: OpenAI
tag: openai
models:
- model: GPT OSS 20B
mad_tag: pyt_train_gpt_oss_20b
model_repo: GPT-OSS-20B
url: https://huggingface.co/openai/gpt-oss-20b
precision: BF16
training_modes: [HF_finetune_lora]
- model: GPT OSS 120B
mad_tag: pyt_train_gpt_oss_120b
model_repo: GPT-OSS-120B
url: https://huggingface.co/openai/gpt-oss-120b
precision: BF16
training_modes: [HF_finetune_lora]
- group: Qwen
tag: qwen
models:
- model: Qwen 3 8B
mad_tag: pyt_train_qwen3-8b
model_repo: Qwen3-8B
url: https://huggingface.co/Qwen/Qwen3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Qwen 3 32B
mad_tag: pyt_train_qwen3-32b
model_repo: Qwen3-32B
url: https://huggingface.co/Qwen/Qwen3-32B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2.5 32B
mad_tag: pyt_train_qwen2.5-32b
model_repo: Qwen2.5-32B
url: https://huggingface.co/Qwen/Qwen2.5-32B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2.5 72B
mad_tag: pyt_train_qwen2.5-72b
model_repo: Qwen2.5-72B
url: https://huggingface.co/Qwen/Qwen2.5-72B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2 1.5B
mad_tag: pyt_train_qwen2-1.5b
model_repo: Qwen2-1.5B
url: https://huggingface.co/Qwen/Qwen2-1.5B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Qwen 2 7B
mad_tag: pyt_train_qwen2-7b
model_repo: Qwen2-7B
url: https://huggingface.co/Qwen/Qwen2-7B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- group: Flux
tag: flux
models:
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [pretrain]

View File

@@ -23,93 +23,92 @@ The table below summarizes information about ROCm-enabled deep learning framewor
- Installation options
- GitHub
* - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`_
* - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`_
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`_
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`_
- `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`_
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
- `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
- .. raw:: html
<a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
* - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`_
* - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`_
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`_
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
- .. raw:: html
<a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
* - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`_
* - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`_
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
- .. raw:: html
<a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
* - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`_
* - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`_
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
- .. raw:: html
<a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
* - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`_
* - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`_
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
- .. raw:: html
<a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
* - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`_
* - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`_
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
- .. raw:: html
<a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
* - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`_
* - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`_
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
- .. raw:: html
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
* - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`_
* - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
- .. raw:: html
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
-
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`_
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`_
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
- .. raw:: html
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
through the following guides.
@@ -124,10 +123,3 @@ through the following guides.
* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`

View File

@@ -939,7 +939,7 @@ hipBLASLt benchmarking
The GEMM library
`hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
provides a benchmark tool for its supported operations. Refer to the
`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
for details.
* Example 1: Benchmark mix fp8 GEMM
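A sketch of what such a run can look like is shown below. The flag names here are assumptions modeled on common ``*-bench`` client conventions and are not verified against this release; treat the README linked above as authoritative.
.. code-block:: shell
# Hypothetical mixed-precision GEMM benchmark: FP8 inputs, FP16 output,
# FP32 accumulation, on a 1024 x 1024 x 1024 problem.
hipblaslt-bench -m 1024 -n 1024 -k 1024 \
--a_type f8_r --b_type f8_r \
--c_type f16_r --d_type f16_r \
--compute_type f32_r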

View File

@@ -46,7 +46,7 @@ vLLM inference performance testing
- {{ unified_docker.hipblaslt_version }}
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
inference performance numbers <vllm-benchmark-performance-measurements-715>` for
MI300X series accelerators.
What's new
@@ -219,7 +219,7 @@ system's configuration.
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.

View File

@@ -39,7 +39,7 @@ vLLM inference performance testing
- {{ unified_docker.hipblaslt_version }}
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
MI300X series accelerators.
What's new
@@ -208,7 +208,7 @@ system's configuration.
and ``{{ model.mad_tag }}_serving.csv``.
Although the :ref:`available models
<vllm-benchmark-available-models>` are preconfigured to collect
<vllm-benchmark-available-models-812>` are preconfigured to collect
offline throughput and online serving performance data, you can
also change the benchmarking parameters. See the standalone
benchmarking tab for more information.

View File

@@ -22,9 +22,9 @@ If you're new to ROCm, refer to the :doc:`ROCm quick start install guide for L
<rocm-install-on-linux:install/quick-start>`.
If you're using a Radeon GPU for graphics-accelerated applications, refer to the
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.
You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
distribution's package manager. See the following documentation resources to get started:
* :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`

View File

@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
- Pre-training
.. _amd-megatron-lm-model-support:
.. _amd-megatron-lm-model-support-24-12:
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

View File

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
- Pre-training
.. _amd-megatron-lm-model-support:
.. _amd-megatron-lm-model-support-25-3:
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
.. tab-item:: Llama
:sync: llama
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
.. tab-item:: DeepSeek V2
:sync: deepseek
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.
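In each tab above, the tokenizer is ultimately selected by setting the ``TOKENIZER_MODEL`` variable. A minimal sketch, assuming a Hugging Face model ID (the value shown is illustrative):
.. code-block:: shell
# Illustrative value; substitute the Hugging Face ID of the model you are training.
export TOKENIZER_MODEL=meta-llama/Llama-3.1-8B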
Multi-node training
^^^^^^^^^^^^^^^^^^^

View File

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
- Pre-training
.. _amd-megatron-lm-model-support:
.. _amd-megatron-lm-model-support-25-4:
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
.. tab-item:: Llama
:sync: llama
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
or the default ``HuggingFaceTokenizer``.
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
.. tab-item:: DeepSeek V2
:sync: deepseek
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.
Multi-node training
^^^^^^^^^^^^^^^^^^^

View File

@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
- Components
- Resources
* - v25.7
-
* ROCm 6.4.2
* PyTorch 2.8.0a0+gitd06a406
-
* :doc:`Documentation <../pytorch-training>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
* - v25.6
-
* ROCm 6.3.4
* PyTorch 2.8.0a0+git7d205b2
-
* :doc:`Documentation <../pytorch-training>`
* :doc:`Documentation <pytorch-training-v25.6>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__
* - v25.5

View File

@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
Previous versions
=================
See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.

View File

@@ -0,0 +1,456 @@
:orphan:
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
Training a model with PyTorch for ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of the PyTorch for ROCm
training benchmark documentation. See :doc:`../pytorch-training` for the latest version.
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| PyTorch | 2.8.0a0+git7d205b2 |
+--------------------------+--------------------------------+
| Python | 3.10.17 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.14.0+2f85f5f2 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0.post1 |
+--------------------------+--------------------------------+
| hipBLASLt | 0.15.0-8c6919d |
+--------------------------+--------------------------------+
| Triton | 3.3.0 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support-v256:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
{% set unified_docker = data.unified_docker.latest %}
{% set model_groups = data.model_groups %}
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Workload</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. note::
Some models require an external license agreement through a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements-v256:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
Benchmarking
============
Once the setup is complete, choose between two options to start benchmarking:
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one GPU with the {{ model.precision }} data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
{% endfor %}
{% endfor %}
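As a quick sanity check after a run, you can skim the collected CSV from the shell. This is a convenience sketch, not part of the MAD tooling itself:
.. code-block:: shell
# Pretty-print the first few result rows (column ships with util-linux).
column -s, -t < ~/MAD/perf.csv | head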
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required packages
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Run the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
.. container:: model-doc pyt_train_llama-3.1-8b
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
.. container:: model-doc pyt_train_llama-3.1-70b
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
.. container:: model-doc pyt_train_flux
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
{% for model_group in model_groups %}
{% for model in model_group.models %}
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Pretraining
To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
{% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
* - ``$datatype``
- ``BF16`` or ``FP8``
- Only Llama 3.1 8B supports FP8 precision.
{% else %}
* - ``$datatype``
- ``BF16``
- This model supports BF16 only; FP8 is limited to Llama 3.1 8B.
{% endif %}
* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
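Putting these options together, a concrete pretraining run at the default sequence length might look like the following. This is an illustrative sketch assembled from the options above, not an additional documented recipe:
.. code-block:: shell
# BF16 pretraining at the default 8192-token sequence length.
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p BF16 -s 8192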
{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}
.. note::
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
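One way to perform the manual download is with the Hugging Face CLI. This is a sketch that assumes ``huggingface_hub`` is available in the container; it is not part of the benchmark scripts:
.. code-block:: shell
# Requires prior authentication (for example, huggingface-cli login) for gated repos.
pip install -U "huggingface_hub[cli]"
huggingface-cli download black-forest-labs/FLUX.1-dev --local-dir /workspace/FluxBenchmark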
{% endif %}
{% endif %}
{% if model_group.tag == "fine-tuning" %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Fine-tuning
To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$training_mode``
- ``finetune_fw``
- Full weight fine-tuning (BF16 supported)
* -
- ``finetune_lora``
- LoRA fine-tuning (BF16 supported)
* -
- ``finetune_qlora``
- QLoRA fine-tuning (BF16 supported)
* -
- ``HF_finetune_lora``
- LoRA fine-tuning with Hugging Face PEFT
* - ``$datatype``
- ``BF16``
- All models support BF16.
* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
.. note::
{{ model.model }} currently supports the following fine-tuning methods:
{% for method in model.training_modes %}
* ``{{ method }}``
{% endfor %}
{% if model.training_modes|length < 4 %}
The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
does not currently provide YAML configuration files for other combinations of
model and fine-tuning method.
However, you can still configure your own YAML files to enable support for
fine-tuning methods not listed here by following existing patterns in the
``/workspace/torchtune/recipes/configs`` directory.
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
.. rubric:: Benchmarking examples
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
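For orientation, one representative invocation assembled from the option tables above (illustrative only):
.. code-block:: shell
# LoRA fine-tuning of Llama 3.1 8B in BF16 with a 4096-token sequence length.
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-8B -s 4096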
Further reading
===============
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.

View File

@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| PyTorch | 2.8.0a0+git7d205b2 |
+--------------------------+--------------------------------+
| Python | 3.10.17 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.14.0+2f85f5f2 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0.post1 |
+--------------------------+--------------------------------+
| hipBLASLt | 0.15.0-8c6919d |
+--------------------------+--------------------------------+
| Triton | 3.3.0 |
+--------------------------+--------------------------------+
{% set dockers = data.dockers %}
{% set docker = dockers[0] %}
The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
(``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:
.. list-table::
:header-rows: 1
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
.. _amd-pytorch-training-model-support:
@@ -38,26 +35,27 @@ Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
Some instructions, commands, and training recommendations in this documentation might
vary by model -- select one to get started.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
{% set unified_docker = data.unified_docker.latest %}
{% set unified_docker = data.dockers[0] %}
{% set model_groups = data.model_groups %}
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Workload</div>
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
@@ -73,84 +71,116 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
</div>
</div>
.. note::
Some models require an external license agreement through a third party (for example, Meta).
.. _amd-pytorch-training-supported-training-modes:
The following table lists supported training modes per model.
.. dropdown:: Supported training modes
.. list-table::
:header-rows: 1
* - Model
- Supported training modes
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
* - {{ model.model }}
- ``{{ model.training_modes | join('``, ``') }}``
{% endfor %}
{% endfor %}
.. note::
Some model and fine-tuning combinations are not listed. This is
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
doesn't provide default YAML configurations for them.
For advanced usage, you can create a custom configuration to enable
unlisted fine-tuning methods by using an existing file in the
``/workspace/torchtune/recipes/configs`` directory as a template.
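A minimal sketch of that workflow, assuming the torchtune CLI and the recipe layout inside the container (the config path and recipe name are illustrative):
.. code-block:: shell
# Copy an existing recipe config as a starting point, edit it, then run it.
cp /workspace/torchtune/recipes/configs/llama3_1/8B_lora_single_device.yaml my_custom_lora.yaml
tune run lora_finetune_single_device --config my_custom_lora.yaml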
.. _amd-pytorch-training-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Run training
============
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
{% set unified_docker = data.dockers[0] %}
{% set model_groups = data.model_groups %}
Once the setup is complete, choose between two options to start benchmarking training:
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one node with the {{ model.precision }} data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
model are collected in ``~/MAD/perf.csv``.
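To take a quick look at the collected measurements, any CSV-aware tool works; for example (an optional step, not part of the MAD workflow):
.. code-block:: shell
column -s, -t ~/MAD/perf.csv | less -S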
{% endfor %}
{% endfor %}
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required packages
1. Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
2. Run the Docker container.
.. code-block:: shell
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ unified_docker.pull_tag }}
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
1. The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
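Optionally, confirm the token is valid before starting large downloads (assuming the ``huggingface-cli`` tool from ``huggingface_hub`` is available in the container):
.. code-block:: shell
huggingface-cli whoami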
2. Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
.. container:: model-doc pyt_train_llama-3.1-8b
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
.. container:: model-doc pyt_train_llama-3.1-70b
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
.. container:: model-doc pyt_train_flux
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
{% for model_group in model_groups %}
{% for model in model_group.models %}
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
{% set training_modes = model.training_modes %}
{% set training_mode_descs = {
"pretrain": "Benchmark pre-training.",
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
} %}
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
{% if available_modes %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Pre-training
To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}
* - ``$datatype``
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
- Only Llama 3.1 8B supports FP8 precision.
* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
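For example, a BF16 pre-training run at the default sequence length might look like the following (a sketch using the placeholders above):
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p BF16 -s 8192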
{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}
.. note::
Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
To use FLUX, refer to the previous version of the ``pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`.
Occasionally, downloading the FLUX dataset might fail. If this happens,
manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark`` so that the test script can access
the required dataset.
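A minimal sketch of that manual download, assuming the ``huggingface_hub`` CLI is available and your ``HF_TOKEN`` is set:
.. code-block:: shell
huggingface-cli download black-forest-labs/FLUX.1-dev --local-dir /workspace/FluxBenchmark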
{% endif %}
{% endif %}
{% if model_group.tag == "fine-tuning" %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Fine-tuning
To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
.. list-table::
:header-rows: 1
@@ -383,53 +404,143 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
- Options
- Description
* - ``$training_mode``
- ``finetune_fw``
- Full weight fine-tuning (BF16 supported)
* -
- ``finetune_lora``
- LoRA fine-tuning (BF16 supported)
* -
- ``finetune_qlora``
- QLoRA fine-tuning (BF16 supported)
* -
- ``HF_finetune_lora``
- LoRA fine-tuning with Hugging Face PEFT
{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}
* - ``$datatype``
- ``BF16``
- All models support BF16.
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
- Only Llama 3.1 8B supports FP8 precision.
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
{% endif %}
{% set training_mode_descs = {
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
} %}
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
{% if available_modes %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Fine-tuning
To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions,
and the :ref:`supported training modes <amd-pytorch-training-supported-training-modes>` per model.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}
* - ``$datatype``
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
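For instance, a LoRA fine-tuning run at an 8192-token sequence length might look like the following (a sketch; confirm ``finetune_lora`` is listed for this model in the supported training modes table):
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -m {{ model.model_repo }} -p BF16 -s 8192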
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
.. note::
{{ model.model }} currently supports the following fine-tuning methods:
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
use the following torchtune commit for compatibility:
{% for method in model.training_modes %}
* ``{{ method }}``
{% endfor %}
{% if model.training_modes|length < 4 %}
.. code-block:: shell
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
.. note::
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
input tensor should be smaller than max_seq_len (4096)``.
It indicates that an input sequence is longer than the model's maximum context window.
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
tokens in this case). You can resolve this by truncating the input or splitting
it into smaller chunks before passing it to the model.
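With the benchmark script, the simplest fix is to cap the sequence length at the model's context window (a sketch using the placeholders documented above):
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s 4096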
{% endif %}
.. note::
The results in this guide are based on commit ``b4c98ac`` of the upstream
`torchtune <https://github.com/pytorch/torchtune>`__ repository. For the
latest updates, you can use the main branch.
{% endif %}
{% endfor %}
{% endfor %}
.. rubric:: Benchmarking examples
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
Multi-node training
-------------------
Pre-training
~~~~~~~~~~~~
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
.. code-block:: shell
# In the MAD repository
cd scripts/pytorch_train
sbatch run_slurm_train.sh
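If your cluster uses a different node count, you can override it at submission time instead of editing the script (a sketch relying on standard ``sbatch`` behavior; the script's internal settings may need matching edits):
.. code-block:: shell
sbatch --nodes=$num_nodes run_slurm_train.sh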
Fine-tuning
~~~~~~~~~~~
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
.. code-block:: shell
huggingface-cli login # Get access to HF Llama model space
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
# In the MAD repository
cd scripts/pytorch_train
sbatch Torchtune_Multinode.sh
.. note::
Information regarding benchmark setup:
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
* You can adjust the torchtune `YAML configuration file
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
if you're using a different model.
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
* Set the ``mounting_paths`` inside the SLURM script.
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
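To follow a run while it is still in progress, you can list the directory and tail the newest file (a sketch; exact file names depend on your SLURM configuration):
.. code-block:: shell
ls -lt result_torchtune/
tail -f result_torchtune/$your_log_file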
Further reading
===============
View File
@@ -1,4 +1,4 @@
rocm-docs-core==1.20.1
rocm-docs-core==1.22.0
sphinx-reredirects
sphinx-sitemap
sphinxcontrib.datatemplates==0.11.0
View File
@@ -23,7 +23,7 @@ beautifulsoup4==4.13.4
# via pydata-sphinx-theme
breathe==4.36.0
# via rocm-docs-core
certifi==2025.4.26
certifi==2025.7.14
# via requests
cffi==1.17.1
# via
@@ -35,18 +35,16 @@ click==8.2.1
# via
# jupyter-cache
# sphinx-external-toc
comm==0.2.2
comm==0.2.3
# via ipykernel
cryptography==45.0.3
cryptography==45.0.5
# via pyjwt
debugpy==1.8.14
debugpy==1.8.15
# via ipykernel
decorator==5.2.1
# via ipython
defusedxml==0.7.1
# via sphinxcontrib-datatemplates
deprecated==1.2.18
# via pygithub
docutils==0.21.2
# via
# myst-parser
@@ -62,7 +60,7 @@ fastjsonschema==2.21.1
# rocm-docs-core
gitdb==4.0.12
# via gitpython
gitpython==3.1.44
gitpython==3.1.45
# via rocm-docs-core
greenlet==3.2.3
# via sqlalchemy
@@ -74,7 +72,7 @@ importlib-metadata==8.7.0
# via
# jupyter-cache
# myst-nb
ipykernel==6.29.5
ipykernel==6.30.0
# via myst-nb
ipython==8.37.0
# via
@@ -86,7 +84,7 @@ jinja2==3.1.6
# via
# myst-parser
# sphinx
jsonschema==4.24.0
jsonschema==4.25.0
# via nbformat
jsonschema-specifications==2025.4.1
# via jsonschema
@@ -116,7 +114,7 @@ mdit-py-plugins==0.4.2
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-nb==1.2.0
myst-nb==1.3.0
# via rocm-docs-core
myst-parser==4.0.1
# via myst-nb
@@ -134,7 +132,6 @@ nest-asyncio==1.6.0
packaging==25.0
# via
# ipykernel
# pydata-sphinx-theme
# sphinx
parso==0.8.4
# via jedi
@@ -152,13 +149,13 @@ pure-eval==0.2.3
# via stack-data
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.15.4
pydata-sphinx-theme==0.16.1
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.6.1
pygithub==2.7.0
# via rocm-docs-core
pygments==2.19.1
pygments==2.19.2
# via
# accessible-pygments
# ipython
@@ -178,7 +175,7 @@ pyyaml==6.0.2
# rocm-docs-core
# sphinx-external-toc
# sphinxcontrib-datatemplates
pyzmq==26.4.0
pyzmq==27.0.0
# via
# ipykernel
# jupyter-client
@@ -190,9 +187,9 @@ requests==2.32.4
# via
# pygithub
# sphinx
rocm-docs-core==1.20.1
rocm-docs-core==1.22.0
# via -r requirements.in
rpds-py==0.25.1
rpds-py==0.26.0
# via
# jsonschema
# referencing
@@ -220,7 +217,7 @@ sphinx==8.1.3
# sphinx-reredirects
# sphinxcontrib-datatemplates
# sphinxcontrib-runcmd
sphinx-book-theme==1.1.4
sphinx-book-theme==1.1.3
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
@@ -234,7 +231,7 @@ sphinx-notfound-page==1.1.0
# via rocm-docs-core
sphinx-reredirects==0.1.6
# via -r requirements.in
sphinx-sitemap==2.8.0
sphinx-sitemap==2.7.2
# via -r requirements.in
sphinxcontrib-applehelp==2.0.0
# via sphinx
@@ -252,7 +249,7 @@ sphinxcontrib-runcmd==0.2.0
# via sphinxcontrib-datatemplates
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
sqlalchemy==2.0.41
sqlalchemy==2.0.42
# via jupyter-cache
stack-data==0.6.3
# via ipython
@@ -266,7 +263,6 @@ tornado==6.5.1
# jupyter-client
traitlets==5.14.3
# via
# comm
# ipykernel
# ipython
# jupyter-client
@@ -274,7 +270,7 @@ traitlets==5.14.3
# matplotlib-inline
# nbclient
# nbformat
typing-extensions==4.14.0
typing-extensions==4.14.1
# via
# beautifulsoup4
# exceptiongroup
@@ -290,7 +286,5 @@ urllib3==2.5.0
# requests
wcwidth==0.2.13
# via prompt-toolkit
wrapt==1.17.2
# via deprecated
zipp==3.23.0
# via importlib-metadata
View File
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-6.4.3"
<default revision="refs/tags/rocm-6.4.2"
remote="rocm-org"
sync-c="true"
sync-j="4" />
View File
@@ -1,79 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-6.4.3"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCm-->
<project name="ROCm" revision="roc-6.4.x" />
<project name="ROCK-Kernel-Driver" />
<project name="ROCR-Runtime" />
<project name="amdsmi" />
<project name="rdc" />
<project name="rocm_bandwidth_test" />
<project name="rocm_smi_lib" />
<project name="rocm-core" />
<project name="rocm-examples" />
<project name="rocminfo" />
<project name="rocprofiler" />
<project name="rocprofiler-register" />
<project name="rocprofiler-sdk" />
<project name="rocprofiler-compute" />
<project name="rocprofiler-systems" />
<project name="roctracer" />
<!--HIP Projects-->
<project name="hip" />
<project name="hip-tests" />
<project name="HIPIFY" />
<project name="clr" />
<project name="hipother" />
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
<project name="half" />
<project name="llvm-project" />
<project name="spirv-llvm-translator" />
<!-- gdb projects -->
<project name="ROCdbgapi" />
<project name="ROCgdb" />
<project name="rocr_debug_agent" />
<!-- ROCm Libraries -->
<project groups="mathlibs" name="AMDMIGraphX" />
<project groups="mathlibs" name="MIOpen" />
<project groups="mathlibs" name="MIVisionX" />
<project groups="mathlibs" name="ROCmValidationSuite" />
<project groups="mathlibs" name="Tensile" />
<project groups="mathlibs" name="composable_kernel" />
<project groups="mathlibs" name="hipBLAS-common" />
<project groups="mathlibs" name="hipBLAS" />
<project groups="mathlibs" name="hipBLASLt" />
<project groups="mathlibs" name="hipCUB" />
<project groups="mathlibs" name="hipFFT" />
<project groups="mathlibs" name="hipRAND" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipSPARSE" />
<project groups="mathlibs" name="hipSPARSELt" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipfort" />
<project groups="mathlibs" name="rccl" />
<project groups="mathlibs" name="rocAL" />
<project groups="mathlibs" name="rocALUTION" />
<project groups="mathlibs" name="rocBLAS" />
<project groups="mathlibs" name="rocDecode" />
<project groups="mathlibs" name="rocJPEG" />
<project groups="mathlibs" name="rocPyDecode" />
<project groups="mathlibs" name="rocFFT" />
<project groups="mathlibs" name="rocPRIM" />
<project groups="mathlibs" name="rocRAND" />
<project groups="mathlibs" name="rocSHMEM" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="rocSPARSE" />
<project groups="mathlibs" name="rocThrust" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rocm-cmake" />
<project groups="mathlibs" name="rpp" />
<project groups="mathlibs" name="TransferBench" />
<!-- Projects for OpenMP-Extras -->
<project name="aomp" path="openmp-extras/aomp" />
<project name="aomp-extras" path="openmp-extras/aomp-extras" />
<project name="flang" path="openmp-extras/flang" />
</manifest>