Compare commits


7 Commits

Author SHA1 Message Date
Mirza Halilcevic
9b102061f4 Add pybind11 as a pip module requirement for azure. 2025-06-24 08:06:52 -05:00
Daniel Su
f20e8dec8b [Ex CI] revert PRIM default branch to develop (#4960) 2025-06-23 16:35:02 -04:00
Daniel Su
10e9157f39 [Ex CI] allow rerun jobs to upload artifacts (#4959) 2025-06-23 15:37:52 -04:00
Daniel Su
a2ce6021cb [Ex CI] add more OSs to nightly build (#4958) 2025-06-23 15:13:11 -04:00
Peter Park
2196fc9a2f Fix pytorch training 25.6 doc (#4956)
* fix pytorch-training history

* fix pytorch-training

fix
2025-06-23 13:45:50 -04:00
Daniel Su
925689f89e [Ex CI] enable gfx1100 builds (#4954) 2025-06-23 11:26:35 -04:00
Peter Park
91a541f8b9 Update PyTorch training benchmark doc for v25.6 (#4950)
* update pytorch-training docker details

* add previous version

* add models data

* update models data id

* add models picker

* update data

* update fmt

fmt

* update data yaml

* update template

* update data

* fix

* fix vllm-0.6.4 broken link

* fix vllm history
2025-06-23 09:26:15 -04:00
78 changed files with 884 additions and 565 deletions

View File

@@ -61,12 +61,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
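
This hunk (repeated in the other pipeline files below) uncomments the gfx1100 entries, re-enabling those build jobs on both Ubuntu 22.04 and AlmaLinux 8, per commit 925689f89e. A hedged way to confirm which gfx targets a given agent's hardware actually reports, assuming rocminfo from a ROCm install is on PATH:

    rocminfo | grep -o 'gfx[0-9a-f]\+' | sort -u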

View File

@@ -27,6 +27,7 @@ parameters:
- numpy
- tomli
- scipy
- pybind11
- name: rocmDependencies
type: object
default:
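
This hunk adds pybind11 to the pip modules installed on the build agents (commit 9b102061f4). A minimal sanity check that the module is usable, relying on pybind11's own helper that prints its include flags:

    python3 -m pip install pybind11
    python3 -m pybind11 --includes   # prints -I flags for the Python and pybind11 headers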

View File

@@ -60,12 +60,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 1, shardCount: 3 }
@@ -170,7 +170,7 @@ jobs:
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_${{ job.shard }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_shard_${{ job.shard }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
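
The rename inserts a literal shard segment into the test job identifier so multi-shard jobs stay distinguishable. Illustrative names, with all parameter values hypothetical:

    # before: rocPRIM_test_ubuntu2204_gfx942_1
    # after:  rocPRIM_test_ubuntu2204_gfx942_shard_1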

View File

@@ -64,12 +64,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }

View File

@@ -3,12 +3,21 @@ parameters:
- name: jobList
type: object
default:
- gfx942-staging:
target: gfx942
source: staging
- gfx90a-staging:
target: gfx90a
source: staging
- { os: ubuntu2204, target: gfx942, source: staging }
- { os: ubuntu2204, target: gfx90a, source: staging }
- { os: ubuntu2204, target: gfx1201, source: staging }
- { os: ubuntu2204, target: gfx1100, source: staging }
- { os: ubuntu2204, target: gfx1030, source: staging }
- { os: ubuntu2404, target: gfx942, source: staging }
- { os: ubuntu2404, target: gfx90a, source: staging }
- { os: ubuntu2404, target: gfx1201, source: staging }
- { os: ubuntu2404, target: gfx1100, source: staging }
- { os: ubuntu2404, target: gfx1030, source: staging }
- { os: almalinux8, target: gfx942, source: staging }
- { os: almalinux8, target: gfx90a, source: staging }
- { os: almalinux8, target: gfx1201, source: staging }
- { os: almalinux8, target: gfx1100, source: staging }
- { os: almalinux8, target: gfx1030, source: staging }
- name: rocmDependencies
type: object
default:
@@ -16,9 +25,9 @@ parameters:
- amdsmi
- aomp-extras
- aomp
- clr
- composable_kernel
- half
- HIP
- hip-tests
- hipBLAS
- hipBLAS-common
@@ -83,7 +92,7 @@ schedules:
jobs:
- ${{ each job in parameters.jobList }}:
- job: rocm_nightly_${{ job.target }}_${{ job.source }}
- job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -108,9 +117,9 @@ jobs:
parameters:
dependencySource: ${{ job.source }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
skipLibraryLinking: true
skipLlvmSymlink: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh $(Agent.BuildDirectory)/rocm

View File

@@ -26,7 +26,7 @@ steps:
includeRootFolder: false
archiveType: 'tar'
tarCompression: 'gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
@@ -38,7 +38,7 @@ steps:
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz" >> pipelineArtifacts.txt
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz" >> pipelineArtifacts.txt
# then publish it
- ${{ if parameters.publish }}:
- task: PublishPipelineArtifact@1
@@ -46,4 +46,5 @@ steps:
displayName: '${{ parameters.artifactName }} Publish'
retryCountOnTaskFailure: 3
inputs:
artifactName: ${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt)
targetPath: '$(Build.ArtifactStagingDirectory)'
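
Appending $(System.JobAttempt) to both the archive file and the published artifact name is what allows rerun jobs to upload artifacts (commit 10e9157f39): a pipeline run cannot publish two artifacts under the same name, so each attempt now produces a unique one. A sketch of the resulting names, with every field value hypothetical:

    # attempt 1:         rocPRIM_12345_20250624.1_ubuntu2204_gfx942_drop_1.tar.gz
    # attempt 2 (rerun): rocPRIM_12345_20250624.1_ubuntu2204_gfx942_drop_2.tar.gz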

View File

@@ -109,7 +109,7 @@ parameters:
hasGpuTarget: false
hipCUB:
pipelineId: $(HIPCUB_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
hipFFT:
@@ -129,7 +129,7 @@ parameters:
hasGpuTarget: false
hipRAND:
pipelineId: $(HIPRAND_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
hipSOLVER:
@@ -264,7 +264,7 @@ parameters:
hasGpuTarget: false
rocPRIM:
pipelineId: $(ROCPRIM_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
rocprofiler:
@@ -304,7 +304,7 @@ parameters:
hasGpuTarget: false
rocRAND:
pipelineId: $(ROCRAND_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
rocr_debug_agent:
@@ -329,7 +329,7 @@ parameters:
hasGpuTarget: false
rocThrust:
pipelineId: $(ROCTHRUST_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
roctracer:
@@ -438,14 +438,14 @@ steps:
targetType: inline
script: |
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- task: Bash@3
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
inputs:
targetType: inline
script: |
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
done
# dlopen calls within a ctest or pytest sequence run into issues when the shared library symlink convention is not followed
# the convention is as follows:
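
The switch from ln -s to ln -sr makes the links relative to their own location instead of embedding the absolute agent path, so they keep resolving if the rocm tree is mounted or copied elsewhere. A small demonstration with throwaway paths:

    mkdir -p /tmp/agent/rocm/llvm /tmp/agent/rocm/lib
    ln -s  /tmp/agent/rocm/llvm /tmp/agent/rocm/lib/llvm-abs
    ln -sr /tmp/agent/rocm/llvm /tmp/agent/rocm/lib/llvm-rel
    readlink /tmp/agent/rocm/lib/llvm-abs   # /tmp/agent/rocm/llvm (breaks if the tree moves)
    readlink /tmp/agent/rocm/lib/llvm-rel   # ../llvm (survives a move)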

.gitmodules (vendored)
View File

@@ -1,195 +0,0 @@
[submodule "AMDMIGraphX"]
path = AMDMIGraphX
url = https://github.com/ROCm/AMDMIGraphX.git
[submodule "MIOpen"]
path = MIOpen
url = https://github.com/ROCm/MIOpen.git
[submodule "MIVisionX"]
path = MIVisionX
url = https://github.com/ROCm/MIVisionX.git
[submodule "ROCR-Runtime"]
path = ROCR-Runtime
url = https://github.com/ROCm/ROCR-Runtime.git
[submodule "ROCdbgapi"]
path = ROCdbgapi
url = https://github.com/ROCm/ROCdbgapi.git
[submodule "ROCgdb"]
path = ROCgdb
url = https://github.com/ROCm/ROCgdb.git
[submodule "ROCmValidationSuite"]
path = ROCmValidationSuite
url = https://github.com/ROCm/ROCmValidationSuite.git
[submodule "Tensile"]
path = Tensile
url = https://github.com/ROCm/Tensile.git
[submodule "TransferBench"]
path = TransferBench
url = https://github.com/ROCm/TransferBench.git
[submodule "amdsmi"]
path = amdsmi
url = https://github.com/ROCm/amdsmi.git
[submodule "openmp-extras/aomp"]
path = openmp-extras/aomp
url = https://github.com/ROCm/aomp.git
[submodule "openmp-extras/aomp-extras"]
path = openmp-extras/aomp-extras
url = https://github.com/ROCm/aomp-extras.git
[submodule "ROCm"]
path = ROCm
url = https://github.com/ROCm/ROCm.git
[submodule "clr"]
path = clr
url = https://github.com/ROCm/clr.git
[submodule "composable_kernel"]
path = composable_kernel
url = https://github.com/ROCm/composable_kernel.git
[submodule "rocm_bandwidth_test"]
path = rocm_bandwidth_test
url = https://github.com/ROCm/rocm_bandwidth_test.git
[submodule "openmp-extras/flang"]
path = openmp-extras/flang
url = https://github.com/ROCm/flang.git
[submodule "half"]
path = half
url = https://github.com/ROCm/half.git
[submodule "hip"]
path = hip
url = https://github.com/ROCm/hip.git
[submodule "hip-tests"]
path = hip-tests
url = https://github.com/ROCm/hip-tests.git
[submodule "hipBLAS"]
path = hipBLAS
url = https://github.com/ROCm/hipBLAS.git
[submodule "hipBLAS-common"]
path = hipBLAS-common
url = https://github.com/ROCm/hipBLAS-common.git
[submodule "hipBLASLt"]
path = hipBLASLt
url = https://github.com/ROCm/hipBLASLt.git
[submodule "hipCUB"]
path = hipCUB
url = https://github.com/ROCm/hipCUB.git
[submodule "hipFFT"]
path = hipFFT
url = https://github.com/ROCm/hipFFT.git
[submodule "hipRAND"]
path = hipRAND
url = https://github.com/ROCm/hipRAND.git
[submodule "hipSOLVER"]
path = hipSOLVER
url = https://github.com/ROCm/hipSOLVER.git
[submodule "hipSPARSE"]
path = hipSPARSE
url = https://github.com/ROCm/hipSPARSE.git
[submodule "hipSPARSELt"]
path = hipSPARSELt
url = https://github.com/ROCm/hipSPARSELt.git
[submodule "hipTensor"]
path = hipTensor
url = https://github.com/ROCm/hipTensor.git
[submodule "hipfort"]
path = hipfort
url = https://github.com/ROCm/hipfort.git
[submodule "HIPIFY"]
path = HIPIFY
url = https://github.com/ROCm/hipify.git
[submodule "hipother"]
path = hipother
url = https://github.com/ROCm/hipother.git
[submodule "llvm-project"]
path = llvm-project
url = https://github.com/ROCm/llvm-project.git
[submodule "rccl"]
path = rccl
url = https://github.com/ROCm/rccl.git
[submodule "rdc"]
path = rdc
url = https://github.com/ROCm/rdc.git
[submodule "rocAL"]
path = rocAL
url = https://github.com/ROCm/rocAL.git
[submodule "rocALUTION"]
path = rocALUTION
url = https://github.com/ROCm/rocALUTION.git
[submodule "rocBLAS"]
path = rocBLAS
url = https://github.com/ROCm/rocBLAS.git
[submodule "rocDecode"]
path = rocDecode
url = https://github.com/ROCm/rocDecode.git
[submodule "rocFFT"]
path = rocFFT
url = https://github.com/ROCm/rocFFT.git
[submodule "rocJPEG"]
path = rocJPEG
url = https://github.com/ROCm/rocJPEG.git
[submodule "rocPRIM"]
path = rocPRIM
url = https://github.com/ROCm/rocPRIM.git
[submodule "rocPyDecode"]
path = rocPyDecode
url = https://github.com/ROCm/rocPyDecode.git
[submodule "rocRAND"]
path = rocRAND
url = https://github.com/ROCm/rocRAND.git
[submodule "rocSHMEM"]
path = rocSHMEM
url = https://github.com/ROCm/rocSHMEM.git
[submodule "rocSOLVER"]
path = rocSOLVER
url = https://github.com/ROCm/rocSOLVER.git
[submodule "rocSPARSE"]
path = rocSPARSE
url = https://github.com/ROCm/rocSPARSE.git
[submodule "rocThrust"]
path = rocThrust
url = https://github.com/ROCm/rocThrust.git
[submodule "rocWMMA"]
path = rocWMMA
url = https://github.com/ROCm/rocWMMA.git
[submodule "rocm-cmake"]
path = rocm-cmake
url = https://github.com/ROCm/rocm-cmake.git
[submodule "rocm-core"]
path = rocm-core
url = https://github.com/ROCm/rocm-core.git
[submodule "rocm-examples"]
path = rocm-examples
url = https://github.com/ROCm/rocm-examples.git
[submodule "rocm_smi_lib"]
path = rocm_smi_lib
url = https://github.com/ROCm/rocm_smi_lib.git
[submodule "rocminfo"]
path = rocminfo
url = https://github.com/ROCm/rocminfo.git
[submodule "rocprofiler"]
path = rocprofiler
url = https://github.com/ROCm/rocprofiler.git
[submodule "rocprofiler-compute"]
path = rocprofiler-compute
url = https://github.com/ROCm/rocprofiler-compute.git
[submodule "rocprofiler-register"]
path = rocprofiler-register
url = https://github.com/ROCm/rocprofiler-register.git
[submodule "ROCK-Kernel-Driver"]
path = ROCK-Kernel-Driver
url = https://github.com/ROCm/ROCK-Kernel-Driver.git
[submodule "rocprofiler-sdk"]
path = rocprofiler-sdk
url = https://github.com/ROCm/rocprofiler-sdk.git
[submodule "rocprofiler-systems"]
path = rocprofiler-systems
url = https://github.com/ROCm/rocprofiler-systems.git
[submodule "rocr_debug_agent"]
path = rocr_debug_agent
url = https://github.com/ROCm/rocr_debug_agent.git
[submodule "roctracer"]
path = roctracer
url = https://github.com/ROCm/roctracer.git
[submodule "rpp"]
path = rpp
url = https://github.com/ROCm/rpp.git
[submodule "spirv-llvm-translator"]
path = spirv-llvm-translator
url = https://github.com/ROCm/spirv-llvm-translator.git

Submodule AMDMIGraphX deleted from 7a8103630c


Submodule HIPIFY deleted from ed0de49132


Submodule MIOpen deleted from f10c6ed808

Submodule MIVisionX deleted from a2b69e5b30

Submodule ROCR-Runtime deleted from 890a339b49

Submodule ROCdbgapi deleted from 59be7ff0aa


Submodule ROCgdb deleted from 401bb21f2f


Submodule ROCm deleted from b50948fe6b

Submodule Tensile deleted from be49885fce

Submodule TransferBench deleted from 3ea2f226ec


Submodule amdsmi deleted from aca110192b


Submodule clr deleted from a187df25c8

View File

@@ -0,0 +1,120 @@
unified_docker:
latest:
pull_tag: rocm/pytorch-training:v25.6
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
rocm_version: 6.4.1
pytorch_version: 2.8.0a0+git7d205b2
python_version: 3.10.17
transformer_engine_version: 1.14.0+2f85f5f2
flash_attention_version: 3.0.0.post1
hipblaslt_version: 0.15.0-8c6919d
triton_version: 3.3.0
model_groups:
- group: Pre-training
tag: pre-training
models:
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [pretrain]
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [pretrain]
- group: Fine-tuning
tag: fine-tuning
models:
- model: Llama 4 Scout 17B-16E
mad_tag: pyt_train_llama-4-scout-17b-16e
model_repo: Llama-4-17B_16E
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.3 70B
mad_tag: pyt_train_llama-3.3-70b
model_repo: Llama-3.3-70B
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.2 1B
mad_tag: pyt_train_llama-3.2-1b
model_repo: Llama-3.2-1B
url: https://huggingface.co/meta-llama/Llama-3.2-1B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 3B
mad_tag: pyt_train_llama-3.2-3b
model_repo: Llama-3.2-3B
url: https://huggingface.co/meta-llama/Llama-3.2-3B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 Vision 11B
mad_tag: pyt_train_llama-3.2-vision-11b
model_repo: Llama-3.2-Vision-11B
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.2 Vision 90B
mad_tag: pyt_train_llama-3.2-vision-90b
model_repo: Llama-3.2-Vision-90B
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.1 405B
mad_tag: pyt_train_llama-3.1-405b
model_repo: Llama-3.1-405B
url: https://huggingface.co/meta-llama/Llama-3.1-405B
precision: BF16
training_modes: [finetune_qlora, HF_finetune_lora]
- model: Llama 3 8B
mad_tag: pyt_train_llama-3-8b
model_repo: Llama-3-8B
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3 70B
mad_tag: pyt_train_llama-3-70b
model_repo: Llama-3-70B
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 7B
mad_tag: pyt_train_llama-2-7b
model_repo: Llama-2-7B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 2 13B
mad_tag: pyt_train_llama-2-13b
model_repo: Llama-2-13B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 70B
mad_tag: pyt_train_llama-2-70b
model_repo: Llama-2-70B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
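
This new data file drives the model picker and the per-model content in the templated RST further down (it is consumed via the datatemplate:yaml directive). A hedged consistency check that every model entry pairs a mad_tag with a model_repo, assuming the file path referenced by that directive:

    f=data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
    [ "$(grep -c 'mad_tag:' "$f")" -eq "$(grep -c 'model_repo:' "$f")" ] \
      && echo "every mad_tag has a matching model_repo"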

View File

@@ -18,11 +18,18 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
- PyTorch version
- Resources
* - 6.4.0
- 0.9.0.1
- 2.7.0
-
* :doc:`Documentation <../vllm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
* - 6.3.1
- 0.8.5 (0.8.6.dev)
- 2.7.0
-
* :doc:`Documentation <../vllm>`
* :doc:`Documentation <vllm-0.8.5-20250521>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
* - 6.3.1
@@ -57,7 +64,7 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
- 0.6.4
- 2.5.0
-
* :doc:`Documentation <vllm-0.4.3>`
* :doc:`Documentation <vllm-0.6.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
* - 6.2.0

View File

@@ -18,11 +18,18 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
- PyTorch version
- Resources
* - v25.5
* - v25.6
- 6.3.4
- 0.4.35
- 2.8.0a0+git7d205b2
-
* :doc:`Documentation <../pytorch-training>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
* - v25.5
- 6.3.4
- 2.7.0a0+git637433
-
* :doc:`Documentation <pytorch-training-v25.5>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
* - v25.4

View File

@@ -0,0 +1,437 @@
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
Training a model with PyTorch for ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of the ROCm PyTorch
training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
(``rocm/pytorch-training:v25.5``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+25a33da |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git53b53bf |
+--------------------------+--------------------------------+
| Triton | 3.2.0 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
* Llama 3.3 70B
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 2 70B
* FLUX.1-dev
.. note::
Only these models are supported in the following steps.
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
Benchmarking
============
Once the setup is complete, choose between two options to start benchmarking:
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
using one GPU with the float16 data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
The available models for MAD-integrated benchmarking are:
* ``pyt_train_llama-3.3-70b``
* ``pyt_train_llama-3.1-8b``
* ``pyt_train_llama-3.1-70b``
* ``pyt_train_flux``
MAD launches a Docker container with the name
``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required packages
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/pytorch-training:v25.5
Run the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. list-table::
:header-rows: 1
* - Library
- Benchmark model
- Reference
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
Along with the following datasets:
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
.. rubric:: Pretraining
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* -
- ``HF_finetune_lora``
- Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
* - ``$datatype``
- ``FP8`` or ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
* - ``$model_repo``
- ``Llama-3.3-70B``
- `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_
* -
- ``Llama-3.1-8B``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
* -
- ``Llama-3.1-70B``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* -
- ``Llama-2-70B``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
* -
- ``Flux``
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
.. note::
Occasionally, downloading the Flux dataset might fail. If this error occurs,
manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
.. rubric:: Fine-tuning
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
.. rubric:: Benchmarking examples
Here are some example commands to get started pretraining and fine-tuning with various model configurations.
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
* Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B
* Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B
* Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B
* Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
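
The archived page above reproduces the full standalone workflow for v25.5. One extra step worth taking inside the training_env container before benchmarking is confirming the accelerators are visible; rocm-smi ships with ROCm (output varies by system):

    rocm-smi --showproductname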

View File

@@ -9,28 +9,27 @@ Training a model with PyTorch for ROCm
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
(``rocm/pytorch-training:v25.5``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:
The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
| PyTorch | 2.8.0a0+git7d205b2 |
+--------------------------+--------------------------------+
| Python | 3.10 |
| Python | 3.10.17 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+25a33da |
| Transformer Engine | 1.14.0+2f85f5f2 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
| Flash Attention | 3.0.0.post1 |
+--------------------------+--------------------------------+
| hipBLASLt | git53b53bf |
| hipBLASLt | 0.15.0-8c6919d |
+--------------------------+--------------------------------+
| Triton | 3.2.0 |
| Triton | 3.3.0 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
@@ -40,395 +39,393 @@ Supported models
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
* Llama 3.3 70B
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
* Llama 3.1 8B
{% set unified_docker = data.unified_docker.latest %}
{% set model_groups = data.model_groups %}
* Llama 3.1 70B
.. raw:: html
* Llama 2 70B
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Workload</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
* FLUX.1-dev
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. note::
.. note::
Only these models are supported in the following steps.
Some models require an external license agreement through a third party (for example, Meta).
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements:
.. _amd-pytorch-training-performance-measurements:
Performance measurements
========================
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
page provides reference throughput and latency measurements for training
popular AI models.
System validation
=================
.. note::
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
System validation
=================
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
Benchmarking
============
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Once the setup is complete, choose between two options to start benchmarking:
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
.. tab-set::
Benchmarking
============
.. tab-item:: MAD-integrated benchmarking
Once the setup is complete, choose between two options to start benchmarking:
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. tab-set::
.. code-block:: shell
.. tab-item:: MAD-integrated benchmarking
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
using one GPU with the float16 data type on the host machine.
.. code-block:: shell
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
{% for model_group in model_groups %}
{% for model in model_group.models %}
The available models for MAD-integrated benchmarking are:
.. container:: model-doc {{ model.mad_tag }}
* ``pyt_train_llama-3.3-70b``
For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one GPU with the {{ model.precision }} data type on the host machine.
* ``pyt_train_llama-3.1-8b``
.. code-block:: shell
* ``pyt_train_llama-3.1-70b``
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{ model.mad_tag }} --keep-model-dir --live-output --timeout 28800
* ``pyt_train_flux``
MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
MAD launches a Docker container with the name
``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
{% endfor %}
{% endfor %}
.. tab-item:: Standalone benchmarking
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required packages
.. rubric:: Download the Docker image and required packages
Use the following command to pull the Docker image from Docker Hub.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
.. code-block:: shell
docker pull rocm/pytorch-training:v25.5
docker pull {{ unified_docker.pull_tag }}
Run the Docker container.
Run the Docker container.
.. code-block:: shell
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
Use these commands if you exit the ``training_env`` container and need to return to it.
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
docker start training_env
docker exec -it training_env bash
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
.. rubric:: Prepare training datasets and dependencies
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
export HF_TOKEN=$your_personal_hugging_face_access_token
Run the setup script to install libraries and datasets needed for benchmarking.
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
.. code-block:: shell
./pytorch_benchmark_setup.sh
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. container:: model-doc pyt_train_llama-3.1-8b
.. list-table::
:header-rows: 1
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
* - Library
- Benchmark model
- Reference
.. list-table::
:header-rows: 1
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - Library
- Reference
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
.. container:: model-doc pyt_train_llama-3.1-70b
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
.. list-table::
:header-rows: 1
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - Library
- Reference
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``torchdata``
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
.. container:: model-doc pyt_train_flux
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
.. list-table::
:header-rows: 1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - Library
- Reference
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
Along with the following datasets:
* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
.. rubric:: Pretraining
* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
.. code-block:: shell
* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
.. list-table::
:header-rows: 1
* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - Name
- Options
- Description
* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
* -
- ``HF_finetune_lora``
- Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
{% for model_group in model_groups %}
{% for model in model_group.models %}
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
* - ``$datatype``
- ``FP8`` or ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
.. container:: model-doc {{ model.mad_tag }}
* - ``$model_repo``
- ``Llama-3.3-70B``
- `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_
.. rubric:: Pretraining
* -
- ``Llama-3.1-8B``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
* -
- ``Llama-3.1-70B``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
.. code-block:: shell
* -
- ``Llama-2-70B``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
* -
- ``Flux``
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
.. list-table::
:header-rows: 1
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
* - Name
- Options
- Description
.. note::
{% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
* - ``$datatype``
- ``BF16`` or ``FP8``
- Only Llama 3.1 8B supports FP8 precision.
{% else %}
* - ``$datatype``
- ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
{% endif %}
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
.. rubric:: Fine-tuning
{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. note::
.. code-block:: shell
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
{% endif %}
{% endif %}
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
{% if model_group.tag == "fine-tuning" %}
.. container:: model-doc {{ model.mad_tag }}
Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
.. rubric:: Fine-tuning
.. code-block:: shell
To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
.. code-block:: shell
.. rubric:: Benchmarking examples
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
Here are some example commands to get started pretraining and fine-tuning with various model configurations.
.. list-table::
:header-rows: 1
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
* - Name
- Options
- Description
.. code-block:: shell
* - ``$training_mode``
- ``finetune_fw``
- Full weight fine-tuning (BF16 supported)
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* -
- ``finetune_lora``
- LoRA fine-tuning (BF16 supported)
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
* -
- ``finetune_qlora``
- QLoRA fine-tuning (BF16 supported)
.. code-block:: shell
* -
- ``HF_finetune_lora``
- LoRA fine-tuning with Hugging Face PEFT
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* - ``$datatype``
- ``BF16``
- All models support BF16.
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
.. code-block:: shell
.. note::
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
{{ model.model }} currently supports the following fine-tuning methods:
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
{% for method in model.training_modes %}
* ``{{ method }}``
{% endfor %}
{% if model.training_modes|length < 4 %}
.. code-block:: shell
The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
does not currently provide YAML configuration files for other combinations of
model and fine-tuning method.
However, you can still configure your own YAML files to enable support for
fine-tuning methods not listed here by following existing patterns in the
``/workspace/torchtune/recipes/configs`` directory.
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
.. rubric:: Benchmarking examples
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
* Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B
* Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B
* Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B
* Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
Previous versions
=================


Submodule half deleted from 1ddada2251


Submodule hip deleted from 22b0b2eb9a

Submodule hip-tests deleted from dc28111737

Submodule hipBLAS deleted from 2656692311

Submodule hipBLAS-common deleted from 7c1566ba46

Submodule hipBLASLt deleted from 4d62e135cf


Submodule hipCUB deleted from a6005943c5


Submodule hipFFT deleted from 396169c84a

Submodule hipRAND deleted from d2516cc199

Submodule hipSOLVER deleted from ca0de3c9c9

Submodule hipSPARSE deleted from a6c62e48eb

Submodule hipSPARSELt deleted from f3f4f590a4

Submodule hipTensor deleted from e5529b9291

Submodule hipfort deleted from f3d6aa3e86

Submodule hipother deleted from 49b1588f83

Submodule llvm-project deleted from c87081df21


Submodule rccl deleted from e72b592201


Submodule rdc deleted from 2d3a8d3017


Submodule rocAL deleted from 373ef865ac

Submodule rocALUTION deleted from cb256de357

Submodule rocBLAS deleted from 80e5394d6a

Submodule rocDecode deleted from a2a7b63cad


Submodule rocFFT deleted from 058ba87fdc

Submodule rocJPEG deleted from 73d36d35d9

Submodule rocPRIM deleted from d8771ec18a

Submodule rocPyDecode deleted from 848e49d29d

Submodule rocRAND deleted from 4d5d3a88d1

Submodule rocSHMEM deleted from d742043443

Submodule rocSOLVER deleted from db754e3f55

Submodule rocSPARSE deleted from 4953add0ae

Submodule rocThrust deleted from 6bf2777019

Submodule rocWMMA deleted from 1a5b623166

Submodule rocm-cmake deleted from ecc716b97c

Submodule rocm-core deleted from 69b59e5b5e

Submodule rocm-examples deleted from 3bbd2987a3

Submodule rocm_smi_lib deleted from e68c0d1767

Submodule rocminfo deleted from 6ea2ba38c8

Submodule rocprofiler deleted from 40da7312a0

Submodule rocprofiler-sdk deleted from e8e49fe769

Submodule rocr_debug_agent deleted from 9eec1a52a3

Submodule roctracer deleted from f55a694381


Submodule rpp deleted from 5fb204ca70