Compare commits


7 Commits

Author SHA1 Message Date
Mirza Halilcevic
9b102061f4 Add pybind11 as a pip module requirement for azure. 2025-06-24 08:06:52 -05:00
Daniel Su
f20e8dec8b [Ex CI] revert PRIM default branch to develop (#4960) 2025-06-23 16:35:02 -04:00
Daniel Su
10e9157f39 [Ex CI] allow rerun jobs to upload artifacts (#4959) 2025-06-23 15:37:52 -04:00
Daniel Su
a2ce6021cb [Ex CI] add more OSs to nightly build (#4958) 2025-06-23 15:13:11 -04:00
Peter Park
2196fc9a2f Fix pytorch training 25.6 doc (#4956)
* fix pytorch-training history

* fix pytorch-training

fix
2025-06-23 13:45:50 -04:00
Daniel Su
925689f89e [Ex CI] enable gfx1100 builds (#4954) 2025-06-23 11:26:35 -04:00
Peter Park
91a541f8b9 Update PyTorch training benchmark doc for v25.6 (#4950)
* update pytorch-training docker details

* add previous version

* add models data

* update models data id

* add models picker

* update data

* update fmt

fmt

* update data yaml

* update template

* update data

* fix

* fix vllm-0.6.4 broken link

* fix vllm history
2025-06-23 09:26:15 -04:00
78 changed files with 884 additions and 565 deletions

View File

@@ -61,12 +61,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
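
This hunk (repeated in the other pipeline files below) uncomments the gfx1100 entries, re-enabling those build jobs on both Ubuntu 22.04 and AlmaLinux 8, per commit 925689f89e. A hedged way to confirm which gfx targets a given agent's hardware actually reports, assuming rocminfo from a ROCm install is on PATH:

    rocminfo | grep -o 'gfx[0-9a-f]\+' | sort -u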

View File

@@ -27,6 +27,7 @@ parameters:
- numpy
- tomli
- scipy
- pybind11
- name: rocmDependencies
type: object
default:
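
This hunk adds pybind11 to the pip modules installed on the build agents (commit 9b102061f4). A minimal sanity check that the module is usable, relying on pybind11's own helper that prints its include flags:

    python3 -m pip install pybind11
    python3 -m pybind11 --includes   # prints -I flags for the Python and pybind11 headers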

View File

@@ -60,12 +60,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 1, shardCount: 3 }
@@ -170,7 +170,7 @@ jobs:
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_${{ job.shard }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_shard_${{ job.shard }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
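
The rename inserts a literal shard segment into the test job identifier so multi-shard jobs stay distinguishable. Illustrative names, with all parameter values hypothetical:

    # before: rocPRIM_test_ubuntu2204_gfx942_1
    # after:  rocPRIM_test_ubuntu2204_gfx942_shard_1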

View File

@@ -64,12 +64,12 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }

View File

@@ -3,12 +3,21 @@ parameters:
- name: jobList
type: object
default:
- gfx942-staging:
target: gfx942
source: staging
- gfx90a-staging:
target: gfx90a
source: staging
- { os: ubuntu2204, target: gfx942, source: staging }
- { os: ubuntu2204, target: gfx90a, source: staging }
- { os: ubuntu2204, target: gfx1201, source: staging }
- { os: ubuntu2204, target: gfx1100, source: staging }
- { os: ubuntu2204, target: gfx1030, source: staging }
- { os: ubuntu2404, target: gfx942, source: staging }
- { os: ubuntu2404, target: gfx90a, source: staging }
- { os: ubuntu2404, target: gfx1201, source: staging }
- { os: ubuntu2404, target: gfx1100, source: staging }
- { os: ubuntu2404, target: gfx1030, source: staging }
- { os: almalinux8, target: gfx942, source: staging }
- { os: almalinux8, target: gfx90a, source: staging }
- { os: almalinux8, target: gfx1201, source: staging }
- { os: almalinux8, target: gfx1100, source: staging }
- { os: almalinux8, target: gfx1030, source: staging }
- name: rocmDependencies
type: object
default:
@@ -16,9 +25,9 @@ parameters:
- amdsmi
- aomp-extras
- aomp
- clr
- composable_kernel
- half
- HIP
- hip-tests
- hipBLAS
- hipBLAS-common
@@ -83,7 +92,7 @@ schedules:
jobs:
- ${{ each job in parameters.jobList }}:
- job: rocm_nightly_${{ job.target }}_${{ job.source }}
- job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -108,9 +117,9 @@ jobs:
parameters:
dependencySource: ${{ job.source }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
skipLibraryLinking: true
skipLlvmSymlink: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh $(Agent.BuildDirectory)/rocm

View File

@@ -26,7 +26,7 @@ steps:
includeRootFolder: false
archiveType: 'tar'
tarCompression: 'gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
@@ -38,7 +38,7 @@ steps:
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz" >> pipelineArtifacts.txt
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz" >> pipelineArtifacts.txt
# then publish it
- ${{ if parameters.publish }}:
- task: PublishPipelineArtifact@1
@@ -46,4 +46,5 @@ steps:
displayName: '${{ parameters.artifactName }} Publish'
retryCountOnTaskFailure: 3
inputs:
artifactName: ${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt)
targetPath: '$(Build.ArtifactStagingDirectory)'
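
Appending $(System.JobAttempt) to both the archive file and the published artifact name is what allows rerun jobs to upload artifacts (commit 10e9157f39): a pipeline run cannot publish two artifacts under the same name, so each attempt now produces a unique one. A sketch of the resulting names, with every field value hypothetical:

    # attempt 1:         rocPRIM_12345_20250624.1_ubuntu2204_gfx942_drop_1.tar.gz
    # attempt 2 (rerun): rocPRIM_12345_20250624.1_ubuntu2204_gfx942_drop_2.tar.gz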

View File

@@ -109,7 +109,7 @@ parameters:
hasGpuTarget: false
hipCUB:
pipelineId: $(HIPCUB_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
hipFFT:
@@ -129,7 +129,7 @@ parameters:
hasGpuTarget: false
hipRAND:
pipelineId: $(HIPRAND_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
hipSOLVER:
@@ -264,7 +264,7 @@ parameters:
hasGpuTarget: false
rocPRIM:
pipelineId: $(ROCPRIM_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
rocprofiler:
@@ -304,7 +304,7 @@ parameters:
hasGpuTarget: false
rocRAND:
pipelineId: $(ROCRAND_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
rocr_debug_agent:
@@ -329,7 +329,7 @@ parameters:
hasGpuTarget: false
rocThrust:
pipelineId: $(ROCTHRUST_PIPELINE_ID)
stagingBranch: release-staging/rocm-rel-7.0
stagingBranch: develop
mainlineBranch: develop
hasGpuTarget: true
roctracer:
@@ -438,14 +438,14 @@ steps:
targetType: inline
script: |
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- task: Bash@3
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
inputs:
targetType: inline
script: |
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
done
# dlopen calls within a ctest or pytest sequence run into issues when the shared library symlink convention is not followed
# the convention is as follows:
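
The switch from ln -s to ln -sr makes the links relative to their own location instead of embedding the absolute agent path, so they keep resolving if the rocm tree is mounted or copied elsewhere. A small demonstration with throwaway paths:

    mkdir -p /tmp/agent/rocm/llvm /tmp/agent/rocm/lib
    ln -s  /tmp/agent/rocm/llvm /tmp/agent/rocm/lib/llvm-abs
    ln -sr /tmp/agent/rocm/llvm /tmp/agent/rocm/lib/llvm-rel
    readlink /tmp/agent/rocm/lib/llvm-abs   # /tmp/agent/rocm/llvm (breaks if the tree moves)
    readlink /tmp/agent/rocm/lib/llvm-rel   # ../llvm (survives a move)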

.gitmodules (vendored)
View File

@@ -1,195 +0,0 @@
[submodule "AMDMIGraphX"]
path = AMDMIGraphX
url = https://github.com/ROCm/AMDMIGraphX.git
[submodule "MIOpen"]
path = MIOpen
url = https://github.com/ROCm/MIOpen.git
[submodule "MIVisionX"]
path = MIVisionX
url = https://github.com/ROCm/MIVisionX.git
[submodule "ROCR-Runtime"]
path = ROCR-Runtime
url = https://github.com/ROCm/ROCR-Runtime.git
[submodule "ROCdbgapi"]
path = ROCdbgapi
url = https://github.com/ROCm/ROCdbgapi.git
[submodule "ROCgdb"]
path = ROCgdb
url = https://github.com/ROCm/ROCgdb.git
[submodule "ROCmValidationSuite"]
path = ROCmValidationSuite
url = https://github.com/ROCm/ROCmValidationSuite.git
[submodule "Tensile"]
path = Tensile
url = https://github.com/ROCm/Tensile.git
[submodule "TransferBench"]
path = TransferBench
url = https://github.com/ROCm/TransferBench.git
[submodule "amdsmi"]
path = amdsmi
url = https://github.com/ROCm/amdsmi.git
[submodule "openmp-extras/aomp"]
path = openmp-extras/aomp
url = https://github.com/ROCm/aomp.git
[submodule "openmp-extras/aomp-extras"]
path = openmp-extras/aomp-extras
url = https://github.com/ROCm/aomp-extras.git
[submodule "ROCm"]
path = ROCm
url = https://github.com/ROCm/ROCm.git
[submodule "clr"]
path = clr
url = https://github.com/ROCm/clr.git
[submodule "composable_kernel"]
path = composable_kernel
url = https://github.com/ROCm/composable_kernel.git
[submodule "rocm_bandwidth_test"]
path = rocm_bandwidth_test
url = https://github.com/ROCm/rocm_bandwidth_test.git
[submodule "openmp-extras/flang"]
path = openmp-extras/flang
url = https://github.com/ROCm/flang.git
[submodule "half"]
path = half
url = https://github.com/ROCm/half.git
[submodule "hip"]
path = hip
url = https://github.com/ROCm/hip.git
[submodule "hip-tests"]
path = hip-tests
url = https://github.com/ROCm/hip-tests.git
[submodule "hipBLAS"]
path = hipBLAS
url = https://github.com/ROCm/hipBLAS.git
[submodule "hipBLAS-common"]
path = hipBLAS-common
url = https://github.com/ROCm/hipBLAS-common.git
[submodule "hipBLASLt"]
path = hipBLASLt
url = https://github.com/ROCm/hipBLASLt.git
[submodule "hipCUB"]
path = hipCUB
url = https://github.com/ROCm/hipCUB.git
[submodule "hipFFT"]
path = hipFFT
url = https://github.com/ROCm/hipFFT.git
[submodule "hipRAND"]
path = hipRAND
url = https://github.com/ROCm/hipRAND.git
[submodule "hipSOLVER"]
path = hipSOLVER
url = https://github.com/ROCm/hipSOLVER.git
[submodule "hipSPARSE"]
path = hipSPARSE
url = https://github.com/ROCm/hipSPARSE.git
[submodule "hipSPARSELt"]
path = hipSPARSELt
url = https://github.com/ROCm/hipSPARSELt.git
[submodule "hipTensor"]
path = hipTensor
url = https://github.com/ROCm/hipTensor.git
[submodule "hipfort"]
path = hipfort
url = https://github.com/ROCm/hipfort.git
[submodule "HIPIFY"]
path = HIPIFY
url = https://github.com/ROCm/hipify.git
[submodule "hipother"]
path = hipother
url = https://github.com/ROCm/hipother.git
[submodule "llvm-project"]
path = llvm-project
url = https://github.com/ROCm/llvm-project.git
[submodule "rccl"]
path = rccl
url = https://github.com/ROCm/rccl.git
[submodule "rdc"]
path = rdc
url = https://github.com/ROCm/rdc.git
[submodule "rocAL"]
path = rocAL
url = https://github.com/ROCm/rocAL.git
[submodule "rocALUTION"]
path = rocALUTION
url = https://github.com/ROCm/rocALUTION.git
[submodule "rocBLAS"]
path = rocBLAS
url = https://github.com/ROCm/rocBLAS.git
[submodule "rocDecode"]
path = rocDecode
url = https://github.com/ROCm/rocDecode.git
[submodule "rocFFT"]
path = rocFFT
url = https://github.com/ROCm/rocFFT.git
[submodule "rocJPEG"]
path = rocJPEG
url = https://github.com/ROCm/rocJPEG.git
[submodule "rocPRIM"]
path = rocPRIM
url = https://github.com/ROCm/rocPRIM.git
[submodule "rocPyDecode"]
path = rocPyDecode
url = https://github.com/ROCm/rocPyDecode.git
[submodule "rocRAND"]
path = rocRAND
url = https://github.com/ROCm/rocRAND.git
[submodule "rocSHMEM"]
path = rocSHMEM
url = https://github.com/ROCm/rocSHMEM.git
[submodule "rocSOLVER"]
path = rocSOLVER
url = https://github.com/ROCm/rocSOLVER.git
[submodule "rocSPARSE"]
path = rocSPARSE
url = https://github.com/ROCm/rocSPARSE.git
[submodule "rocThrust"]
path = rocThrust
url = https://github.com/ROCm/rocThrust.git
[submodule "rocWMMA"]
path = rocWMMA
url = https://github.com/ROCm/rocWMMA.git
[submodule "rocm-cmake"]
path = rocm-cmake
url = https://github.com/ROCm/rocm-cmake.git
[submodule "rocm-core"]
path = rocm-core
url = https://github.com/ROCm/rocm-core.git
[submodule "rocm-examples"]
path = rocm-examples
url = https://github.com/ROCm/rocm-examples.git
[submodule "rocm_smi_lib"]
path = rocm_smi_lib
url = https://github.com/ROCm/rocm_smi_lib.git
[submodule "rocminfo"]
path = rocminfo
url = https://github.com/ROCm/rocminfo.git
[submodule "rocprofiler"]
path = rocprofiler
url = https://github.com/ROCm/rocprofiler.git
[submodule "rocprofiler-compute"]
path = rocprofiler-compute
url = https://github.com/ROCm/rocprofiler-compute.git
[submodule "rocprofiler-register"]
path = rocprofiler-register
url = https://github.com/ROCm/rocprofiler-register.git
[submodule "ROCK-Kernel-Driver"]
path = ROCK-Kernel-Driver
url = https://github.com/ROCm/ROCK-Kernel-Driver.git
[submodule "rocprofiler-sdk"]
path = rocprofiler-sdk
url = https://github.com/ROCm/rocprofiler-sdk.git
[submodule "rocprofiler-systems"]
path = rocprofiler-systems
url = https://github.com/ROCm/rocprofiler-systems.git
[submodule "rocr_debug_agent"]
path = rocr_debug_agent
url = https://github.com/ROCm/rocr_debug_agent.git
[submodule "roctracer"]
path = roctracer
url = https://github.com/ROCm/roctracer.git
[submodule "rpp"]
path = rpp
url = https://github.com/ROCm/rpp.git
[submodule "spirv-llvm-translator"]
path = spirv-llvm-translator
url = https://github.com/ROCm/spirv-llvm-translator.git

Submodule AMDMIGraphX deleted from 7a8103630c


Submodule HIPIFY deleted from ed0de49132


Submodule MIOpen deleted from f10c6ed808

Submodule MIVisionX deleted from a2b69e5b30

Submodule ROCR-Runtime deleted from 890a339b49

Submodule ROCdbgapi deleted from 59be7ff0aa


Submodule ROCgdb deleted from 401bb21f2f


Submodule ROCm deleted from b50948fe6b

Submodule Tensile deleted from be49885fce

Submodule TransferBench deleted from 3ea2f226ec


Submodule amdsmi deleted from aca110192b


Submodule clr deleted from a187df25c8

View File

@@ -0,0 +1,120 @@
unified_docker:
latest:
pull_tag: rocm/pytorch-training:v25.6
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
rocm_version: 6.4.1
pytorch_version: 2.8.0a0+git7d205b2
python_version: 3.10.17
transformer_engine_version: 1.14.0+2f85f5f2
flash_attention_version: 3.0.0.post1
hipblaslt_version: 0.15.0-8c6919d
triton_version: 3.3.0
model_groups:
- group: Pre-training
tag: pre-training
models:
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [pretrain]
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [pretrain]
- group: Fine-tuning
tag: fine-tuning
models:
- model: Llama 4 Scout 17B-16E
mad_tag: pyt_train_llama-4-scout-17b-16e
model_repo: Llama-4-17B_16E
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.3 70B
mad_tag: pyt_train_llama-3.3-70b
model_repo: Llama-3.3-70B
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.2 1B
mad_tag: pyt_train_llama-3.2-1b
model_repo: Llama-3.2-1B
url: https://huggingface.co/meta-llama/Llama-3.2-1B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 3B
mad_tag: pyt_train_llama-3.2-3b
model_repo: Llama-3.2-3B
url: https://huggingface.co/meta-llama/Llama-3.2-3B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 Vision 11B
mad_tag: pyt_train_llama-3.2-vision-11b
model_repo: Llama-3.2-Vision-11B
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.2 Vision 90B
mad_tag: pyt_train_llama-3.2-vision-90b
model_repo: Llama-3.2-Vision-90B
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.1 405B
mad_tag: pyt_train_llama-3.1-405b
model_repo: Llama-3.1-405B
url: https://huggingface.co/meta-llama/Llama-3.1-405B
precision: BF16
training_modes: [finetune_qlora, HF_finetune_lora]
- model: Llama 3 8B
mad_tag: pyt_train_llama-3-8b
model_repo: Llama-3-8B
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3 70B
mad_tag: pyt_train_llama-3-70b
model_repo: Llama-3-70B
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 7B
mad_tag: pyt_train_llama-2-7b
model_repo: Llama-2-7B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 2 13B
mad_tag: pyt_train_llama-2-13b
model_repo: Llama-2-13B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 70B
mad_tag: pyt_train_llama-2-70b
model_repo: Llama-2-70B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
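
This new data file drives the model picker and the per-model content in the templated RST further down (it is consumed via the datatemplate:yaml directive). A hedged consistency check that every model entry pairs a mad_tag with a model_repo, assuming the file path referenced by that directive:

    f=data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
    [ "$(grep -c 'mad_tag:' "$f")" -eq "$(grep -c 'model_repo:' "$f")" ] \
      && echo "every mad_tag has a matching model_repo"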

View File

@@ -18,11 +18,18 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
- PyTorch version
- Resources
* - 6.4.0
- 0.9.0.1
- 2.7.0
-
* :doc:`Documentation <../vllm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
* - 6.3.1
- 0.8.5 (0.8.6.dev)
- 2.7.0
-
* :doc:`Documentation <../vllm>`
* :doc:`Documentation <vllm-0.8.5-20250521>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
* - 6.3.1
@@ -57,7 +64,7 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
- 0.6.4
- 2.5.0
-
* :doc:`Documentation <vllm-0.4.3>`
* :doc:`Documentation <vllm-0.6.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
* - 6.2.0

View File

@@ -18,11 +18,18 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
- PyTorch version
- Resources
* - v25.5
* - v25.6
- 6.3.4
- 0.4.35
- 2.8.0a0+git7d205b2
-
* :doc:`Documentation <../pytorch-training>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
* - v25.5
- 6.3.4
- 2.7.0a0+git637433
-
* :doc:`Documentation <pytorch-training-v25.5>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
* - v25.4

View File

@@ -0,0 +1,437 @@
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
Training a model with PyTorch for ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of the ROCm PyTorch
training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
(``rocm/pytorch-training:v25.5``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
+--------------------------+--------------------------------+
| Python | 3.10 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+25a33da |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
+--------------------------+--------------------------------+
| hipBLASLt | git53b53bf |
+--------------------------+--------------------------------+
| Triton | 3.2.0 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
* Llama 3.3 70B
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 2 70B
* FLUX.1-dev
.. note::
Only these models are supported in the following steps.
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
Benchmarking
============
Once the setup is complete, choose between two options to start benchmarking:
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
using one GPU with the float16 data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
The available models for MAD-integrated benchmarking are:
* ``pyt_train_llama-3.3-70b``
* ``pyt_train_llama-3.1-8b``
* ``pyt_train_llama-3.1-70b``
* ``pyt_train_flux``
MAD launches a Docker container with the name
``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required packages
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/pytorch-training:v25.5
Run the Docker container.
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. list-table::
:header-rows: 1
* - Library
- Benchmark model
- Reference
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
Along with the following datasets:
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
.. rubric:: Pretraining
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* -
- ``HF_finetune_lora``
- Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
* - ``$datatype``
- ``FP8`` or ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
* - ``$model_repo``
- ``Llama-3.3-70B``
- `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_
* -
- ``Llama-3.1-8B``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
* -
- ``Llama-3.1-70B``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* -
- ``Llama-2-70B``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
* -
- ``Flux``
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
.. note::
Occasionally, downloading the Flux dataset might fail. If this error occurs,
manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
.. rubric:: Fine-tuning
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
.. rubric:: Benchmarking examples
Here are some example commands to get started pretraining and fine-tuning with various model configurations.
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
* Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B
* Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B
* Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B
* Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
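
The archived page above reproduces the full standalone workflow for v25.5. One extra step worth taking inside the training_env container before benchmarking is confirming the accelerators are visible; rocm-smi ships with ROCm (output varies by system):

    rocm-smi --showproductname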

View File

@@ -9,28 +9,27 @@ Training a model with PyTorch for ROCm
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
(``rocm/pytorch-training:v25.5``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:
The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:
+--------------------------+--------------------------------+
| Software component | Version |
+==========================+================================+
| ROCm | 6.3.4 |
+--------------------------+--------------------------------+
| PyTorch | 2.7.0a0+git637433 |
| PyTorch | 2.8.0a0+git7d205b2 |
+--------------------------+--------------------------------+
| Python | 3.10 |
| Python | 3.10.17 |
+--------------------------+--------------------------------+
| Transformer Engine | 1.12.0.dev0+25a33da |
| Transformer Engine | 1.14.0+2f85f5f2 |
+--------------------------+--------------------------------+
| Flash Attention | 3.0.0 |
| Flash Attention | 3.0.0.post1 |
+--------------------------+--------------------------------+
| hipBLASLt | git53b53bf |
| hipBLASLt | 0.15.0-8c6919d |
+--------------------------+--------------------------------+
| Triton | 3.2.0 |
| Triton | 3.3.0 |
+--------------------------+--------------------------------+
.. _amd-pytorch-training-model-support:
@@ -40,395 +39,393 @@ Supported models
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
* Llama 3.3 70B
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
* Llama 3.1 8B
{% set unified_docker = data.unified_docker.latest %}
{% set model_groups = data.model_groups %}
* Llama 3.1 70B
.. raw:: html
* Llama 2 70B
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Workload</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
* FLUX.1-dev
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. note::
.. note::
Only these models are supported in the following steps.
Some models require an external license agreement through a third party (for example, Meta).
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).
.. _amd-pytorch-training-performance-measurements:
.. _amd-pytorch-training-performance-measurements:
Performance measurements
========================
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
page provides reference throughput and latency measurements for training
popular AI models.
System validation
=================
.. note::
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
System validation
=================
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
Benchmarking
============
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Once the setup is complete, choose between two options to start benchmarking:
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't validate configurations and run conditions outside those described.
.. tab-set::
Benchmarking
============
.. tab-item:: MAD-integrated benchmarking
Once the setup is complete, choose between two options to start benchmarking:
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. tab-set::
.. code-block:: shell
.. tab-item:: MAD-integrated benchmarking
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
using one GPU with the float16 data type on the host machine.
.. code-block:: shell
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
{% for model_group in model_groups %}
{% for model in model_group.models %}
The available models for MAD-integrated benchmarking are:
.. container:: model-doc {{ model.mad_tag }}
* ``pyt_train_llama-3.3-70b``
For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one GPU with the {{ model.precision }} data type on the host machine.
* ``pyt_train_llama-3.1-8b``
.. code-block:: shell
* ``pyt_train_llama-3.1-70b``
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{ model.mad_tag }} --keep-model-dir --live-output --timeout 28800
* ``pyt_train_flux``
MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
MAD launches a Docker container with the name
``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
{% endfor %}
{% endfor %}
.. tab-item:: Standalone benchmarking
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required packages
.. rubric:: Download the Docker image and required packages
Use the following command to pull the Docker image from Docker Hub.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
.. code-block:: shell
docker pull rocm/pytorch-training:v25.5
docker pull {{ unified_docker.pull_tag }}
Run the Docker container.
Run the Docker container.
.. code-block:: shell
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
Use these commands if you exit the ``training_env`` container and need to return to it.
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
docker start training_env
docker exec -it training_env bash
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
.. rubric:: Prepare training datasets and dependencies
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
export HF_TOKEN=$your_personal_hugging_face_access_token
Run the setup script to install libraries and datasets needed for benchmarking.
Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
.. code-block:: shell
./pytorch_benchmark_setup.sh
./pytorch_benchmark_setup.sh
``pytorch_benchmark_setup.sh`` installs the following libraries:
.. container:: model-doc pyt_train_llama-3.1-8b
.. list-table::
:header-rows: 1
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
* - Library
- Benchmark model
- Reference
.. list-table::
:header-rows: 1
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - Library
- Reference
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_
.. container:: model-doc pyt_train_llama-3.1-70b
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_
.. list-table::
:header-rows: 1
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_
* - Library
- Reference
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``torchdata``
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
.. container:: model-doc pyt_train_flux
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
.. list-table::
:header-rows: 1
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - Library
- Reference
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
Along with the following datasets:
* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_
* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_
* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
.. rubric:: Pretraining
* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
.. code-block:: shell
* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length
* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
.. list-table::
:header-rows: 1
* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
* - Name
- Options
- Description
* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining
* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
* -
- ``HF_finetune_lora``
- Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)
{% for model_group in model_groups %}
{% for model in model_group.models %}
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
* - ``$datatype``
- ``FP8`` or ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
.. container:: model-doc {{ model.mad_tag }}
* - ``$model_repo``
- ``Llama-3.3-70B``
- `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_
.. rubric:: Pretraining
* -
- ``Llama-3.1-8B``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_
To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
* -
- ``Llama-3.1-70B``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
.. code-block:: shell
* -
- ``Llama-2-70B``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
* -
- ``Flux``
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
.. list-table::
:header-rows: 1
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
* - Name
- Options
- Description
.. note::
{% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
* - ``$datatype``
- ``BF16`` or ``FP8``
- Only Llama 3.1 8B supports FP8 precision.
{% else %}
* - ``$datatype``
- ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
{% endif %}
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
.. rubric:: Fine-tuning
{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.
.. note::
.. code-block:: shell
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
{% endif %}
{% endif %}
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B
{% if model_group.tag == "fine-tuning" %}
.. container:: model-doc {{ model.mad_tag }}
Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.
.. rubric:: Fine-tuning
.. code-block:: shell
To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
.. code-block:: shell
.. rubric:: Benchmarking examples
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
Here are some example commands to get started pretraining and fine-tuning with various model configurations.
.. list-table::
:header-rows: 1
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.
* - Name
- Options
- Description
.. code-block:: shell
* - ``$training_mode``
- ``finetune_fw``
- Full weight fine-tuning (BF16 supported)
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192
* -
- ``finetune_lora``
- LoRA fine-tuning (BF16 supported)
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.
* -
- ``finetune_qlora``
- QLoRA fine-tuning (BF16 supported)
.. code-block:: shell
* -
- ``HF_finetune_lora``
- LoRA fine-tuning with Hugging Face PEFT
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192
* - ``$datatype``
- ``BF16``
- All models support BF16.
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.
* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
.. code-block:: shell
.. note::
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux
{{ model.model }} currently supports the following fine-tuning methods:
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B
{% for method in model.training_modes %}
* ``{{ method }}``
{% endfor %}
{% if model.training_modes|length < 4 %}
.. code-block:: shell
The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
does not currently provide YAML configuration files for other combinations of
model and fine-tuning method.
However, you can still configure your own YAML files to enable support for
fine-tuning methods not listed here by following existing patterns in the
``/workspace/torchtune/recipes/configs`` directory.
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B
.. rubric:: Benchmarking examples
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B
* Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B
* Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B
* Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B
* Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B
.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
Previous versions
=================


Submodule half deleted from 1ddada2251


Submodule hip deleted from 22b0b2eb9a

Submodule hip-tests deleted from dc28111737

Submodule hipBLAS deleted from 2656692311

Submodule hipBLAS-common deleted from 7c1566ba46

Submodule hipBLASLt deleted from 4d62e135cf


Submodule hipCUB deleted from a6005943c5


Submodule hipFFT deleted from 396169c84a

Submodule hipRAND deleted from d2516cc199

Submodule hipSOLVER deleted from ca0de3c9c9

Submodule hipSPARSE deleted from a6c62e48eb

Submodule hipSPARSELt deleted from f3f4f590a4

Submodule hipTensor deleted from e5529b9291

Submodule hipfort deleted from f3d6aa3e86

Submodule hipother deleted from 49b1588f83

Submodule llvm-project deleted from c87081df21


Submodule rccl deleted from e72b592201


Submodule rdc deleted from 2d3a8d3017


Submodule rocAL deleted from 373ef865ac

Submodule rocALUTION deleted from cb256de357

Submodule rocBLAS deleted from 80e5394d6a

Submodule rocDecode deleted from a2a7b63cad


Submodule rocFFT deleted from 058ba87fdc

Submodule rocJPEG deleted from 73d36d35d9

Submodule rocPRIM deleted from d8771ec18a

Submodule rocPyDecode deleted from 848e49d29d

Submodule rocRAND deleted from 4d5d3a88d1

Submodule rocSHMEM deleted from d742043443

Submodule rocSOLVER deleted from db754e3f55

Submodule rocSPARSE deleted from 4953add0ae

Submodule rocThrust deleted from 6bf2777019

Submodule rocWMMA deleted from 1a5b623166

Submodule rocm-cmake deleted from ecc716b97c

Submodule rocm-core deleted from 69b59e5b5e

Submodule rocm-examples deleted from 3bbd2987a3

Submodule rocm_smi_lib deleted from e68c0d1767

Submodule rocminfo deleted from 6ea2ba38c8

Submodule rocprofiler deleted from 40da7312a0

Submodule rocprofiler-sdk deleted from e8e49fe769

Submodule rocr_debug_agent deleted from 9eec1a52a3

Submodule roctracer deleted from f55a694381


Submodule rpp deleted from 5fb204ca70