Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-10 23:28:03 -05:00)

Compare commits: 9 commits, cpattigi-p...docs/7.0.0
| Author | SHA1 | Date |
|---|---|---|
|  | a4b1b2cc67 |  |
|  | 4f592f8949 |  |
|  | ac2df2961d |  |
|  | f20e8dec8b |  |
|  | 10e9157f39 |  |
|  | a2ce6021cb |  |
|  | 2196fc9a2f |  |
|  | 925689f89e |  |
|  | 91a541f8b9 |  |
@@ -61,12 +61,12 @@ parameters:
   - { os: ubuntu2204, packageManager: apt, target: gfx942 }
   - { os: ubuntu2204, packageManager: apt, target: gfx90a }
   - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-  # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
+  - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
   - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
   - { os: almalinux8, packageManager: dnf, target: gfx942 }
   - { os: almalinux8, packageManager: dnf, target: gfx90a }
   - { os: almalinux8, packageManager: dnf, target: gfx1201 }
-  # - { os: almalinux8, packageManager: dnf, target: gfx1100 }
+  - { os: almalinux8, packageManager: dnf, target: gfx1100 }
   - { os: almalinux8, packageManager: dnf, target: gfx1030 }
  testJobs:
   - { os: ubuntu2204, packageManager: apt, target: gfx942 }
@@ -60,12 +60,12 @@ parameters:
   - { os: ubuntu2204, packageManager: apt, target: gfx942 }
   - { os: ubuntu2204, packageManager: apt, target: gfx90a }
   - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-  # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
+  - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
   - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
   - { os: almalinux8, packageManager: dnf, target: gfx942 }
   - { os: almalinux8, packageManager: dnf, target: gfx90a }
   - { os: almalinux8, packageManager: dnf, target: gfx1201 }
-  # - { os: almalinux8, packageManager: dnf, target: gfx1100 }
+  - { os: almalinux8, packageManager: dnf, target: gfx1100 }
   - { os: almalinux8, packageManager: dnf, target: gfx1030 }
  testJobs:
   - { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 1, shardCount: 3 }
@@ -170,7 +170,7 @@ jobs:
 - ${{ if eq(parameters.unifiedBuild, False) }}:
   - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_${{ job.shard }}
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_shard_${{ job.shard }}
      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
      condition:
        and(succeeded(),
@@ -64,12 +64,12 @@ parameters:
   - { os: ubuntu2204, packageManager: apt, target: gfx942 }
   - { os: ubuntu2204, packageManager: apt, target: gfx90a }
   - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-  # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
+  - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
   - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
   - { os: almalinux8, packageManager: dnf, target: gfx942 }
   - { os: almalinux8, packageManager: dnf, target: gfx90a }
   - { os: almalinux8, packageManager: dnf, target: gfx1201 }
-  # - { os: almalinux8, packageManager: dnf, target: gfx1100 }
+  - { os: almalinux8, packageManager: dnf, target: gfx1100 }
   - { os: almalinux8, packageManager: dnf, target: gfx1030 }
  testJobs:
   - { os: ubuntu2204, packageManager: apt, target: gfx942 }
@@ -3,12 +3,21 @@ parameters:
 - name: jobList
   type: object
   default:
-  - gfx942-staging:
-      target: gfx942
-      source: staging
-  - gfx90a-staging:
-      target: gfx90a
-      source: staging
+  - { os: ubuntu2204, target: gfx942, source: staging }
+  - { os: ubuntu2204, target: gfx90a, source: staging }
+  - { os: ubuntu2204, target: gfx1201, source: staging }
+  - { os: ubuntu2204, target: gfx1100, source: staging }
+  - { os: ubuntu2204, target: gfx1030, source: staging }
+  - { os: ubuntu2404, target: gfx942, source: staging }
+  - { os: ubuntu2404, target: gfx90a, source: staging }
+  - { os: ubuntu2404, target: gfx1201, source: staging }
+  - { os: ubuntu2404, target: gfx1100, source: staging }
+  - { os: ubuntu2404, target: gfx1030, source: staging }
+  - { os: almalinux8, target: gfx942, source: staging }
+  - { os: almalinux8, target: gfx90a, source: staging }
+  - { os: almalinux8, target: gfx1201, source: staging }
+  - { os: almalinux8, target: gfx1100, source: staging }
+  - { os: almalinux8, target: gfx1030, source: staging }
 - name: rocmDependencies
   type: object
   default:
@@ -16,9 +25,9 @@ parameters:
   - amdsmi
   - aomp-extras
   - aomp
   - clr
   - composable_kernel
   - half
   - HIP
   - hip-tests
   - hipBLAS
   - hipBLAS-common
@@ -83,7 +92,7 @@ schedules:

 jobs:
 - ${{ each job in parameters.jobList }}:
-  - job: rocm_nightly_${{ job.target }}_${{ job.source }}
+  - job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
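These job-name changes matter because Azure DevOps requires job names to be unique within a pipeline; once the matrix spans multiple operating systems (or shards), a name built only from target and source collides. A minimal Python sketch of the collision check, using a hypothetical two-entry job matrix:

    from collections import Counter

    # Hypothetical job matrix mirroring the jobList parameter above.
    job_list = [
        {"os": "ubuntu2204", "target": "gfx942", "source": "staging"},
        {"os": "almalinux8", "target": "gfx942", "source": "staging"},
    ]

    def job_names(jobs, with_os):
        """Build names the way the template interpolates them."""
        if with_os:
            return [f"rocm_nightly_{j['os']}_{j['target']}_{j['source']}" for j in jobs]
        return [f"rocm_nightly_{j['target']}_{j['source']}" for j in jobs]

    # Without the os segment, both entries map to the same job name.
    for with_os in (False, True):
        names = Counter(job_names(job_list, with_os))
        dupes = [n for n, c in names.items() if c > 1]
        print("with os:" if with_os else "without os:", dupes or "no collisions")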
@@ -108,9 +117,9 @@ jobs:
     parameters:
       dependencySource: ${{ job.source }}
       dependencyList: ${{ parameters.rocmDependencies }}
       os: ${{ job.os }}
       gpuTarget: ${{ job.target }}
       skipLibraryLinking: true
       skipLlvmSymlink: true
 - script: df -h
   displayName: System disk space after ROCm
 - script: du -sh $(Agent.BuildDirectory)/rocm
@@ -22,19 +22,16 @@ steps:
 - task: DownloadPipelineArtifact@2
   displayName: Download ${{ parameters.componentName }}
   inputs:
+    itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
+    targetPath: '$(Pipeline.Workspace)/d'
+    allowPartiallySucceededBuilds: true
     ${{ if parameters.aggregatePipeline }}:
       buildType: 'current'
-      itemPattern: '**/${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
-      allowPartiallySucceededBuilds: true
-      targetPath: '$(Pipeline.Workspace)/d'
     ${{ else }}:
       buildType: 'specific'
       project: ROCm-CI
       specificBuildWithTriggering: true
-      allowPartiallySucceededBuilds: true
       definition: ${{ parameters.pipelineId }}
-      itemPattern: '**/*${{ parameters.fileFilter }}*'
-      targetPath: '$(Pipeline.Workspace)/d'
       branchName: refs/heads/${{ parameters.branchName }}
       ${{ if eq(parameters.componentName, 'aomp') }}:
         buildVersionToDownload: latest # aomp trigger lives in ROCm/ROCm, so cannot use ROCm/aomp branch names
@@ -26,7 +26,7 @@ steps:
     includeRootFolder: false
     archiveType: 'tar'
     tarCompression: 'gz'
-    archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz'
+    archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz'
 - task: DeleteFiles@1
   displayName: 'Cleanup Staging Area'
   inputs:
@@ -38,7 +38,7 @@ steps:
   inputs:
     workingDirectory: $(Pipeline.Workspace)
     targetType: inline
-    script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz" >> pipelineArtifacts.txt
+    script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz" >> pipelineArtifacts.txt
 # then publish it
 - ${{ if parameters.publish }}:
   - task: PublishPipelineArtifact@1
@@ -46,4 +46,5 @@ steps:
     displayName: '${{ parameters.artifactName }} Publish'
     retryCountOnTaskFailure: 3
     inputs:
+      artifactName: ${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt)
      targetPath: '$(Build.ArtifactStagingDirectory)'
@@ -109,7 +109,7 @@ parameters:
     hasGpuTarget: false
   hipCUB:
     pipelineId: $(HIPCUB_PIPELINE_ID)
-    stagingBranch: release-staging/rocm-rel-7.0
+    stagingBranch: develop
     mainlineBranch: develop
     hasGpuTarget: true
   hipFFT:
@@ -129,7 +129,7 @@ parameters:
     hasGpuTarget: false
   hipRAND:
     pipelineId: $(HIPRAND_PIPELINE_ID)
-    stagingBranch: release-staging/rocm-rel-7.0
+    stagingBranch: develop
     mainlineBranch: develop
     hasGpuTarget: true
   hipSOLVER:
@@ -264,7 +264,7 @@ parameters:
     hasGpuTarget: false
   rocPRIM:
     pipelineId: $(ROCPRIM_PIPELINE_ID)
-    stagingBranch: release-staging/rocm-rel-7.0
+    stagingBranch: develop
     mainlineBranch: develop
     hasGpuTarget: true
   rocprofiler:
@@ -304,7 +304,7 @@ parameters:
     hasGpuTarget: false
   rocRAND:
     pipelineId: $(ROCRAND_PIPELINE_ID)
-    stagingBranch: release-staging/rocm-rel-7.0
+    stagingBranch: develop
     mainlineBranch: develop
     hasGpuTarget: true
   rocr_debug_agent:
@@ -329,7 +329,7 @@ parameters:
     hasGpuTarget: false
   rocThrust:
     pipelineId: $(ROCTHRUST_PIPELINE_ID)
-    stagingBranch: release-staging/rocm-rel-7.0
+    stagingBranch: develop
     mainlineBranch: develop
     hasGpuTarget: true
   roctracer:
@@ -438,14 +438,14 @@ steps:
     targetType: inline
     script: |
       sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
-      sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
+      sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
 - task: Bash@3
   displayName: Symlink executables from rocm/llvm/bin to rocm/bin
   inputs:
     targetType: inline
     script: |
       for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
-        sudo ln -s $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
+        sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
       done
 # dlopen calls within a ctest or pytest sequence runs into issues when shared library symlink convention is not followed
 # the convention is as follows:
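The switch from `ln -s` to `ln -sr` stores the link target as a relative path, so the symlink keeps working when the `rocm` tree is archived and re-extracted at a different absolute location. A Python sketch of the same idea, with illustrative paths rather than the pipeline's `$(Agent.BuildDirectory)`:

    import os

    def relative_symlink(target: str, link: str) -> None:
        """Create a symlink whose stored path is relative to the link's
        directory, mimicking `ln -sr target link`."""
        rel = os.path.relpath(target, start=os.path.dirname(link))
        os.symlink(rel, link)

    # Illustrative layout: rocm/lib/llvm -> ../llvm
    os.makedirs("rocm/llvm/bin", exist_ok=True)
    os.makedirs("rocm/lib", exist_ok=True)
    relative_symlink("rocm/llvm", "rocm/lib/llvm")
    print(os.readlink("rocm/lib/llvm"))  # ../llvm, still valid if the tree moves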
@@ -1,3 +1,18 @@
+Datacenter
+GST
+IET
+LTO
+MX
+Microscaling
+NANOO
+ROCprof
+affinitization
+amdclang
+benefitting
+demangled
+inlined
+microscaling
+roofline
 AAC
 ABI
 ACE
docs/conf.py (125 changed lines)
@@ -34,69 +34,86 @@ project = "ROCm Documentation"
|
||||
project_path = os.path.abspath(".").replace("\\", "/")
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "6.4.1"
|
||||
release = "6.4.1"
|
||||
version = "7.0 Alpha"
|
||||
release = "7.0 Alpha"
|
||||
setting_all_article_info = True
|
||||
all_article_info_os = ["linux", "windows"]
|
||||
all_article_info_author = ""
|
||||
|
||||
# pages with specific settings
|
||||
article_pages = [
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-05-07"},
|
||||
{"file": "release/changelog", "os": ["linux"],},
|
||||
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/tensorflow-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/jax-compatibility", "os": ["linux"]},
|
||||
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
|
||||
{"file": "preview/index", "os": ["linux"],},
|
||||
{"file": "preview/release", "os": ["linux"],},
|
||||
{"file": "preview/install/index", "os": ["linux"],},
|
||||
{"file": "preview/install/instinct-driver", "os": ["linux"],},
|
||||
{"file": "preview/install/rocm", "os": ["linux"],},
|
||||
{"file": "preview/benchmark-docker/index", "os": ["linux"],},
|
||||
{"file": "preview/benchmark-docker/training", "os": ["linux"],},
|
||||
{"file": "preview/benchmark-docker/pre-training-megatron-lm-llama-3-8b", "os": ["linux"],},
|
||||
{"file": "preview/benchmark-docker/pre-training-torchtitan-llama-3-70b", "os": ["linux"],},
|
||||
{"file": "preview/benchmark-docker/fine-tuning-lora-llama-2-70b", "os": ["linux"],},
|
||||
{"file": "preview/benchmark-docker/inference", "os": ["linux"],},
|
||||
{"file": "preview/benchmark-docker/inference-vllm-llama-3.1-405b-fp4", "os": ["linux"],},
|
||||
{"file": "preview/benchmark-docker/inference-sglang-deepseek-r1-fp4", "os": ["linux"],},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/profiling-and-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/workload", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/system-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
|
||||
{"file": "how-to/system-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
|
||||
# {"file": "about/release-notes", "os": ["linux"], "date": "2025-06-26"},
|
||||
# {"file": "release/changelog", "os": ["linux"],},
|
||||
# {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
|
||||
# {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
|
||||
# {"file": "compatibility/ml-compatibility/tensorflow-compatibility", "os": ["linux"]},
|
||||
# {"file": "compatibility/ml-compatibility/jax-compatibility", "os": ["linux"]},
|
||||
# {"file": "how-to/deep-learning-rocm", "os": ["linux"]},
|
||||
#
|
||||
# {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
|
||||
#
|
||||
# {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
|
||||
#
|
||||
# {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
#
|
||||
# {"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
#
|
||||
# {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference-optimization/profiling-and-debugging", "os": ["linux"]},
|
||||
# {"file": "how-to/rocm-for-ai/inference-optimization/workload", "os": ["linux"]},
|
||||
#
|
||||
# {"file": "how-to/system-optimization/index", "os": ["linux"]},
|
||||
# {"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
|
||||
# {"file": "how-to/system-optimization/mi200", "os": ["linux"]},
|
||||
# {"file": "how-to/system-optimization/mi100", "os": ["linux"]},
|
||||
# {"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
|
||||
# {"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
|
||||
# {"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
|
||||
# {"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
|
||||
# {"file": "how-to/system-debugging", "os": ["linux"]},
|
||||
# {"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
|
||||
]
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
# Options to improve documentation build time for preview release documentation
|
||||
external_toc_exclude_missing = True # don't build files that aren't in the TOC
|
||||
external_projects_remote_repository = "" # don't fetch data to resolve intersphinx xrefs
|
||||
|
||||
# Add the _extensions directory to Python's search path
|
||||
sys.path.append(str(Path(__file__).parent / 'extension'))
|
||||
@@ -122,7 +139,7 @@ html_static_path = ["sphinx/static/css", "extension/how-to/rocm-for-ai/inference
|
||||
html_css_files = ["rocm_custom.css", "rocm_rn.css", "vllm-benchmark.css"]
|
||||
html_js_files = ["vllm-benchmark.js"]
|
||||
|
||||
html_title = "ROCm Documentation"
|
||||
html_title = "ROCm 7.0 Alpha documentation"
|
||||
|
||||
html_theme_options = {"link_main_doc": False}
|
||||
|
||||
|
||||
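Because most of the previous `article_pages` entries are now commented out rather than deleted, stale paths can linger unnoticed. A hypothetical maintenance sketch (run from the `docs/` directory, assuming each page exists as an `.rst` or `.md` file) that flags entries whose source file is missing:

    import ast
    from pathlib import Path

    # Parse article_pages out of conf.py without importing it.
    tree = ast.parse(Path("conf.py").read_text())
    pages = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign) and any(
            isinstance(t, ast.Name) and t.id == "article_pages" for t in node.targets
        ):
            pages = ast.literal_eval(node.value)

    for entry in pages:
        if not any(Path(entry["file"] + ext).exists() for ext in (".rst", ".md")):
            print("missing page:", entry["file"])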
@@ -0,0 +1,120 @@
unified_docker:
  latest:
    pull_tag: rocm/pytorch-training:v25.6
    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
    rocm_version: 6.4.1
    pytorch_version: 2.8.0a0+git7d205b2
    python_version: 3.10.17
    transformer_engine_version: 1.14.0+2f85f5f2
    flash_attention_version: 3.0.0.post1
    hipblaslt_version: 0.15.0-8c6919d
    triton_version: 3.3.0
model_groups:
  - group: Pre-training
    tag: pre-training
    models:
      - model: Llama 3.1 8B
        mad_tag: pyt_train_llama-3.1-8b
        model_repo: Llama-3.1-8B
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: BF16
        training_modes: [pretrain]
      - model: Llama 3.1 70B
        mad_tag: pyt_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
        precision: BF16
        training_modes: [pretrain]
      - model: FLUX.1-dev
        mad_tag: pyt_train_flux
        model_repo: Flux
        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
        precision: BF16
        training_modes: [pretrain]
  - group: Fine-tuning
    tag: fine-tuning
    models:
      - model: Llama 4 Scout 17B-16E
        mad_tag: pyt_train_llama-4-scout-17b-16e
        model_repo: Llama-4-17B_16E
        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.3 70B
        mad_tag: pyt_train_llama-3.3-70b
        model_repo: Llama-3.3-70B
        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 3.2 1B
        mad_tag: pyt_train_llama-3.2-1b
        model_repo: Llama-3.2-1B
        url: https://huggingface.co/meta-llama/Llama-3.2-1B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.2 3B
        mad_tag: pyt_train_llama-3.2-3b
        model_repo: Llama-3.2-3B
        url: https://huggingface.co/meta-llama/Llama-3.2-3B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.2 Vision 11B
        mad_tag: pyt_train_llama-3.2-vision-11b
        model_repo: Llama-3.2-Vision-11B
        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
        precision: BF16
        training_modes: [finetune_fw]
      - model: Llama 3.2 Vision 90B
        mad_tag: pyt_train_llama-3.2-vision-90b
        model_repo: Llama-3.2-Vision-90B
        url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
        precision: BF16
        training_modes: [finetune_fw]
      - model: Llama 3.1 8B
        mad_tag: pyt_train_llama-3.1-8b
        model_repo: Llama-3.1-8B
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.1 70B
        mad_tag: pyt_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 3.1 405B
        mad_tag: pyt_train_llama-3.1-405b
        model_repo: Llama-3.1-405B
        url: https://huggingface.co/meta-llama/Llama-3.1-405B
        precision: BF16
        training_modes: [finetune_qlora, HF_finetune_lora]
      - model: Llama 3 8B
        mad_tag: pyt_train_llama-3-8b
        model_repo: Llama-3-8B
        url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3 70B
        mad_tag: pyt_train_llama-3-70b
        model_repo: Llama-3-70B
        url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 2 7B
        mad_tag: pyt_train_llama-2-7b
        model_repo: Llama-2-7B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 2 13B
        mad_tag: pyt_train_llama-2-13b
        model_repo: Llama-2-13B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 2 70B
        mad_tag: pyt_train_llama-2-70b
        model_repo: Llama-2-70B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
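The `datatemplate` directives added later in this changeset consume this file. A quick, hedged way to sanity-check it is to load it with PyYAML and invert the model/mode mapping (the path below assumes the file sits under `docs/data/` as the templates reference it):

    import collections
    import yaml  # PyYAML

    with open("docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml") as f:
        data = yaml.safe_load(f)

    print("Docker image:", data["unified_docker"]["latest"]["pull_tag"])

    # Invert the mapping: training mode -> models that support it.
    by_mode = collections.defaultdict(list)
    for group in data["model_groups"]:
        for model in group["models"]:
            for mode in model["training_modes"]:
                by_mode[mode].append(model["model"])

    for mode, models in sorted(by_mode.items()):
        print(f"{mode}: {', '.join(models)}")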
@@ -18,11 +18,18 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
   - PyTorch version
   - Resources

+  * - 6.4.0
+    - 0.9.0.1
+    - 2.7.0
+    -
+      * :doc:`Documentation <../vllm>`
+      * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
+
   * - 6.3.1
     - 0.8.5 (0.8.6.dev)
     - 2.7.0
     -
-      * :doc:`Documentation <../vllm>`
+      * :doc:`Documentation <vllm-0.8.5-20250521>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_

   * - 6.3.1
@@ -57,7 +64,7 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
     - 0.6.4
     - 2.5.0
     -
-      * :doc:`Documentation <vllm-0.4.3>`
+      * :doc:`Documentation <vllm-0.6.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_

   * - 6.2.0
@@ -1,5 +1,5 @@
 .. meta::
-  :description: How to install ROCm and popular machine learning frameworks.
+  :description: How to install the ROCm 7.0 preview
   :keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial

 .. _rocm-for-ai-install:
@@ -18,11 +18,18 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
   - PyTorch version
   - Resources

-  * - v25.5
+  * - v25.6
     - 6.3.4
     - 0.4.35
     - 2.8.0a0+git7d205b2
     -
       * :doc:`Documentation <../pytorch-training>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_

+  * - v25.5
+    - 6.3.4
+    - 2.7.0a0+git637433
+    -
+      * :doc:`Documentation <pytorch-training-v25.5>`
+      * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
+
   * - v25.4
@@ -0,0 +1,437 @@
.. meta::
   :description: How to train a model using PyTorch for ROCm.
   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

**************************************
Training a model with PyTorch for ROCm
**************************************

.. caution::

   This documentation does not reflect the latest version of the ROCm PyTorch
   training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.

PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.

The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
(``rocm/pytorch-training:v25.5``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
| Python                   | 3.10                           |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.12.0.dev0+25a33da            |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
| hipBLASLt                | git53b53bf                     |
+--------------------------+--------------------------------+
| Triton                   | 3.2.0                          |
+--------------------------+--------------------------------+

.. _amd-pytorch-training-model-support:

Supported models
================

The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.

* Llama 3.3 70B

* Llama 3.1 8B

* Llama 3.1 70B

* Llama 2 70B

* FLUX.1-dev

.. note::

   Only these models are supported in the following steps.

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

.. _amd-pytorch-training-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

Benchmarking
============

Once the setup is complete, choose between two options to start benchmarking:

.. tab-set::

   .. tab-item:: MAD-integrated benchmarking

      Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
      directory and install the required packages on the host machine.

      .. code-block:: shell

         git clone https://github.com/ROCm/MAD
         cd MAD
         pip install -r requirements.txt

      For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
      using one GPU with the float16 data type on the host machine.

      .. code-block:: shell

         export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
         python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800

      The available models for MAD-integrated benchmarking are:

      * ``pyt_train_llama-3.3-70b``

      * ``pyt_train_llama-3.1-8b``

      * ``pyt_train_llama-3.1-70b``

      * ``pyt_train_flux``

      MAD launches a Docker container with the name
      ``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
      model are collected in the following path: ``~/MAD/perf.csv``.
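      The exact column layout of ``perf.csv`` is not documented here, so a small
      standard-library sketch that simply dumps whatever the report contains can
      be handy (a hypothetical inspection helper, not part of MAD):

      .. code-block:: python

         import csv
         from pathlib import Path

         # Hedged sketch: the perf.csv schema is whatever MAD wrote out.
         path = Path.home() / "MAD" / "perf.csv"
         with path.open(newline="") as f:
             rows = list(csv.DictReader(f))

         if rows:
             print("columns:", ", ".join(rows[0].keys()))
             for row in rows:
                 print(row)
         else:
             print("no benchmark rows found in", path)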
   .. tab-item:: Standalone benchmarking

      .. rubric:: Download the Docker image and required packages

      Use the following command to pull the Docker image from Docker Hub.

      .. code-block:: shell

         docker pull rocm/pytorch-training:v25.5

      Run the Docker container.

      .. code-block:: shell

         docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5

      Use these commands if you exit the ``training_env`` container and need to return to it.

      .. code-block:: shell

         docker start training_env
         docker exec -it training_env bash

      In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
      repository and navigate to the benchmark scripts directory
      ``/workspace/MAD/scripts/pytorch_train``.

      .. code-block:: shell

         git clone https://github.com/ROCm/MAD
         cd MAD/scripts/pytorch_train

      .. rubric:: Prepare training datasets and dependencies

      The following benchmarking examples require downloading models and datasets
      from Hugging Face. To ensure successful access to gated repos, set your
      ``HF_TOKEN``.

      .. code-block:: shell

         export HF_TOKEN=$your_personal_hugging_face_access_token

      Run the setup script to install libraries and datasets needed for benchmarking.

      .. code-block:: shell

         ./pytorch_benchmark_setup.sh

      ``pytorch_benchmark_setup.sh`` installs the following libraries:

      .. list-table::
         :header-rows: 1

         * - Library
           - Benchmark model
           - Reference

         * - ``accelerate``
           - Llama 3.1 8B, FLUX
           - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

         * - ``datasets``
           - Llama 3.1 8B, 70B, FLUX
           - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

         * - ``torchdata``
           - Llama 3.1 70B
           - `TorchData <https://pytorch.org/data/beta/index.html>`_

         * - ``tomli``
           - Llama 3.1 70B
           - `Tomli <https://pypi.org/project/tomli/>`_

         * - ``tiktoken``
           - Llama 3.1 70B
           - `tiktoken <https://github.com/openai/tiktoken>`_

         * - ``blobfile``
           - Llama 3.1 70B
           - `blobfile <https://pypi.org/project/blobfile/>`_

         * - ``tabulate``
           - Llama 3.1 70B
           - `tabulate <https://pypi.org/project/tabulate/>`_

         * - ``wandb``
           - Llama 3.1 70B
           - `Weights & Biases <https://github.com/wandb/wandb>`_

         * - ``sentencepiece``
           - Llama 3.1 70B, FLUX
           - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

         * - ``tensorboard``
           - Llama 3.1 70B, FLUX
           - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

         * - ``csvkit``
           - FLUX
           - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

         * - ``deepspeed``
           - FLUX
           - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

         * - ``diffusers``
           - FLUX
           - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

         * - ``GitPython``
           - FLUX
           - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

         * - ``opencv-python-headless``
           - FLUX
           - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

         * - ``peft``
           - FLUX
           - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

         * - ``protobuf``
           - FLUX
           - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

         * - ``pytest``
           - FLUX
           - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

         * - ``python-dotenv``
           - FLUX
           - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

         * - ``seaborn``
           - FLUX
           - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

         * - ``transformers``
           - FLUX
           - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

      ``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:

      * `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

      * `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

      Along with the following datasets:

      * `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_

      * `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_

      * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

      .. rubric:: Pretraining

      To start the pretraining benchmark, use the following command with the
      appropriate options. See the following list of options and their descriptions.

      .. code-block:: shell

         ./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length

      .. list-table::
         :header-rows: 1

         * - Name
           - Options
           - Description

         * - ``$training_mode``
           - ``pretrain``
           - Benchmark pretraining

         * -
           - ``finetune_fw``
           - Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)

         * -
           - ``finetune_lora``
           - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)

         * -
           - ``HF_finetune_lora``
           - Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)

         * - ``$datatype``
           - ``FP8`` or ``BF16``
           - Only Llama 3.1 8B supports FP8 precision.

         * - ``$model_repo``
           - ``Llama-3.3-70B``
           - `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_

         * -
           - ``Llama-3.1-8B``
           - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_

         * -
           - ``Llama-3.1-70B``
           - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

         * -
           - ``Llama-2-70B``
           - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_

         * -
           - ``Flux``
           - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

         * - ``$sequence_length``
           - Between 2048 and 8192. 8192 by default.
           - Sequence length for the language model.

      .. note::

         Occasionally, downloading the Flux dataset might fail. In the event of this
         error, manually download it from Hugging Face at
         `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
         and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
         the required dataset.
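      If the automatic download does fail, the manual step can also be scripted;
      a sketch assuming the ``huggingface_hub`` Python package is available
      inside the container and ``HF_TOKEN`` is set:

      .. code-block:: python

         from huggingface_hub import snapshot_download

         # Fetch the gated FLUX.1-dev repository into the directory the
         # benchmark script expects (requires a token with access).
         snapshot_download(
             repo_id="black-forest-labs/FLUX.1-dev",
             local_dir="/workspace/FluxBenchmark",
         )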
      .. rubric:: Fine-tuning

      To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
      with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.

      .. code-block:: shell

         ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B

      Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
      `Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.

      .. code-block:: shell

         ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

      .. rubric:: Benchmarking examples

      Here are some example commands to get started pretraining and fine-tuning with various model configurations.

      * Example 1: Llama 3.1 70B with BF16 precision using `torchtitan <https://github.com/ROCm/torchtitan>`_.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192

      * Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192

      * Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux

      * Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B

      * Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B

      * Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B

      * Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B

      * Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B

      * Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
@@ -9,28 +9,27 @@ Training a model with PyTorch for ROCm
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
|
||||
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
|
||||
(``rocm/pytorch-training:v25.5``) image
|
||||
provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
|
||||
software components to accelerate training workloads:
|
||||
The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
|
||||
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.4 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.7.0a0+git637433 |
|
||||
| PyTorch | 2.8.0a0+git7d205b2 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.10 |
|
||||
| Python | 3.10.17 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.12.0.dev0+25a33da |
|
||||
| Transformer Engine | 1.14.0+2f85f5f2 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
| Flash Attention | 3.0.0.post1 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | git53b53bf |
|
||||
| hipBLASLt | 0.15.0-8c6919d |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.2.0 |
|
||||
| Triton | 3.3.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
.. _amd-pytorch-training-model-support:
|
||||
@@ -40,395 +39,393 @@ Supported models
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
|
||||
|
||||
* Llama 3.3 70B
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
* Llama 3.1 8B
|
||||
{% set unified_docker = data.unified_docker.latest %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
* Llama 3.1 70B
|
||||
.. raw:: html
|
||||
|
||||
* Llama 2 70B
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Workload</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
* FLUX.1-dev
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
.. note::
|
||||
|
||||
Only these models are supported in the following steps.
|
||||
Some models require an external license agreement through a third party (for example, Meta).
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
.. _amd-pytorch-training-performance-measurements:
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements:
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. note::
|
||||
|
||||
The performance data presented in
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X accelerators or ROCm software.
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
System validation
|
||||
=================
|
||||
.. note::
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X accelerators or ROCm software.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
System validation
|
||||
=================
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking:
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
.. tab-set::
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
Once the setup is complete, choose between two options to start benchmarking:
|
||||
|
||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
.. tab-set::
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
|
||||
using one GPU with the float16 data type on the host machine.
|
||||
.. code-block:: shell
|
||||
|
||||
.. code-block:: shell
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
The available models for MAD-integrated benchmarking are:
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
* ``pyt_train_llama-3.3-70b``
|
||||
For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one GPU with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
* ``pyt_train_llama-3.1-8b``
|
||||
.. code-block:: shell
|
||||
|
||||
* ``pyt_train_llama-3.1-70b``
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
python3 tools/run_models.py --tags {{ model.mad_tag }} --keep-model-dir --live-output --timeout 28800
|
||||
|
||||
* ``pyt_train_flux``
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/perf.csv``.
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/perf.csv``.
|
||||
{% endfor %}
|
||||
{% endfor %}

   .. tab-item:: Standalone benchmarking

      .. rubric:: Download the Docker image and required packages

      Use the following command to pull the Docker image from Docker Hub.

      .. code-block:: shell

         docker pull {{ unified_docker.pull_tag }}

      Run the Docker container.

      .. code-block:: shell

         docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}

      Use these commands if you exit the ``training_env`` container and need to return to it.

      .. code-block:: shell

         docker start training_env
         docker exec -it training_env bash

      In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
      repository and navigate to the benchmark scripts directory
      ``/workspace/MAD/scripts/pytorch_train``.

      .. code-block:: shell

         git clone https://github.com/ROCm/MAD
         cd MAD/scripts/pytorch_train

      .. rubric:: Prepare training datasets and dependencies

      The following benchmarking examples require downloading models and datasets
      from Hugging Face. To ensure successful access to gated repos, set your
      ``HF_TOKEN``.

      .. code-block:: shell

         export HF_TOKEN=$your_personal_hugging_face_access_token

      Run the setup script to install libraries and datasets needed for benchmarking.

      .. code-block:: shell

         ./pytorch_benchmark_setup.sh

      .. container:: model-doc pyt_train_llama-3.1-8b

         ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

         .. list-table::
            :header-rows: 1

            * - Library
              - Reference

            * - ``accelerate``
              - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

            * - ``datasets``
              - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

      .. container:: model-doc pyt_train_llama-3.1-70b

         ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

         .. list-table::
            :header-rows: 1

            * - Library
              - Reference

            * - ``datasets``
              - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

            * - ``torchdata``
              - `TorchData <https://pytorch.org/data/beta/index.html>`_

            * - ``tomli``
              - `Tomli <https://pypi.org/project/tomli/>`_

            * - ``tiktoken``
              - `tiktoken <https://github.com/openai/tiktoken>`_

            * - ``blobfile``
              - `blobfile <https://pypi.org/project/blobfile/>`_

            * - ``tabulate``
              - `tabulate <https://pypi.org/project/tabulate/>`_

            * - ``wandb``
              - `Weights & Biases <https://github.com/wandb/wandb>`_

            * - ``sentencepiece``
              - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

            * - ``tensorboard``
              - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

      .. container:: model-doc pyt_train_flux

         ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

         .. list-table::
            :header-rows: 1

            * - Library
              - Reference

            * - ``accelerate``
              - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

            * - ``datasets``
              - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

            * - ``sentencepiece``
              - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

            * - ``tensorboard``
              - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

            * - ``csvkit``
              - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

            * - ``deepspeed``
              - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

            * - ``diffusers``
              - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

            * - ``GitPython``
              - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

            * - ``opencv-python-headless``
              - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

            * - ``peft``
              - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

            * - ``protobuf``
              - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

            * - ``pytest``
              - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

            * - ``python-dotenv``
              - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

            * - ``seaborn``
              - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

            * - ``transformers``
              - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

      {% for model_group in model_groups %}
      {% for model in model_group.models %}
      {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}

      .. container:: model-doc {{ model.mad_tag }}

         .. rubric:: Pretraining

         To start the pre-training benchmark, use the following command with the
         appropriate options. See the following list of options and their descriptions.
         A concrete invocation follows the table.

         .. code-block:: shell

            ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length

         .. list-table::
            :header-rows: 1

            * - Name
              - Options
              - Description

            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
            * - ``$datatype``
              - ``BF16`` or ``FP8``
              - Only Llama 3.1 8B supports FP8 precision.
            {% else %}
            * - ``$datatype``
              - ``BF16``
              - Only Llama 3.1 8B supports FP8 precision.
            {% endif %}

            * - ``$sequence_length``
              - Between 2048 and 8192. 8192 by default.
              - Sequence length for the language model.
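
         For example, one possible invocation for Llama 3.1 8B, using values from the
         table above (the model name is a sketch of what the rendered template produces):

         .. code-block:: shell

            ./pytorch_benchmark_report.sh -t pretrain -m Llama-3.1-8B -p FP8 -s 8192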

         {% if model.mad_tag == "pyt_train_flux" %}

         .. note::

            Occasionally, downloading the Flux dataset might fail. In the event of this
            error, manually download it from Hugging Face at
            `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
            and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
            the required dataset.

         {% endif %}

      {% endif %}

      {% if model_group.tag == "fine-tuning" %}

      .. container:: model-doc {{ model.mad_tag }}

         .. rubric:: Fine-tuning

         To start the fine-tuning benchmark, use the following command with the
         appropriate options. See the following list of options and their descriptions.
         A worked example follows the table.

         .. code-block:: shell

            ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length

         .. list-table::
            :header-rows: 1

            * - Name
              - Options
              - Description

            * - ``$training_mode``
              - ``finetune_fw``
              - Full weight fine-tuning (BF16 supported)

            * -
              - ``finetune_lora``
              - LoRA fine-tuning (BF16 supported)

            * -
              - ``finetune_qlora``
              - QLoRA fine-tuning (BF16 supported)

            * -
              - ``HF_finetune_lora``
              - LoRA fine-tuning with Hugging Face PEFT

            * - ``$datatype``
              - ``BF16``
              - All models support BF16.

            * - ``$sequence_length``
              - Between 2048 and 16384.
              - Sequence length for the language model.
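
         For example, a LoRA fine-tuning run for a 70B-class model might look like the
         following (the model name is a sketch of what the rendered template produces):

         .. code-block:: shell

            ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B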

         .. note::

            {{ model.model }} currently supports the following fine-tuning methods:

            {% for method in model.training_modes %}
            * ``{{ method }}``
            {% endfor %}

            {% if model.training_modes|length < 4 %}
            The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
            does not currently provide YAML configuration files for other combinations of
            model and fine-tuning method.
            However, you can still configure your own YAML files to enable support for
            fine-tuning methods not listed here by following existing patterns in the
            ``/workspace/torchtune/recipes/configs`` directory.
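
            For instance, a custom recipe could start from one of the shipped configs; a
            minimal sketch, assuming the ``tune`` CLI is available in the container and
            using a hypothetical config file name:

            .. code-block:: shell

               # Copy an existing recipe config as a starting point (hypothetical path).
               cp /workspace/torchtune/recipes/configs/llama3_1/70B_lora.yaml my_recipe.yaml
               # After editing my_recipe.yaml, launch it with the torchtune CLI.
               tune run lora_finetune_distributed --config my_recipe.yaml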
            {% endif %}

      {% endif %}
      {% endfor %}
      {% endfor %}

      .. rubric:: Benchmarking examples

      For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

Previous versions
=================

26
docs/preview/index.md
Normal file
@@ -0,0 +1,26 @@
---
myst:
  html_meta:
    "description": "AMD ROCm 7.0 Alpha documentation"
    "keywords": "Radeon, open, compute, platform, install, how, conceptual, reference, home, docs"
---

# AMD ROCm 7.0 Alpha documentation

AMD ROCm is an open-source software platform optimized to extract HPC and AI
workload performance from AMD Instinct™ accelerators while maintaining
compatibility with industry software frameworks.

This documentation provides early access information about the ROCm software
Alpha release. The preview release gives users early access to new features
under development so they can test them and provide feedback.
It is not recommended for production use.

```{note}
See [ROCm documentation](https://rocm.docs.amd.com/en/latest/) for the latest stable release for use in production.
```

The documentation includes:

- [ROCm 7.0 Alpha release notes](release.rst) with feature details and support matrix
- [Installation instructions](install/index.rst) for the ROCm 7.0 Alpha and the Instinct Driver
28
docs/preview/install/index.rst
Normal file
@@ -0,0 +1,28 @@
.. meta::
   :description: Installation via native package manager
   :keywords: ROCm install, installation instructions, package manager, native package manager, AMD,
              ROCm

****************************************
ROCm 7.0 Alpha installation instructions
****************************************

The ROCm 7.0 Alpha must be installed using your Linux distribution's native
package manager. This release supports specific hardware and software
configurations -- before installing, see the :ref:`supported OSes and hardware
<alpha-system-requirements>` outlined in the Alpha release notes.

.. important::

   Upgrades and downgrades are not supported. You must uninstall any existing
   ROCm installation before installing the Alpha build.

.. grid:: 2

   .. grid-item-card:: Install ROCm

      See :doc:`Install the ROCm 7.0 Alpha via package manager <rocm>`.

   .. grid-item-card:: Install Instinct Driver

      See :doc:`Install the Instinct Driver via package manager <instinct-driver>`.
212
docs/preview/install/instinct-driver.rst
Normal file
@@ -0,0 +1,212 @@
***********************************************
Install the Instinct Driver via package manager
***********************************************

This section describes how to install the Instinct Driver using ``apt`` on
Ubuntu 22.04 or 24.04, or ``dnf`` on Red Hat Enterprise Linux 9.6.

.. important::

   Upgrades and downgrades are not supported. You must uninstall any existing
   ROCm installation before installing the preview build.

Prerequisites
=============

Before installing, complete the following prerequisites.

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      Install kernel headers.

      .. code-block:: shell

         sudo apt install "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)"

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      Install kernel headers.

      .. code-block:: shell

         sudo apt install "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)"

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      1. Register your Enterprise Linux.

         .. code-block:: shell

            subscription-manager register --username <username> --password <password>
            subscription-manager attach --auto

      2. Update your Enterprise Linux.

         .. code-block:: shell

            sudo dnf update --releasever=9.6 --exclude=\*release\*

      3. Install kernel headers.

         .. code-block:: shell

            sudo dnf install "kernel-headers-$(uname -r)" "kernel-devel-$(uname -r)" "kernel-devel-matched-$(uname -r)"

Register ROCm repositories
==========================

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      1. Add the package signing key.

         .. code-block:: shell

            # Make the directory if it doesn't exist yet.
            # This location is recommended by the distribution maintainers.
            sudo mkdir --parents --mode=0755 /etc/apt/keyrings
            # Download the key, convert the signing-key to a full
            # keyring required by apt and store in the keyring directory.
            wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
                gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null

      2. Register the kernel mode driver.

         .. code-block:: shell

            echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/30.10_alpha/ubuntu jammy main" \
                | sudo tee /etc/apt/sources.list.d/amdgpu.list
            sudo apt update

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      1. Add the package signing key.

         .. code-block:: shell

            # Make the directory if it doesn't exist yet.
            # This location is recommended by the distribution maintainers.
            sudo mkdir --parents --mode=0755 /etc/apt/keyrings
            # Download the key, convert the signing-key to a full
            # keyring required by apt and store in the keyring directory.
            wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
                gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null

      2. Register the kernel mode driver.

         .. code-block:: shell

            echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/30.10_alpha/ubuntu noble main" \
                | sudo tee /etc/apt/sources.list.d/amdgpu.list
            sudo apt update

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      .. code-block:: shell

         sudo tee /etc/yum.repos.d/amdgpu.repo <<EOF
         [amdgpu]
         name=amdgpu
         baseurl=https://repo.radeon.com/amdgpu/30.10_alpha/rhel/9.6/main/x86_64/
         enabled=1
         priority=50
         gpgcheck=1
         gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
         EOF
         sudo dnf clean all

Install the kernel driver
=========================

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      .. code-block:: shell

         sudo apt install amdgpu-dkms

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      .. code-block:: shell

         sudo apt install amdgpu-dkms

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      .. code-block:: shell

         sudo dnf install amdgpu-dkms
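
After installing the driver package and rebooting, you can confirm that the DKMS
module built and loaded. A minimal check, assuming the standard ``dkms`` and
``lsmod`` tools are available on your system:

.. code-block:: shell

   # Show the build/install state of the amdgpu DKMS module.
   dkms status amdgpu
   # Confirm the kernel module is loaded.
   lsmod | grep amdgpu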

Uninstalling
============

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      1. Uninstall the kernel mode driver.

         .. code-block:: shell

            sudo apt autoremove amdgpu-dkms

      2. Remove AMDGPU repositories.

         .. code-block:: shell

            sudo rm /etc/apt/sources.list.d/amdgpu.list
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/apt/*
            sudo apt clean all
            sudo apt update

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      1. Uninstall the kernel mode driver.

         .. code-block:: shell

            sudo apt autoremove amdgpu-dkms

      2. Remove AMDGPU repositories.

         .. code-block:: shell

            sudo rm /etc/apt/sources.list.d/amdgpu.list
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/apt/*
            sudo apt clean all
            sudo apt update

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      1. Uninstall the kernel mode driver.

         .. code-block:: shell

            sudo dnf remove amdgpu-dkms

      2. Remove AMDGPU repositories.

         .. code-block:: shell

            sudo rm /etc/yum.repos.d/amdgpu.repo
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/dnf
            sudo dnf clean all
288
docs/preview/install/rocm.rst
Normal file
@@ -0,0 +1,288 @@
**********************************************
Install the ROCm 7.0 Alpha via package manager
**********************************************

This page describes how to install the ROCm 7.0 Alpha build using ``apt`` on
Ubuntu 22.04 or 24.04, or ``dnf`` on Red Hat Enterprise Linux 9.6.

.. important::

   Upgrades and downgrades are not supported. You must uninstall any existing
   ROCm installation before installing the preview build.

Prerequisites
=============

Before installing, complete the following prerequisites.

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      1. Install development packages.

         .. code-block:: shell

            sudo apt install python3-setuptools python3-wheel

      2. Configure user permissions for GPU access.

         .. code-block:: shell

            sudo usermod -a -G render,video $LOGNAME

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      1. Install development packages.

         .. code-block:: shell

            sudo apt install python3-setuptools python3-wheel

      2. Configure user permissions for GPU access.

         .. code-block:: shell

            sudo usermod -a -G render,video $LOGNAME

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      1. Register your Enterprise Linux.

         .. code-block:: shell

            subscription-manager register --username <username> --password <password>
            subscription-manager attach --auto

      2. Update your Enterprise Linux.

         .. code-block:: shell

            sudo dnf update --releasever=9.6 --exclude=\*release\*

      3. Install additional package repositories.

         Add the EPEL repository:

         .. code-block:: shell

            wget https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
            sudo rpm -ivh epel-release-latest-9.noarch.rpm

         Enable the CodeReady Linux Builder (CRB) repository.

         .. code-block:: shell

            sudo dnf install dnf-plugin-config-manager
            sudo crb enable

      4. Install development packages.

         .. code-block:: shell

            sudo dnf install python3-setuptools python3-wheel

      5. Configure user permissions for GPU access.

         .. code-block:: shell

            sudo usermod -a -G render,video $LOGNAME

Register ROCm repositories
==========================

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      1. Add the package signing key.

         .. code-block:: shell

            # Make the directory if it doesn't exist yet.
            # This location is recommended by the distribution maintainers.
            sudo mkdir --parents --mode=0755 /etc/apt/keyrings
            # Download the key, convert the signing-key to a full
            # keyring required by apt and store in the keyring directory.
            wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
                gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null

      2. Register ROCm packages.

         .. code-block:: shell

            echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0_alpha jammy main" \
                | sudo tee /etc/apt/sources.list.d/rocm.list

            echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.0_alpha/ubuntu jammy main" \
                | sudo tee /etc/apt/sources.list.d/rocm-graphics.list

            echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
                | sudo tee /etc/apt/preferences.d/rocm-pin-600
            sudo apt update

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      1. Add the package signing key.

         .. code-block:: shell

            # Make the directory if it doesn't exist yet.
            # This location is recommended by the distribution maintainers.
            sudo mkdir --parents --mode=0755 /etc/apt/keyrings
            # Download the key, convert the signing-key to a full
            # keyring required by apt and store in the keyring directory.
            wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
                gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null

      2. Register ROCm packages.

         .. code-block:: shell

            echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0_alpha noble main" \
                | sudo tee /etc/apt/sources.list.d/rocm.list

            echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.0_alpha/ubuntu noble main" \
                | sudo tee /etc/apt/sources.list.d/rocm-graphics.list

            echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
                | sudo tee /etc/apt/preferences.d/rocm-pin-600
            sudo apt update

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      .. code-block:: shell

         sudo tee /etc/yum.repos.d/rocm.repo <<EOF
         [ROCm-7.0.0]
         name=ROCm7.0.0
         baseurl=https://repo.radeon.com/rocm/el9/7.0_alpha/main
         enabled=1
         priority=50
         gpgcheck=1
         gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
         EOF

         sudo tee /etc/yum.repos.d/rocm-graphics.repo <<EOF
         [ROCm-7.0.0-Graphics]
         name=ROCm7.0.0-Graphics
         baseurl=https://repo.radeon.com/graphics/7.0_alpha/rhel/9/main/x86_64/
         enabled=1
         priority=50
         gpgcheck=1
         gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
         EOF
         sudo dnf clean all

Install ROCm
============

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      .. code-block:: shell

         sudo apt install rocm

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      .. code-block:: shell

         sudo apt install rocm

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      .. code-block:: shell

         sudo dnf install rocm
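
To confirm the installation succeeded, you can query the runtime for the
detected GPU agents. A minimal check, assuming the default ``/opt/rocm``
install prefix:

.. code-block:: shell

   # List HSA agents; the Instinct accelerator should appear as a GPU agent.
   /opt/rocm/bin/rocminfo
   # Show static GPU information via the AMD SMI tool.
   /opt/rocm/bin/amd-smi static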

.. _uninstall-rocm:

Uninstalling
============

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      1. Uninstall specific meta packages.

         .. code-block:: shell

            sudo apt autoremove rocm

      2. Uninstall ROCm packages.

         .. code-block:: shell

            sudo apt autoremove rocm-core

      3. Remove ROCm repositories.

         .. code-block:: shell

            sudo rm /etc/apt/sources.list.d/rocm*.list
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/apt/*
            sudo apt clean all
            sudo apt update

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      1. Uninstall specific meta packages.

         .. code-block:: shell

            sudo apt autoremove rocm

      2. Uninstall ROCm packages.

         .. code-block:: shell

            sudo apt autoremove rocm-core

      3. Remove ROCm repositories.

         .. code-block:: shell

            sudo rm /etc/apt/sources.list.d/rocm*.list
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/apt/*
            sudo apt clean all
            sudo apt update

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      1. Uninstall specific meta packages.

         .. code-block:: shell

            sudo dnf remove rocm

      2. Uninstall ROCm packages.

         .. code-block:: shell

            sudo dnf remove rocm-core amdgpu-core

      3. Remove ROCm repositories.

         .. code-block:: shell

            sudo rm /etc/yum.repos.d/rocm*.repo*
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/dnf
            sudo dnf clean all
270
docs/preview/release.rst
Normal file
@@ -0,0 +1,270 @@
****************************
ROCm 7.0 Alpha release notes
****************************

The ROCm 7.0 Alpha is an early look into the upcoming ROCm 7.0 major release,
which introduces functional support for AMD Instinct™ MI355X and MI350X
on bare metal, single-node systems. It also includes new features for current-generation
MI300X, MI200, and MI100 series accelerators. This is an alpha-quality release;
expect issues and limitations that will be addressed in upcoming previews.

.. important::

   This Alpha release is not intended for performance evaluation.
   For the latest stable release for production-level functionality,
   see `ROCm documentation <https://rocm.docs.amd.com/en/latest/>`_.

This page provides a high-level summary of supported systems, key changes to the ROCm software
stack, developments related to AI frameworks, current known limitations, and installation
information.

.. _alpha-system-requirements:

Operating system and hardware support
=====================================

Only the accelerators and operating systems listed here are supported. Multi-node systems,
virtualized environments, and GPU partitioning are not supported in this Alpha.

* AMD accelerator: Instinct MI355X, MI350X, MI325X [#mi325x]_, MI300X, MI300A, MI250X, MI250, MI210, MI100
* Operating system: Ubuntu 22.04, Ubuntu 24.04, or RHEL 9.6
* System type: Bare metal, single node only
* Partitioning: Not supported

.. [#mi325x] MI325X is only supported with Ubuntu 22.04.

.. _alpha-highlights:

Alpha release highlights
========================

This section highlights key features enabled in the ROCm 7.0 Alpha.

AI frameworks
-------------

PyTorch
~~~~~~~

The ROCm 7.0 Alpha enables the following PyTorch features:

* Support for PyTorch 2.7

* Integrated fused RoPE kernels in APEX

* Compilation of Python C++ extensions using amdclang++

* Support for the channels-last NHWC format for convolutions via MIOpen

TensorFlow
~~~~~~~~~~

This Alpha enables support for TensorFlow 2.19.

vLLM
~~~~

* Support for the Open Compute Project (OCP) ``FP8`` data type

* ``FP4`` precision for Llama 3.1 405B

Libraries
---------

.. _alpha-new-data-type-support:

New data type support
~~~~~~~~~~~~~~~~~~~~~

MX-compliant data types bring microscaling support to ROCm. For more information, see the `OCP
Microscaling (MX) Formats Specification
<https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf>`_. The ROCm
7.0 Alpha enables functional support for MX data types ``FP4``, ``FP6``, and ``FP8`` on MI355X
systems in these ROCm libraries:

* Composable Kernel (``FP4`` and ``FP8`` only)

* hipBLASLt

* MIGraphX (``FP4`` only)

The following libraries are updated to support the Open Compute Project (OCP) floating-point ``FP8``
format on MI355X instead of the NANOO ``FP8`` format:

* Composable Kernel

* hipBLASLt

* hipSPARSELt

* MIGraphX

* rocWMMA

MIGraphX now also supports ``BF16``.

RCCL support
~~~~~~~~~~~~

RCCL is supported for single-node functional usage only. Multi-node communication capabilities will
be supported in future preview releases.

MIGraphX
~~~~~~~~

* Support for OCP ``FP8`` and MX ``FP4`` data types on MI355X

* Support for ``BF16`` on all hardware

* Support for PyTorch 2.7 via Torch-MIGraphX

Tools
-----

AMD SMI
~~~~~~~

* The default output of the ``amd-smi`` CLI now displays a simple table view.

* New APIs: CPU affinity, which shows how GPUs are affinitized to each CPU in a system.

ROCgdb
~~~~~~

* MX data type support: ``FP4``, ``FP6``, and ``FP8``

ROCprof Compute Viewer
~~~~~~~~~~~~~~~~~~~~~~

* Initial release: ``rocprof-compute-viewer`` allows the visualization of ``rocprofv3``'s thread
  trace output

ROCprof Trace Decoder
~~~~~~~~~~~~~~~~~~~~~

* Initial release: ``rocprof-trace-decoder``, a plugin API for decoding thread traces

ROCm Compute Profiler
~~~~~~~~~~~~~~~~~~~~~

* MX data type support: ``FP4``, ``FP6``, and ``FP8``

* MI355X and MI350X performance counters: CPC, SPI, SQ, TA/TD/TCP, and TCC

* Enhanced roofline analysis with support for ``INT8``, ``INT32``, ``FP8``, ``FP16``, and ``BF16``
  data types

* Roofline distinction for ``FP32`` and ``FP64`` data types

* Selective kernel profiling

ROCm Systems Profiler
~~~~~~~~~~~~~~~~~~~~~

* Trace support for computer vision APIs: H264, H265, AV1, VP9, and JPEG

* Trace support for computer vision engine activity

* OpenMP for C++ language and kernel activity support

ROCm Validation Suite
~~~~~~~~~~~~~~~~~~~~~

* MI355X and MI350X accelerator support in the IET (Integrated Execution Test), GST (GPU Stress Test), and Babel (memory bandwidth test) modules.

ROCprofiler-SDK
~~~~~~~~~~~~~~~

* Program counter (PC) sampling (host trap-based)

* API for profiling applications using thread traces (beta)

* Support in the ``rocprofv3`` CLI tool for the thread trace service

HIP
---

The HIP runtime includes support for:

* Open Compute Project (OCP) MX floating-point ``FP4``, ``FP6``, and ``FP8`` data types and APIs

* Improved logging by adding more precise pointer information and launch arguments for better
  tracking and debugging in dispatch methods

In addition, the HIP runtime includes the following functional improvements, which improve runtime
performance and user experience:

* Optimized HIP runtime lock contention in some event and kernel handling APIs. Event processing
  and memory object look-ups now use the shared mutex implementation. Kernel object look-up during
  C++ kernel launch can now avoid a global lock. These changes improve performance in certain
  applications with high usage, particularly for multiple GPUs, multiple threads, and HIP streams
  per GPU.

* Programmatic support for the scratch buffer limit on a GPU device. Developers can now change the
  default allocation size with the expected scratch limit.

* Unified managed buffer and kernel argument buffers so the HIP runtime no longer needs to create
  and load a separate kernel argument buffer.

* Refactored memory validation to create a unique function to validate a variety of memory copy
  operations.

* Shader names are now demangled for more readable kernel logs.

See :ref:`HIP compatibility <hip-known-limitation>`.

Compilers
---------

* The compiler driver now uses parallel code generation by default when compiling using full LTO
  (including when using the ``-fgpu-rdc`` option) for HIP. This divides the optimized LLVM IR module
  into roughly equal partitions before instruction selection and lowering, which can help improve
  build times.

  Each kernel in the linked LTO module may be put in a separate partition, and any non-inlined
  function it depends on may be copied alongside it. Thus, while parallel code generation can
  improve build time, it can duplicate non-inlined, non-kernel functions across multiple partitions,
  potentially increasing the binary size of the final object file.

* Compiler option ``-flto-partitions=<num>``.

  Equivalent to the ``--lto-partitions=<num>`` LLD option. Controls the number of partitions used for
  parallel code generation when using full LTO (including when using ``-fgpu-rdc``). The number of
  partitions must be greater than 0, and a value of 1 disables the feature. The default value is 8.

  Developers are encouraged to experiment with different numbers of partitions using the
  ``-flto-partitions`` Clang command line option. Recommended values are 1 to 16 partitions, with
  especially large projects containing many kernels potentially benefitting from up to 64
  partitions. It is not recommended to use a value greater than the number of threads on the
  machine. Smaller projects, or projects that contain only a few kernels, may not benefit at
  all from partitioning and may even see a slight increase in build time due to the small overhead
  of analyzing and partitioning the modules. An example invocation is shown below.
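
  As an illustration, the option can be passed on an ordinary HIP compile line; a minimal
  sketch, where the source file, output name, and target architecture are placeholders:

  .. code-block:: shell

     # Build with full LTO for device code and split code generation into 16 partitions.
     hipcc -fgpu-rdc -flto-partitions=16 --offload-arch=gfx942 -o app app.hip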

* HIPIFY now supports NVIDIA CUDA 12.8.0 APIs. See
  `<https://github.com/ROCm/HIPIFY/blob/amd-develop/docs/reference/supported_apis.md>`_ for more
  information.

Instinct Driver / ROCm packaging separation
-------------------------------------------

The Instinct Driver is now distributed separately from the ROCm software stack -- it is now stored
in its own location in the package repository at `<https://repo.radeon.com>`_ under ``/amdgpu/``.
The first release is designated as Instinct Driver version 30.10. See `ROCm Gets Modular: Meet the
Instinct Datacenter GPU Driver
<https://rocm.blogs.amd.com/ecosystems-and-partners/instinct-gpu-driver/README.html>`_ for more
information.

Forward and backward compatibility between the Instinct Driver and ROCm is not supported in this
Alpha release. See the :doc:`installation instructions <install/index>`.

Known limitations
=================

.. _hip-known-limitation:

HIP compatibility
-----------------

HIP runtime APIs in the ROCm 7.0 Alpha do not include backward-incompatible changes. See `HIP 7.0 Is
Coming: What You Need to Know to Stay Ahead
<https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0:-guidance-on-upcoming-compatibility-changes/README.html>`_ for more information.
@@ -3,195 +3,206 @@
|
||||
defaults:
|
||||
numbered: False
|
||||
maxdepth: 6
|
||||
root: index
|
||||
root: preview/index
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: what-is-rocm.rst
|
||||
- file: about/release-notes.md
|
||||
title: Release notes
|
||||
- file: compatibility/compatibility-matrix.rst
|
||||
title: Compatibility matrix
|
||||
entries:
|
||||
- url: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html
|
||||
title: Linux system requirements
|
||||
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
|
||||
title: Windows system requirements
|
||||
|
||||
- caption: Install
|
||||
entries:
|
||||
- url: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/
|
||||
title: ROCm on Linux
|
||||
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/
|
||||
title: HIP SDK on Windows
|
||||
- url: https://rocm.docs.amd.com/projects/radeon/en/latest/index.html
|
||||
title: ROCm on Radeon GPUs
|
||||
- file: how-to/deep-learning-rocm.md
|
||||
title: Deep learning frameworks
|
||||
- file: how-to/build-rocm.rst
|
||||
title: Build ROCm from source
|
||||
|
||||
- caption: How to
|
||||
entries:
|
||||
- file: how-to/rocm-for-ai/index.rst
|
||||
title: Use ROCm for AI
|
||||
- file: preview/release.rst
|
||||
title: Alpha release notes
|
||||
- file: preview/install/index.rst
|
||||
title: Installation
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/install.rst
|
||||
title: Installation
|
||||
- file: how-to/rocm-for-ai/system-health-check.rst
|
||||
title: System health benchmarks
|
||||
- file: how-to/rocm-for-ai/training/index.rst
|
||||
title: Training
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
|
||||
title: Train a model with Megatron-LM
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
|
||||
title: Train a model with PyTorch
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
|
||||
title: Train a model with JAX MaxText
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
|
||||
title: Train a model with LLM Foundry
|
||||
- file: how-to/rocm-for-ai/training/scale-model-training.rst
|
||||
title: Scale model training
|
||||
|
||||
- file: how-to/rocm-for-ai/fine-tuning/index.rst
|
||||
title: Fine-tuning LLMs
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/fine-tuning/overview.rst
|
||||
title: Conceptual overview
|
||||
- file: how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference.rst
|
||||
title: Fine-tuning
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference.rst
|
||||
title: Use a single accelerator
|
||||
- file: how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference.rst
|
||||
title: Use multiple accelerators
|
||||
|
||||
- file: how-to/rocm-for-ai/inference/index.rst
|
||||
title: Inference
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/inference/hugging-face-models.rst
|
||||
title: Run models from Hugging Face
|
||||
- file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
|
||||
title: LLM inference frameworks
|
||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
|
||||
title: vLLM inference performance testing
|
||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
|
||||
title: PyTorch inference performance testing
|
||||
- file: how-to/rocm-for-ai/inference/deploy-your-model.rst
|
||||
title: Deploy your model
|
||||
|
||||
- file: how-to/rocm-for-ai/inference-optimization/index.rst
|
||||
title: Inference optimization
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/inference-optimization/model-quantization.rst
|
||||
- file: how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
|
||||
- file: how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel.md
|
||||
title: Optimize with Composable Kernel
|
||||
- file: how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel.rst
|
||||
title: Optimize Triton kernels
|
||||
- file: how-to/rocm-for-ai/inference-optimization/profiling-and-debugging.rst
|
||||
title: Profile and debug
|
||||
- file: how-to/rocm-for-ai/inference-optimization/workload.rst
|
||||
title: Workload optimization
|
||||
|
||||
- url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
|
||||
title: AI tutorials
|
||||
|
||||
- file: how-to/rocm-for-hpc/index.rst
|
||||
title: Use ROCm for HPC
|
||||
- file: how-to/system-optimization/index.rst
|
||||
title: System optimization
|
||||
- file: how-to/gpu-performance/mi300x.rst
|
||||
title: AMD Instinct MI300X performance guides
|
||||
- file: how-to/system-debugging.md
|
||||
- file: conceptual/compiler-topics.md
|
||||
title: Use advanced compiler features
|
||||
subtrees:
|
||||
- entries:
|
||||
- url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html
|
||||
title: ROCm compiler infrastructure
|
||||
- url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html
|
||||
title: Use AddressSanitizer
|
||||
- url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/openmp.html
|
||||
title: OpenMP support
|
||||
- file: how-to/setting-cus
|
||||
title: Set the number of CUs
|
||||
- file: how-to/Bar-Memory.rst
|
||||
title: Troubleshoot BAR access limitation
|
||||
- url: https://github.com/amd/rocm-examples
|
||||
title: ROCm examples
|
||||
|
||||
|
||||
- caption: Conceptual
|
||||
entries:
|
||||
- file: conceptual/gpu-arch.md
|
||||
title: GPU architecture overview
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: conceptual/gpu-arch/mi300.md
|
||||
title: MI300 microarchitecture
|
||||
subtrees:
|
||||
- entries:
|
||||
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
|
||||
title: AMD Instinct MI300/CDNA3 ISA
|
||||
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
|
||||
title: White paper
|
||||
- file: conceptual/gpu-arch/mi300-mi200-performance-counters.rst
|
||||
title: MI300 and MI200 Performance counter
|
||||
- file: conceptual/gpu-arch/mi250.md
|
||||
title: MI250 microarchitecture
|
||||
subtrees:
|
||||
- entries:
|
||||
- url: https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
|
||||
title: AMD Instinct MI200/CDNA2 ISA
|
||||
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
|
||||
title: White paper
|
||||
- file: conceptual/gpu-arch/mi100.md
|
||||
title: MI100 microarchitecture
|
||||
subtrees:
|
||||
- entries:
|
||||
- url: https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf
|
||||
title: AMD Instinct MI100/CDNA1 ISA
|
||||
- url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
|
||||
title: White paper
|
||||
- file: conceptual/file-reorg.md
|
||||
title: File structure (Linux FHS)
|
||||
- file: conceptual/gpu-isolation.md
|
||||
title: GPU isolation techniques
|
||||
- file: conceptual/cmake-packages.rst
|
||||
title: Using CMake
|
||||
- file: conceptual/ai-pytorch-inception.md
|
||||
title: Inception v3 with PyTorch
|
||||
|
||||
- caption: Reference
|
||||
entries:
|
||||
- file: reference/api-libraries.md
|
||||
title: ROCm libraries
|
||||
- file: reference/rocm-tools.md
|
||||
title: ROCm tools, compilers, and runtimes
|
||||
- file: reference/gpu-arch-specs.rst
|
||||
- file: reference/gpu-atomics-operation.rst
|
||||
- file: reference/precision-support.rst
|
||||
title: Precision support
|
||||
- file: reference/graph-safe-support.rst
|
||||
title: Graph safe support
|
||||
|
||||
- caption: Contribute
|
||||
entries:
|
||||
- file: contribute/contributing.md
|
||||
title: Contributing to the ROCm documentation
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: contribute/toolchain.md
|
||||
title: ROCm documentation toolchain
|
||||
- file: contribute/building.md
|
||||
- file: contribute/feedback.md
|
||||
title: Providing feedback about the ROCm documentation
|
||||
- file: about/license.md
|
||||
title: ROCm licenses
|
||||
- file: preview/install/rocm
|
||||
title: Install ROCm
|
||||
- file: preview/install/instinct-driver
|
||||
title: Install Instinct Driver
|
||||
# - entries:
# - file: what-is-rocm.rst
# - file: about/release-notes.md
# title: Release notes
# - file: compatibility/compatibility-matrix.rst
# title: Compatibility matrix
# entries:
# - url: https://rocm.docs.amd.com/projects/install-on-linux-internal/en/latest/reference/system-requirements.html
# title: Linux system requirements
# - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
# title: Windows system requirements
#
# - caption: Install
# entries:
# - url: https://rocm.docs.amd.com/projects/install-on-linux-internal/en/latest/
# title: ROCm on Linux
# - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/
# title: HIP SDK on Windows
# - url: https://rocm.docs.amd.com/projects/radeon/en/latest/index.html
# title: ROCm on Radeon GPUs
# - file: how-to/deep-learning-rocm.md
# title: Deep learning frameworks
# - file: how-to/build-rocm.rst
# title: Build ROCm from source
#
# - caption: How to
# entries:
# - file: how-to/rocm-for-ai/index.rst
# title: Use ROCm for AI
# subtrees:
# - entries:
# - file: how-to/rocm-for-ai/install.rst
# title: Installation
# - file: how-to/rocm-for-ai/system-health-check.rst
# title: System health benchmarks
# - file: how-to/rocm-for-ai/training/index.rst
# title: Training
# subtrees:
# - entries:
# - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
# title: Train a model with Megatron-LM
# - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
# title: Train a model with PyTorch
# - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
# title: Train a model with JAX MaxText
# - file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
# title: Train a model with LLM Foundry
# - file: how-to/rocm-for-ai/training/scale-model-training.rst
# title: Scale model training
#
# - file: how-to/rocm-for-ai/fine-tuning/index.rst
# title: Fine-tuning LLMs
# subtrees:
# - entries:
# - file: how-to/rocm-for-ai/fine-tuning/overview.rst
# title: Conceptual overview
# - file: how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference.rst
# title: Fine-tuning
# subtrees:
# - entries:
# - file: how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference.rst
# title: Use a single accelerator
# - file: how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference.rst
# title: Use multiple accelerators
#
# - file: how-to/rocm-for-ai/inference/index.rst
# title: Inference
# subtrees:
# - entries:
# - file: how-to/rocm-for-ai/inference/hugging-face-models.rst
# title: Run models from Hugging Face
# - file: how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
# title: LLM inference frameworks
# - file: how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
# title: vLLM inference performance testing
# - file: how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
# title: PyTorch inference performance testing
# - file: how-to/rocm-for-ai/inference/deploy-your-model.rst
# title: Deploy your model
#
# - file: how-to/rocm-for-ai/inference-optimization/index.rst
# title: Inference optimization
# subtrees:
# - entries:
# - file: how-to/rocm-for-ai/inference-optimization/model-quantization.rst
# - file: how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
# - file: how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel.md
# title: Optimize with Composable Kernel
# - file: how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel.rst
# title: Optimize Triton kernels
# - file: how-to/rocm-for-ai/inference-optimization/profiling-and-debugging.rst
# title: Profile and debug
# - file: how-to/rocm-for-ai/inference-optimization/workload.rst
# title: Workload optimization
#
# - url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
# title: AI tutorials
#
# - file: how-to/rocm-for-hpc/index.rst
# title: Use ROCm for HPC
# - file: how-to/system-optimization/index.rst
# title: System optimization
# - file: how-to/gpu-performance/mi300x.rst
# title: AMD Instinct MI300X performance guides
# - file: how-to/system-debugging.md
# - file: conceptual/compiler-topics.md
# title: Use advanced compiler features
# subtrees:
# - entries:
# - url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/index.html
# title: ROCm compiler infrastructure
# - url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/using-gpu-sanitizer.html
# title: Use AddressSanitizer
# - url: https://rocm.docs.amd.com/projects/llvm-project/en/latest/conceptual/openmp.html
# title: OpenMP support
# - file: how-to/setting-cus
# title: Set the number of CUs
# - file: how-to/Bar-Memory.rst
# title: Troubleshoot BAR access limitation
# - url: https://github.com/amd/rocm-examples
# title: ROCm examples
#
#
# - caption: Conceptual
# entries:
# - file: conceptual/gpu-arch.md
# title: GPU architecture overview
# subtrees:
# - entries:
# - file: conceptual/gpu-arch/mi300.md
# title: MI300 microarchitecture
# subtrees:
# - entries:
# - url: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
# title: AMD Instinct MI300/CDNA3 ISA
# - url: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
# title: White paper
# - file: conceptual/gpu-arch/mi300-mi200-performance-counters.rst
# title: MI300 and MI200 Performance counter
# - file: conceptual/gpu-arch/mi250.md
# title: MI250 microarchitecture
# subtrees:
# - entries:
# - url: https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf
# title: AMD Instinct MI200/CDNA2 ISA
# - url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
# title: White paper
# - file: conceptual/gpu-arch/mi100.md
# title: MI100 microarchitecture
# subtrees:
# - entries:
# - url: https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf
# title: AMD Instinct MI100/CDNA1 ISA
# - url: https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
# title: White paper
# - file: conceptual/file-reorg.md
# title: File structure (Linux FHS)
# - file: conceptual/gpu-isolation.md
# title: GPU isolation techniques
# - file: conceptual/cmake-packages.rst
# title: Using CMake
# - file: conceptual/ai-pytorch-inception.md
# title: Inception v3 with PyTorch
#
# - caption: Reference
# entries:
# - file: reference/api-libraries.md
# title: ROCm libraries
# - file: reference/rocm-tools.md
# title: ROCm tools, compilers, and runtimes
# - file: reference/gpu-arch-specs.rst
# - file: reference/gpu-atomics-operation.rst
# - file: reference/precision-support.rst
# title: Precision support
# - file: reference/graph-safe-support.rst
# title: Graph safe support
#
# - caption: Contribute
# entries:
# - file: contribute/contributing.md
# title: Contributing to the ROCm documentation
# subtrees:
# - entries:
# - file: contribute/toolchain.md
# title: ROCm documentation toolchain
# - file: contribute/building.md
# - file: contribute/feedback.md
# title: Providing feedback about the ROCm documentation
# - file: about/license.md
# title: ROCm licenses

@@ -1,4 +1,4 @@
rocm-docs-core==1.20.1
sphinx-reredirects
sphinx-sitemap
sphinxcontrib.datatemplates==0.11.0
git+https://github.com/ROCm/rocm-docs-core.git@alexxu12/header-cap-space#egg=rocm-docs-core

@@ -21,9 +21,11 @@ babel==2.17.0
# sphinx
beautifulsoup4==4.13.4
# via pydata-sphinx-theme
blinker==1.9.0
# via flask
breathe==4.36.0
# via rocm-docs-core
certifi==2025.4.26
certifi==2025.6.15
# via requests
cffi==1.17.1
# via
@@ -33,11 +35,12 @@ charset-normalizer==3.4.2
# via requests
click==8.2.1
# via
# flask
# jupyter-cache
# sphinx-external-toc
comm==0.2.2
# via ipykernel
cryptography==45.0.3
cryptography==45.0.4
# via pyjwt
debugpy==1.8.14
# via ipykernel
@@ -60,6 +63,8 @@ fastjsonschema==2.21.1
# via
# nbformat
# rocm-docs-core
flask==3.1.1
# via sphinx-sitemap
gitdb==4.0.12
# via gitpython
gitpython==3.1.44
@@ -80,10 +85,13 @@ ipython==8.37.0
# via
# ipykernel
# myst-nb
itsdangerous==2.2.0
# via flask
jedi==0.19.2
# via ipython
jinja2==3.1.6
# via
# flask
# myst-parser
# sphinx
jsonschema==4.24.0
@@ -107,7 +115,10 @@ markdown-it-py==3.0.0
# mdit-py-plugins
# myst-parser
markupsafe==3.0.2
# via jinja2
# via
# flask
# jinja2
# werkzeug
matplotlib-inline==0.1.7
# via
# ipykernel
@@ -134,7 +145,6 @@ nest-asyncio==1.6.0
packaging==25.0
# via
# ipykernel
# pydata-sphinx-theme
# sphinx
parso==0.8.4
# via jedi
@@ -152,13 +162,13 @@ pure-eval==0.2.3
# via stack-data
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.15.4
pydata-sphinx-theme==0.16.1
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.6.1
# via rocm-docs-core
pygments==2.19.1
pygments==2.19.2
# via
# accessible-pygments
# ipython
@@ -178,7 +188,7 @@ pyyaml==6.0.2
# rocm-docs-core
# sphinx-external-toc
# sphinxcontrib-datatemplates
pyzmq==26.4.0
pyzmq==27.0.0
# via
# ipykernel
# jupyter-client
@@ -190,7 +200,8 @@ requests==2.32.4
# via
# pygithub
# sphinx
rocm-docs-core==1.20.1
# sphinx-sitemap
rocm-docs-core @ git+https://github.com/ROCm/rocm-docs-core.git@alexxu12/header-cap-space
# via -r requirements.in
rpds-py==0.25.1
# via
@@ -215,12 +226,12 @@ sphinx==8.1.3
# sphinx-copybutton
# sphinx-design
# sphinx-external-toc
# sphinx-last-updated-by-git
# sphinx-notfound-page
# sphinx-reredirects
# sphinx-sitemap
# sphinxcontrib-datatemplates
# sphinxcontrib-runcmd
sphinx-book-theme==1.1.4
sphinx-book-theme==1.1.3
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
@@ -228,11 +239,13 @@ sphinx-design==0.6.1
# via rocm-docs-core
sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-last-updated-by-git==0.3.8
# via sphinx-sitemap
sphinx-notfound-page==1.1.0
# via rocm-docs-core
sphinx-reredirects==0.1.6
# via -r requirements.in
sphinx-sitemap==2.6.0
sphinx-sitemap==2.7.1
# via -r requirements.in
sphinxcontrib-applehelp==2.0.0
# via sphinx
@@ -288,6 +301,8 @@ urllib3==2.5.0
# requests
wcwidth==0.2.13
# via prompt-toolkit
werkzeug==3.1.3
# via flask
wrapt==1.17.2
# via deprecated
zipp==3.23.0