mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-09 14:48:06 -05:00
Compare commits
42 Commits
docs/7.1.1
...
develop
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
773f5de407 | ||
|
|
b297ced032 | ||
|
|
2dc22ca890 | ||
|
|
85102079ed | ||
|
|
ba95e0e689 | ||
|
|
1691d369e9 | ||
|
|
172b0f7c08 | ||
|
|
c67fac78bd | ||
|
|
e0b8ec4dfb | ||
|
|
38f2d043dc | ||
|
|
3a43bacdda | ||
|
|
48d8fe139b | ||
|
|
7455fe57b8 | ||
|
|
52c0a47e84 | ||
|
|
cbab9a465d | ||
|
|
459283da3c | ||
|
|
1b4f25733d | ||
|
|
b287372be5 | ||
|
|
78e8baf147 | ||
|
|
3e0c8b47e3 | ||
|
|
c3f0b99cc0 | ||
|
|
c9d1679486 | ||
|
|
fdbef17d7b | ||
|
|
6592a41a7f | ||
|
|
65a936023b | ||
|
|
2a64949081 | ||
|
|
0a17434517 | ||
|
|
2be7e5ac1e | ||
|
|
ae80c4a31c | ||
|
|
dd89a692e1 | ||
|
|
bf74351e5a | ||
|
|
f2067767e0 | ||
|
|
effd4174fb | ||
|
|
453751a86f | ||
|
|
fb644412d5 | ||
|
|
e8fdc34b71 | ||
|
|
b4031ef23c | ||
|
|
d0bd4e6f03 | ||
|
|
0056b9453e | ||
|
|
3d1ad79766 | ||
|
|
8683bed11b | ||
|
|
847cd7c423 |
@@ -34,6 +34,7 @@ parameters:
|
||||
default:
|
||||
- cmake
|
||||
- libnuma-dev
|
||||
- libsimde-dev
|
||||
- mesa-common-dev
|
||||
- ninja-build
|
||||
- ocl-icd-libopencl1
|
||||
|
||||
@@ -39,6 +39,7 @@ parameters:
|
||||
- python3
|
||||
- python3-dev
|
||||
- python3-pip
|
||||
- python3-venv
|
||||
- libgtest-dev
|
||||
- libboost-filesystem-dev
|
||||
- libboost-program-options-dev
|
||||
@@ -46,6 +47,8 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- nanobind>=2.0.0
|
||||
- pytest
|
||||
- pytest-cov
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
@@ -72,8 +75,10 @@ parameters:
|
||||
- { os: ubuntu2204, packageManager: apt }
|
||||
- { os: almalinux8, packageManager: dnf }
|
||||
testJobs:
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
|
||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1151 }
|
||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
|
||||
- name: downstreamComponentMatrix
|
||||
type: object
|
||||
default:
|
||||
@@ -116,6 +121,11 @@ jobs:
|
||||
parameters:
|
||||
dependencyList:
|
||||
- gtest
|
||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
dependencyList:
|
||||
- catch2
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
@@ -137,6 +147,7 @@ jobs:
|
||||
-DORIGAMI_BUILD_SHARED_LIBS=ON
|
||||
-DORIGAMI_ENABLE_PYTHON=ON
|
||||
-DORIGAMI_BUILD_TESTING=ON
|
||||
-DORIGAMI_ENABLE_FETCH=ON
|
||||
-GNinja
|
||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
||||
- task: PublishPipelineArtifact@1
|
||||
@@ -169,7 +180,6 @@ jobs:
|
||||
dependsOn: origami_build_${{ job.os }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
@@ -180,30 +190,30 @@ jobs:
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
dependencyList:
|
||||
- gtest
|
||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
dependencyList:
|
||||
- catch2
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
- task: DownloadPipelineArtifact@2
|
||||
displayName: 'Download Build Directory Artifact'
|
||||
inputs:
|
||||
artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
|
||||
path: '$(Agent.BuildDirectory)/s/build'
|
||||
- task: DownloadPipelineArtifact@2
|
||||
displayName: 'Download Python Source Artifact'
|
||||
inputs:
|
||||
artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
|
||||
path: '$(Agent.BuildDirectory)/s/python'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
@@ -212,25 +222,72 @@ jobs:
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- task: CMake@1
|
||||
displayName: 'Origami Test CMake Configuration'
|
||||
inputs:
|
||||
cmakeArgs: >-
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DORIGAMI_BUILD_SHARED_LIBS=ON
|
||||
-DORIGAMI_ENABLE_PYTHON=ON
|
||||
-DORIGAMI_BUILD_TESTING=ON
|
||||
-GNinja
|
||||
$(Agent.BuildDirectory)/s
|
||||
- task: Bash@3
|
||||
displayName: 'Build Origami Tests and Python Bindings'
|
||||
inputs:
|
||||
targetType: inline
|
||||
workingDirectory: build
|
||||
script: |
|
||||
cmake --build . --target origami-tests origami_python -- -j$(nproc)
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
# Run tests using CTest (discovers and runs both C++ and Python tests)
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './origami-tests'
|
||||
testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
- script: |
|
||||
set -e
|
||||
export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
|
||||
|
||||
echo "--- Running origami_test.py ---"
|
||||
python3 $(Agent.BuildDirectory)/s/python/origami_test.py
|
||||
|
||||
echo "--- Running origami_grid_test.py ---"
|
||||
python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
|
||||
displayName: 'Run Python Binding Tests'
|
||||
condition: succeeded()
|
||||
testDir: 'build'
|
||||
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
# Test pip install workflow
|
||||
# - task: Bash@3
|
||||
# displayName: 'Test Pip Install'
|
||||
# inputs:
|
||||
# targetType: inline
|
||||
# script: |
|
||||
# set -e
|
||||
|
||||
# echo "==================================================================="
|
||||
# echo "Testing pip install workflow (pip install -e .)"
|
||||
# echo "==================================================================="
|
||||
|
||||
# # Set environment variables for pip install CMake build
|
||||
# export ROCM_PATH=$(Agent.BuildDirectory)/rocm
|
||||
# export CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm:$(Agent.BuildDirectory)/vendor
|
||||
# export CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
|
||||
# echo "ROCM_PATH: $ROCM_PATH"
|
||||
# echo "CMAKE_PREFIX_PATH: $CMAKE_PREFIX_PATH"
|
||||
# echo "CMAKE_CXX_COMPILER: $CMAKE_CXX_COMPILER"
|
||||
# echo ""
|
||||
|
||||
# # Install from source directory
|
||||
# cd "$(Agent.BuildDirectory)/s/python"
|
||||
# pip install -e .
|
||||
|
||||
# # Verify import works
|
||||
# echo ""
|
||||
# echo "Verifying origami can be imported..."
|
||||
# python3 -c "import origami; print('✓ Successfully imported origami')"
|
||||
|
||||
# # Run pytest on installed package
|
||||
# echo ""
|
||||
# echo "Running pytest tests..."
|
||||
# python3 -m pytest tests/ -v -m "not slow" --tb=short
|
||||
|
||||
# echo ""
|
||||
# echo "==================================================================="
|
||||
# echo "Pip install test completed successfully"
|
||||
# echo "==================================================================="
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
@@ -30,6 +30,7 @@ parameters:
|
||||
- python3-pip
|
||||
- protobuf-compiler
|
||||
- libprotoc-dev
|
||||
- libopencv-dev
|
||||
- name: pipModules
|
||||
type: object
|
||||
default:
|
||||
@@ -64,6 +65,7 @@ parameters:
|
||||
- MIVisionX
|
||||
- rocm_smi_lib
|
||||
- rccl
|
||||
- rocAL
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
@@ -103,6 +105,7 @@ parameters:
|
||||
- MIVisionX
|
||||
- rocm_smi_lib
|
||||
- rccl
|
||||
- rocAL
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
|
||||
@@ -138,6 +138,7 @@ ESXi
|
||||
EP
|
||||
EoS
|
||||
etcd
|
||||
equalto
|
||||
fas
|
||||
FBGEMM
|
||||
FiLM
|
||||
@@ -226,6 +227,8 @@ href
|
||||
Hyperparameters
|
||||
HybridEngine
|
||||
Huggingface
|
||||
Hunyuan
|
||||
HunyuanVideo
|
||||
IB
|
||||
ICD
|
||||
ICT
|
||||
@@ -258,6 +261,7 @@ Ioffe
|
||||
JAX's
|
||||
JAXLIB
|
||||
Jinja
|
||||
js
|
||||
JSON
|
||||
Jupyter
|
||||
KFD
|
||||
@@ -517,13 +521,12 @@ TPS
|
||||
TPU
|
||||
TPUs
|
||||
TSME
|
||||
Taichi
|
||||
Taichi's
|
||||
Tagram
|
||||
TensileLite
|
||||
TensorBoard
|
||||
TensorFlow
|
||||
TensorParallel
|
||||
TheRock
|
||||
ToC
|
||||
TorchAudio
|
||||
torchaudio
|
||||
@@ -541,6 +544,7 @@ UAC
|
||||
UC
|
||||
UCC
|
||||
UCX
|
||||
ud
|
||||
UE
|
||||
UIF
|
||||
UMC
|
||||
@@ -852,6 +856,7 @@ pallas
|
||||
parallelization
|
||||
parallelizing
|
||||
param
|
||||
params
|
||||
parameterization
|
||||
passthrough
|
||||
pe
|
||||
@@ -898,6 +903,7 @@ querySelectorAll
|
||||
queueing
|
||||
qwen
|
||||
radeon
|
||||
rc
|
||||
rccl
|
||||
rdc
|
||||
rdma
|
||||
@@ -959,6 +965,7 @@ scalability
|
||||
scalable
|
||||
scipy
|
||||
seealso
|
||||
selectattr
|
||||
selectedTag
|
||||
sendmsg
|
||||
seqs
|
||||
@@ -1062,6 +1069,8 @@ writebacks
|
||||
wrreq
|
||||
wzo
|
||||
xargs
|
||||
xdit
|
||||
xDiT
|
||||
xGMI
|
||||
xPacked
|
||||
xz
|
||||
|
||||
12
CHANGELOG.md
12
CHANGELOG.md
@@ -39,7 +39,11 @@ for a complete overview of this release.
|
||||
- VMs were incorrectly reporting `AMDSMI_STATUS_API_FAILED` when unable to get the power cap within the `amdsmi_get_power_info`.
|
||||
- The API now returns `N/A` or `UINT_MAX` for values that can't be retrieved, instead of failing.
|
||||
|
||||
- Fixed output for `amd-smi xgmi -l --json`.
|
||||
- Fixed output for `amd-smi xgmi -l --json`.
|
||||
|
||||
```{note}
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md#amd_smi_lib-for-rocm-711) for details, examples, and in-depth descriptions.
|
||||
```
|
||||
|
||||
### **Composable Kernel** (1.1.0)
|
||||
|
||||
@@ -681,7 +685,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
|
||||
* `Compute Throughput` panel to TUI's `High Level Analysis` category with the following metrics: VALU FLOPs, VALU IOPs, MFMA FLOPs (F8), MFMA FLOPs (BF16), MFMA FLOPs (F16), MFMA FLOPs (F32), MFMA FLOPs (F64), MFMA FLOPs (F6F4) (in gfx950), MFMA IOPs (Int8), SALU Utilization, VALU Utilization, MFMA Utilization, VMEM Utilization, Branch Utilization, IPC
|
||||
|
||||
* `Memory Throughput` panel to TUI's `High Level Analysis` category with the following metrics: vL1D Cache BW, vL1D Cache Utilization, Theoretical LDS Bandwidth, LDS Utilization, L2 Cache BW, L2 Cache Utilization, L2-Fabric Read BW, L2-Fabric Write BW, sL1D Cache BW, L1I BW, Address Processing Unit Busy, Data-Return Busy, L1I-L2 Bandwidth, sL1D-L2 BW
|
||||
* Roofline support for Debian 12 and Azure Linux 3.0.
|
||||
* Roofline support for Debian 12.
|
||||
* Notice for change in default output format to `rocpd` in a future release
|
||||
* This is displayed when `--format-rocprof-output rocpd` is not used in profile mode
|
||||
|
||||
@@ -1730,8 +1734,8 @@ HIP runtime has the following functional improvements which improves runtime per
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* `__AMDGCN_WAVEFRONT_SIZE__` macro and HIP’s `warpSize` variable as `constexpr` are deprecated and will be disabled in a future release. Users are encouraged to update their code if needed to ensure future compatibility. For more information, see [AMDGCN_WAVEFRONT_SIZE deprecation](#amdgpu-wavefront-size-compiler-macro-deprecation).
|
||||
* The `roc-obj-ls` and `roc-obj-extract` tools are deprecated. To extract all Clang offload bundles into separate code objects use `llvm-objdump --offloading <file>`. For more information, see [Changes to ROCm Object Tooling](#changes-to-rocm-object-tooling).
|
||||
* `__AMDGCN_WAVEFRONT_SIZE__` macro and HIP’s `warpSize` variable as `constexpr` are deprecated and will be disabled in a future release. Users are encouraged to update their code if needed to ensure future compatibility. For more information, see [AMDGCN_WAVEFRONT_SIZE deprecation](https://rocm.docs.amd.com/en/docs-7.0.0/about/release-notes.html#amdgpu-wavefront-size-compiler-macro-deprecation).
|
||||
* The `roc-obj-ls` and `roc-obj-extract` tools are deprecated. To extract all Clang offload bundles into separate code objects use `llvm-objdump --offloading <file>`. For more information, see [Changes to ROCm Object Tooling](https://rocm.docs.amd.com/en/docs-7.0.0/about/release-notes.html#changes-to-rocm-object-tooling).
|
||||
|
||||
### **MIGraphX** (2.13.0)
|
||||
|
||||
|
||||
38
RELEASE.md
38
RELEASE.md
@@ -100,12 +100,13 @@ firmware, AMD GPU drivers, and the ROCm user space software.
|
||||
01.25.16.03<br>
|
||||
01.25.15.04
|
||||
</td>
|
||||
<td rowspan="2" style="vertical-align: middle;">
|
||||
<td>
|
||||
30.20.1<br>
|
||||
30.20.0<br>
|
||||
30.10.2<br>
|
||||
30.10.1<br>
|
||||
30.10</td>
|
||||
30.10
|
||||
</td>
|
||||
<td rowspan="3" style="vertical-align: middle;">8.6.0.K</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@@ -114,6 +115,13 @@ firmware, AMD GPU drivers, and the ROCm user space software.
|
||||
01.25.16.03<br>
|
||||
01.25.15.04
|
||||
</td>
|
||||
<td>
|
||||
30.20.1<br>
|
||||
30.20.0<br>
|
||||
30.10.2<br>
|
||||
30.10.1<br>
|
||||
30.10
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>MI325X<a href="#footnote1"><sup>[1]</sup></a></td>
|
||||
@@ -674,7 +682,7 @@ For a historical overview of ROCm component updates, see the {doc}`ROCm consolid
|
||||
- Fixed output for `amd-smi xgmi -l --json`.
|
||||
|
||||
```{note}
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md#amd_smi_lib-for-rocm-710) for details, examples, and in-depth descriptions.
|
||||
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-7.1/CHANGELOG.md#amd_smi_lib-for-rocm-711) for details, examples, and in-depth descriptions.
|
||||
```
|
||||
|
||||
### **Composable Kernel** (1.1.0)
|
||||
@@ -831,7 +839,7 @@ issues related to individual components, review the [Detailed component changes]
|
||||
|
||||
### RCCL performance degradation on AMD Instinct MI300X GPU with AMD Pollara AI NIC
|
||||
|
||||
If you’re using RCCL on AMD Instinct MI300X GPUs with AMD Pollara AI NIC, you might observe performance degradation for specific collectives and message sizes. The affected collectives are `Scatter`, `AllToAll`, and `AllToAllv`. It's recommended to avoid using RCCL packaged with ROCm 7.1.1. As a workaround, use the {fab}`github`[RCCL `develop` branch](https://github.com/ROCm/rccl/tree/develop), which contains the fix and will be included in a future ROCm release.
|
||||
If you’re using RCCL on AMD Instinct MI300X GPUs with AMD Pollara AI NIC, you might observe performance degradation for specific collectives and message sizes. The affected collectives are `Scatter`, `AllToAll`, and `AllToAllv`. It's recommended to avoid using RCCL packaged with ROCm 7.1.1. As a workaround, use the {fab}`github`[RCCL `develop` branch](https://github.com/ROCm/rccl/tree/develop), which contains the fix and will be included in a future ROCm release. See [GitHub issue #5717](https://github.com/ROCm/ROCm/issues/5717).
|
||||
|
||||
### Segmentation fault in training models using TensorFlow 2.20.0 Docker images
|
||||
|
||||
@@ -839,7 +847,7 @@ Training models `tf2_tfm_resnet50_fp16_train` and `tf2_tfm_resnet50_fp32_train`
|
||||
might fail with a segmentation fault when run on the TensorFlow 2.20.0 Docker
|
||||
image with ROCm 7.1.1. As a workaround, use TensorFlow 2.19.x Docker image for
|
||||
training the models in ROCm 7.1.1. This issue will be fixed in a future ROCm
|
||||
release.
|
||||
release. See [GitHub issue #5718](https://github.com/ROCm/ROCm/issues/5718).
|
||||
|
||||
### AMD SMI CLI triggers repeated kernel errors on GPUs with partitioning support
|
||||
|
||||
@@ -858,27 +866,19 @@ amdgpu 0000:15:00.0: amdgpu: renderD153 partition 1 not valid!
|
||||
These repeated kernel logs can clutter the system logs and may cause
|
||||
unnecessary concern about GPU health. However, this is a non-functional issue
|
||||
and does not affect AMD SMI functionality or GPU performance. This issue will
|
||||
be fixed in a future ROCm release.
|
||||
be fixed in a future ROCm release. See [GitHub issue #5720](https://github.com/ROCm/ROCm/issues/5720).
|
||||
|
||||
### Excessive bad page logs in AMD GPU Driver (amdgpu)
|
||||
|
||||
Due to partial data corruption of Electrically Erasable Programmable Read-Only Memory (EEPROM) and limited error handling in the AMD GPU Driver(amdgpu), excessive log output might result when querying the reliability, availability, and serviceability (RAS) bad pages. This issue will be fixed in a future AMD GPU Driver(amdgpu) and ROCm release.
|
||||
Due to partial data corruption in the Electrically Erasable Programmable Read-Only Memory (EEPROM) and limited error handling in the AMD GPU Driver (amdgpu), excessive log output might occur when querying the reliability, availability, and serviceability (RAS) bad pages. This issue will be fixed in a future AMD GPU Driver (amdgpu) and ROCm release. See [GitHub issue #5719](https://github.com/ROCm/ROCm/issues/5719).
|
||||
|
||||
### OpenBLAS runtime dependency for hipblaslt-test and hipblaslt-bench
|
||||
### Incorrect results in gemm_ex operations for rocBLAS and hipBLAS
|
||||
|
||||
Running `hipblaslt-test` or `hipblaslt-bench` without installing the OpenBLAS development package results in the following error:
|
||||
```
|
||||
libopenblas.so.0: cannot open shared object file: No such file or directory
|
||||
```
|
||||
As a workaround, first install `libopenblas-dev` or `libopenblas-devel`, depending on the package manager used. The issue will be fixed in a future ROCm release. See [GitHub issue #5639](https://github.com/ROCm/ROCm/issues/5639).
|
||||
Some `gemm_ex` operations with 8-bit input data types (`int8`, `float8`, `bfloat8`) for specific matrix dimensions (K = 1 and number of workgroups > 1) might yield incorrect results. The issue results from incorrect tailloop code that fails to consider workgroup index when calculating valid element size. The issue will be fixed in a future ROCm release. See [GitHub issue #5722](https://github.com/ROCm/ROCm/issues/5722).
|
||||
|
||||
### Reduced precision in gemm_ex operations for rocBLAS and hipBLAS
|
||||
### hipBLASLt performance variation for a particular FP8 GEMM operation on AMD Instinct MI325X GPUs
|
||||
|
||||
Some `gemm_ex` operations with `half` or `f32_r` data types might yield 16-bit precision results instead of the expected 32-bit precision when matrix dimensions are m=1 or n=1. The issue results from the optimization that enables `_ex` APIs to use lower precision multiples. It limits the high-precision matrix operations performed in PyTorch with rocBLAS and hipBLAS. The issue will be fixed in a future ROCm release. See [GitHub issue #5640](https://github.com/ROCm/ROCm/issues/5640).
|
||||
|
||||
### RCCL profiler plugin failure with AllToAll operations
|
||||
|
||||
The RCCL profiler plugin `librccl-profiler.so` might fail with a segmentation fault during `AllToAll` collective operations due to improperly assigned point-to-point task function pointers. This leads to invalid memory access and prevents profiling of `AllToAll` performance. Other operations, like `AllReduce`, are unaffected. It's recommended to avoid using the RCCL profiler plugin with `AllToAll` operations until the fix is available. This issue is resolved in the {fab}`github`[RCCL `develop` branch](https://github.com/ROCm/rccl/tree/develop) and will be part of a future ROCm release. See [GitHub issue #5653](https://github.com/ROCm/ROCm/issues/5653).
|
||||
If you’re using hipBLASLt on AMD Instinct MI325X GPUs for large FP8 GEMM operations (such as 9728x8192x65536), you might observe a noticeable performance variation. The issue is currently under investigation and will be fixed in a future ROCm release. See [GitHub issue #5734](https://github.com/ROCm/ROCm/issues/5734).
|
||||
|
||||
## ROCm resolved issues
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
|
||||
,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
|
||||
,"Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
|
||||
,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
|
||||
,,,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
|
||||
,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
|
||||
@@ -33,15 +33,14 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,2.51.1,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.1,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
@@ -68,7 +67,7 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.1.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
@@ -81,12 +80,12 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.1,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.1.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
@@ -97,20 +96,20 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.2.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.1,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.1,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
|
||||
|
@@ -32,7 +32,7 @@ compatibility and system requirements.
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP6
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8"
|
||||
,"Debian 13, 12","Debian 13, 12",Debian 12
|
||||
,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0
|
||||
,,,Azure Linux 3.0
|
||||
,Rocky Linux 9,Rocky Linux 9,
|
||||
,.. _architecture-support-compatibility-matrix:,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
|
||||
@@ -59,7 +59,7 @@ compatibility and system requirements.
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.4.35
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,b5997
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.1,1.22.0,1.20.0
|
||||
,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0
|
||||
@@ -85,7 +85,7 @@ compatibility and system requirements.
|
||||
,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.22.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.0
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.1.0,3.0.0,2.0.0
|
||||
,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
|
||||
@@ -98,12 +98,12 @@ compatibility and system requirements.
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,3.2.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.3
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,3.2.2
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,4.4.0
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.1,5.1.0,4.4.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.32
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,3.3.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.28.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.1.0,2.0.0,1.7.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
|
||||
,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
|
||||
@@ -114,20 +114,20 @@ compatibility and system requirements.
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,6.4.0
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.1,7.1.0,6.4.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
|
||||
,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,25.3.0
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.2.0,26.1.0,25.3.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.2.0,1.1.0
|
||||
,,,
|
||||
PERFORMANCE TOOLS,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.0.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.1,3.3.0,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.1,1.2.0,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.60400
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.60400
|
||||
@@ -155,10 +155,10 @@ compatibility and system requirements.
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#os-compatibility] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`_, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`_, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`_.
|
||||
.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`_, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`_, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`_.
|
||||
.. [#dgl_compat] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
.. [#os-compatibility] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
|
||||
.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
|
||||
.. [#dgl_compat] DGL is only supported on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
.. [#mi325x_KVM] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
@@ -169,44 +169,7 @@ compatibility and system requirements.
|
||||
Operating systems, kernel and Glibc versions
|
||||
*********************************************
|
||||
|
||||
Use this lookup table to confirm which operating system and kernel versions are supported with ROCm.
|
||||
|
||||
.. csv-table::
|
||||
:header: "OS", "Version", "Kernel", "Glibc"
|
||||
:widths: 40, 20, 30, 20
|
||||
:stub-columns: 1
|
||||
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.3, "6.8 [GA], 6.14 [HWE]", 2.39
|
||||
,,
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 [GA], 6.11 [HWE]", 2.39
|
||||
,,
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL10>`_, 10.1, 6.12.0-124, 2.39
|
||||
,10.0, 6.12.0-55, 2.39
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.7, 5.14.0-611, 2.34
|
||||
,9.6, 5.14.0-570, 2.34
|
||||
,9.5, 5.14+, 2.34
|
||||
,9.4, 5.14.0-427, 2.34
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0-553, 2.28
|
||||
,,
|
||||
`SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.4.0-150700.51, 2.38
|
||||
,15 SP6, "6.5.0+, 6.4.0", 2.38
|
||||
,15 SP5, 5.14.21, 2.31
|
||||
,,
|
||||
`Rocky Linux <https://wiki.rockylinux.org/rocky/version/>`_, 9, 5.14.0-570, 2.34
|
||||
,,
|
||||
`Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 10, 6.12.0 (UEK), 2.39
|
||||
,9, 6.12.0 (UEK), 2.34
|
||||
,8, 5.15.0 (UEK), 2.28
|
||||
,,
|
||||
`Debian <https://www.debian.org/download>`_,13, 6.12, 2.35
|
||||
,12, 6.1.0, 2.36
|
||||
,,
|
||||
`Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.92, 2.38
|
||||
,,
|
||||
For detailed information on the operating systems supported on ROCm 7.1.1 and their associated kernel and Glibc versions, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__ and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -238,17 +201,16 @@ Expand for full historical view of:
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see :ref:`supported_distributions` and select the required ROCm version for version specific support.
|
||||
.. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see :ref:`supported_GPUs` and select the required ROCm version for version specific support.
|
||||
.. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
|
||||
.. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
|
||||
.. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
|
||||
.. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
|
||||
.. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
|
||||
.. [#verl_compat-past-60] verl is only supported on ROCm 7.0.0 and 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is only supported on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is only supported on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is only supported on ROCm 6.3.0.
|
||||
.. [#ray_compat-past-60] Ray is only supported on ROCm 7.0.0 and 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is only supported on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is only supported on ROCm 6.4.1.
|
||||
.. [#mi325x_KVM-past-60] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
|
||||
@@ -36,63 +36,9 @@ Support overview
|
||||
- You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
DGL is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__,
|
||||
`ROCm 6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__, and `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI250X
|
||||
|
||||
.. _dgl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
DGL can be used for Graph Learning, and building popular graph models like
|
||||
GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:
|
||||
|
||||
- Recommender systems
|
||||
- Network Optimization and Analysis
|
||||
- 1D (Temporal) and 2D (Image) Classification
|
||||
- Drug Discovery
|
||||
|
||||
For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
* Although multiple use cases of DGL have been tested and verified, a few have been
|
||||
outlined in the `DGL in the Real World: Running GNNs on Real Use Cases
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog
|
||||
post, which walks through four real-world graph neural network (GNN) workloads
|
||||
implemented with the Deep Graph Library on ROCm. It covers tasks ranging from
|
||||
heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph
|
||||
regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use
|
||||
case, the authors detail: the dataset and task, how DGL is used, and their experience
|
||||
porting to ROCm. It is shown that DGL codebases often run without modification, with
|
||||
seamless integration of graph operations, message passing, sampling, and convolution.
|
||||
|
||||
* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__
|
||||
blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform,
|
||||
bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges
|
||||
the gap between dense tensor frameworks and the irregular nature of graph data through a
|
||||
graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and
|
||||
interoperability across frameworks like PyTorch and TensorFlow. AMD's ROCm integration
|
||||
enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers
|
||||
and open-source repositories. This marks a major step in AMD's mission to advance open,
|
||||
scalable AI ecosystems beyond traditional architectures.
|
||||
|
||||
You can pre-process datasets and begin training on AMD GPUs through:
|
||||
|
||||
* Single-GPU training/inference
|
||||
* Multi-GPU training
|
||||
|
||||
.. _dgl-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
@@ -114,6 +60,7 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -124,6 +71,7 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- `2.8.0 <https://github.com/pytorch/pytorch/releases/tag/v2.8.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -134,6 +82,7 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -144,6 +93,7 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- `2.7.1 <https://github.com/pytorch/pytorch/releases/tag/v2.7.1>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -154,6 +104,7 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -164,6 +115,7 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -174,7 +126,7 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -185,7 +137,7 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -196,7 +148,10 @@ Click the |docker-icon| to view the image on Docker Hub.
|
||||
- `2.3.0 <https://github.com/pytorch/pytorch/releases/tag/v2.3.0>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
|
||||
- MI300X, MI250X
|
||||
|
||||
|
||||
.. _dgl-key-rocm-libraries:
|
||||
|
||||
Key ROCm libraries for DGL
|
||||
================================================================================
|
||||
@@ -310,8 +265,9 @@ If you prefer to build it yourself, ensure the following dependencies are instal
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
|
||||
.. _dgl-supported-features-latest:
|
||||
|
||||
Supported features
|
||||
Supported features with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
Many functions and methods available upstream are also supported in DGL on ROCm.
|
||||
@@ -335,14 +291,17 @@ Instead of listing them all, support is grouped into the following categories to
|
||||
* DGL Sparse
|
||||
* GraphBolt
|
||||
|
||||
Unsupported features
|
||||
.. _dgl-unsupported-features-latest:
|
||||
|
||||
Unsupported features with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
* TF32 Support (only supported for PyTorch 2.7 and above)
|
||||
* Kineto/ROCTracer integration
|
||||
|
||||
.. _dgl-unsupported-functions:
|
||||
|
||||
Unsupported functions
|
||||
Unsupported functions with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
* ``bfs``
|
||||
@@ -355,6 +314,50 @@ Unsupported functions
|
||||
* ``sample_labors_noprob``
|
||||
* ``sparse_admin``
|
||||
|
||||
.. _dgl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
DGL can be used for Graph Learning, and building popular graph models like
|
||||
GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:
|
||||
|
||||
- Recommender systems
|
||||
- Network Optimization and Analysis
|
||||
- 1D (Temporal) and 2D (Image) Classification
|
||||
- Drug Discovery
|
||||
|
||||
For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
* Although multiple use cases of DGL have been tested and verified, a few have been
|
||||
outlined in the `DGL in the Real World: Running GNNs on Real Use Cases
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog
|
||||
post, which walks through four real-world graph neural network (GNN) workloads
|
||||
implemented with the Deep Graph Library on ROCm. It covers tasks ranging from
|
||||
heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph
|
||||
regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use
|
||||
case, the authors detail: the dataset and task, how DGL is used, and their experience
|
||||
porting to ROCm. It is shown that DGL codebases often run without modification, with
|
||||
seamless integration of graph operations, message passing, sampling, and convolution.
|
||||
|
||||
* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__
|
||||
blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform,
|
||||
bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges
|
||||
the gap between dense tensor frameworks and the irregular nature of graph data through a
|
||||
graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and
|
||||
interoperability across frameworks like PyTorch and TensorFlow. AMD's ROCm integration
|
||||
enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers
|
||||
and open-source repositories. This marks a major step in AMD's mission to advance open,
|
||||
scalable AI ecosystems beyond traditional architectures.
|
||||
|
||||
You can pre-process datasets and begin training on AMD GPUs through:
|
||||
|
||||
* Single-GPU training/inference
|
||||
* Multi-GPU training
|
||||
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/dgl-history` to find documentation for previous releases.
|
||||
|
||||
@@ -42,38 +42,9 @@ Support overview
|
||||
- You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
FlashInfer is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X
|
||||
|
||||
|
||||
.. _flashinfer-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
This release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
|
||||
In the decode phase, tokens are generated sequentially, with the model predicting each new
|
||||
token based on the previously generated tokens and the input context.
|
||||
|
||||
FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense
|
||||
attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
|
||||
|
||||
Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm
|
||||
also implements cascade attention from upstream to reduce memory usage.
|
||||
|
||||
For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
.. _flashinfer-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
@@ -95,6 +66,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -104,5 +76,23 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `2.7.1 <https://github.com/ROCm/pytorch/releases/tag/v2.7.1>`__
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X
|
||||
|
||||
.. _flashinfer-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
|
||||
In the decode phase, tokens are generated sequentially, with the model predicting each new
|
||||
token based on the previously generated tokens and the input context.
|
||||
|
||||
FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense
|
||||
attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
|
||||
|
||||
Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm
|
||||
also implements cascade attention from upstream to reduce memory usage.
|
||||
|
||||
For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
|
||||
@@ -36,47 +36,9 @@ Support overview
|
||||
- You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
llama.cpp is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and
|
||||
`ROCm 6.4.x <https://repo.radeon.com/rocm/apt/6.4/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI325X, MI300X, MI210
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
|
||||
|
||||
- Plain C/C++ implementation with no external dependencies
|
||||
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
|
||||
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
|
||||
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
|
||||
|
||||
llama.cpp is also used in a range of real-world applications, including:
|
||||
|
||||
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
|
||||
A simple maze game where AI-controlled agents attempt to trick the player.
|
||||
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
|
||||
A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
|
||||
- Various other AI applications use llama.cpp as their inference engine;
|
||||
for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
|
||||
|
||||
For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
|
||||
blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
|
||||
server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
|
||||
AMD Instinct GPUs within the ROCm ecosystem.
|
||||
|
||||
.. _llama-cpp-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
@@ -106,6 +68,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- llama.cpp
|
||||
- ROCm
|
||||
- Ubuntu
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -119,6 +82,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -132,6 +96,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- 22.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -145,6 +110,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -158,7 +124,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- 22.04
|
||||
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -172,6 +138,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -185,7 +152,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
|
||||
- 22.04
|
||||
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -199,6 +166,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -212,6 +180,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- 22.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -225,7 +194,9 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- 24.04
|
||||
- MI300X, MI210
|
||||
|
||||
.. _llama-cpp-key-rocm-libraries:
|
||||
|
||||
Key ROCm libraries for llama.cpp
|
||||
================================================================================
|
||||
@@ -268,6 +239,36 @@ your corresponding ROCm version.
|
||||
- Can be used to enhance the flash attention performance on AMD compute, by enabling
|
||||
the flag during compile time.
|
||||
|
||||
.. _llama-cpp-uses-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
|
||||
|
||||
- Plain C/C++ implementation with no external dependencies
|
||||
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
|
||||
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
|
||||
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
|
||||
|
||||
llama.cpp is also used in a range of real-world applications, including:
|
||||
|
||||
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
|
||||
A simple maze game where AI-controlled agents attempt to trick the player.
|
||||
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
|
||||
A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
|
||||
- Various other AI applications use llama.cpp as their inference engine;
|
||||
for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
|
||||
|
||||
For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
|
||||
blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
|
||||
server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
|
||||
AMD Instinct GPUs within the ROCm ecosystem.
|
||||
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/llama-cpp-history` to find documentation for previous releases
|
||||
|
||||
@@ -33,19 +33,44 @@ Support overview
|
||||
- You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
.. _megablocks-docker-compat:
|
||||
|
||||
Megablocks is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
Supported models and features
|
||||
--------------------------------------------------------------------------------
|
||||
AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available Megablocks version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Megablocks
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/megablocks/megablocks-0.7.0_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-372ff89b96599019b8f5f9db469c84add2529b713456781fa62eb9a148659ab4"><i class="fab fa-docker fa-lg"></i> rocm/megablocks</a>
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `0.7.0 <https://github.com/databricks/megablocks/releases/tag/v0.7.0>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- MI300X
|
||||
|
||||
Supported models and features with ROCm 6.3.0
|
||||
================================================================================
|
||||
|
||||
This section summarizes the Megablocks features supported by ROCm.
|
||||
|
||||
@@ -77,38 +102,3 @@ It features how to pre-process datasets and how to begin pre-training on AMD GPU
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
.. _megablocks-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available Megablocks version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Megablocks
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/megablocks/megablocks-0.7.0_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-372ff89b96599019b8f5f9db469c84add2529b713456781fa62eb9a148659ab4"><i class="fab fa-docker fa-lg"></i> rocm/megablocks</a>
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `0.7.0 <https://github.com/databricks/megablocks/releases/tag/v0.7.0>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
|
||||
|
||||
@@ -349,7 +349,7 @@ with ROCm.
|
||||
you need to explicitly move audio data (waveform tensor) to GPU using
|
||||
``.to('cuda')``.
|
||||
|
||||
* - `torchtune <https://docs.pytorch.org/torchtune/stable/index.html>`_
|
||||
* - `torchtune <https://meta-pytorch.org/torchtune/stable/index.html>`_
|
||||
- PyTorch-native library designed for fine-tuning large language models
|
||||
(LLMs). Supports the full fine-tuning workflow and offers
|
||||
compatibility with popular production inference systems.
|
||||
@@ -366,7 +366,7 @@ with ROCm.
|
||||
constructing flexible and performant data pipelines, with features still
|
||||
in prototype stage.
|
||||
|
||||
* - `torchrec <https://docs.pytorch.org/torchrec/>`_
|
||||
* - `torchrec <https://meta-pytorch.org/torchrec/>`_
|
||||
- PyTorch domain library for common sparsity and parallelism primitives
|
||||
needed for large-scale recommender systems, enabling authors to train
|
||||
models with large embedding tables shared across many GPUs.
|
||||
@@ -401,25 +401,25 @@ with ROCm.
|
||||
|
||||
Key features and enhancements for PyTorch 2.9 with ROCm 7.1.1
|
||||
================================================================================
|
||||
- Scaled Dot Product Attention (SDPA) upgraded to use AOTriton version 0.11b
|
||||
- Scaled Dot Product Attention (SDPA) upgraded to use AOTriton version 0.11b.
|
||||
|
||||
- Default hipBLASLt support enabled for gfx908 architecture on ROCm 6.3 and later
|
||||
- Default hipBLASLt support enabled for gfx908 architecture on ROCm 6.3 and later.
|
||||
|
||||
- MIOpen now supports channels last memory format for 3D convolutions and batch normalization
|
||||
- MIOpen now supports channels last memory format for 3D convolutions and batch normalization.
|
||||
|
||||
- NHWC convolution operations in MIOpen optimized by eliminating unnecessary transpose operations
|
||||
- NHWC convolution operations in MIOpen optimized by eliminating unnecessary transpose operations.
|
||||
|
||||
- Improved tensor.item() performance by removing redundant synchronization
|
||||
- Improved tensor.item() performance by removing redundant synchronization.
|
||||
|
||||
- Enhanced performance for element-wise operations and reduction kernels
|
||||
- Enhanced performance for element-wise operations and reduction kernels.
|
||||
|
||||
- Added support for grouped GEMM operations through fbgemm_gpu generative AI components
|
||||
- Added support for grouped GEMM operations through fbgemm_gpu generative AI components.
|
||||
|
||||
- Resolved device error in Inductor when using CUDA graph trees with HIP
|
||||
- Resolved device error in Inductor when using CUDA graph trees with HIP.
|
||||
|
||||
- Corrected logsumexp scaling in AOTriton-based SDPA implementation
|
||||
- Corrected logsumexp scaling in AOTriton-based SDPA implementation.
|
||||
|
||||
- Added stream graph capture status validation in memory copy synchronization functions
|
||||
- Added stream graph capture status validation in memory copy synchronization functions.
|
||||
|
||||
Key features and enhancements for PyTorch 2.8 with ROCm 7.1
|
||||
================================================================================
|
||||
|
||||
@@ -12,8 +12,8 @@ Ray compatibility
|
||||
|
||||
Ray is a unified framework for scaling AI and Python applications from your laptop
|
||||
to a full cluster, without changing your code. Ray consists of `a core distributed
|
||||
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
|
||||
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for
|
||||
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`__ and a set of
|
||||
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`__ for
|
||||
simplifying machine learning computations.
|
||||
|
||||
Ray is a general-purpose framework that runs many types of workloads efficiently.
|
||||
@@ -29,25 +29,57 @@ Support overview
|
||||
- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`,
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels
|
||||
<https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
|
||||
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
.. _ray-docker-compat:
|
||||
|
||||
Ray is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI210
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Ray
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.51.1_rocm7.0.0_ubuntu22.04_py3.12_pytorch2.9.0/images/sha256-a02f6766b4ba406f88fd7e85707ec86c04b569834d869a08043ec9bcbd672168"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.51.1 <https://github.com/ROCm/ray/tree/release/2.51.1>`__
|
||||
- 2.9.0a0+git1c57644
|
||||
- 22.04
|
||||
- `3.12.12 <https://www.python.org/downloads/release/python-31212/>`__
|
||||
- MI300X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`__
|
||||
- 2.6.0+git684f6f2
|
||||
- 24.04
|
||||
- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- MI300X, MI210
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
@@ -76,36 +108,7 @@ topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accel
|
||||
of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
.. _ray-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub.
|
||||
Click the |docker-icon| icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Ray
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
|
||||
- 2.6.0+git684f6f2
|
||||
- 24.04
|
||||
- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/ray-history` to find documentation for previous releases
|
||||
of the ``ROCm/ray`` Docker image.
|
||||
|
||||
@@ -35,19 +35,45 @@ Support overview
|
||||
- You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
.. _megatron-lm-docker-compat:
|
||||
|
||||
Stanford Megatron-LM is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
Supported models and features
|
||||
--------------------------------------------------------------------------------
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
|
||||
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Stanford Megatron-LM
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i> rocm/stanford-megatron-lm</a>
|
||||
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- MI300X
|
||||
|
||||
Supported models and features with ROCm 6.3.0
|
||||
================================================================================
|
||||
|
||||
This section details the models and features supported by the ROCm version of Stanford Megatron-LM.
|
||||
|
||||
@@ -88,41 +114,3 @@ It features how to pre-process datasets and how to begin pre-training on AMD GPU
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
.. _megatron-lm-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
|
||||
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Stanford Megatron-LM
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,99 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Taichi compatibility
|
||||
:keywords: GPU, Taichi, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
Taichi compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`Taichi <https://www.taichi-lang.org/>`_ is an open-source, imperative, and parallel
|
||||
programming language designed for high-performance numerical computation.
|
||||
Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate
|
||||
compute-intensive Python code by compiling it to native GPU or CPU instructions.
|
||||
|
||||
Taichi is widely used across various domains, including real-time physical simulation,
|
||||
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
|
||||
visual effects in film and gaming, and general-purpose computing.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of Taichi is maintained in the official `https://github.com/ROCm/taichi
|
||||
<https://github.com/ROCm/taichi>`__ repository, which differs from the
|
||||
`https://github.com/taichi-dev/taichi <https://github.com/taichi-dev/taichi>`__ upstream repository.
|
||||
|
||||
- To get started and install Taichi on ROCm, use the prebuilt :ref:`Docker image <taichi-docker-compat>`,
|
||||
which includes ROCm, Taichi, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/taichi-dev/taichi>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
Taichi is supported on `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`__.
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- **Officially Supported**: AMD Instinct™ MI250X, MI210 (with the exception of Taichi’s GPU rendering system, CGUI)
|
||||
- **Upcoming Support**: AMD Instinct™ MI300X
|
||||
|
||||
.. _taichi-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Accelerating Parallel Programming in Python with Taichi Lang on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/taichi/README.html>`__
|
||||
blog highlights Taichi as an open-source programming language designed for high-performance
|
||||
numerical computation, particularly in domains like real-time physical simulation,
|
||||
artificial intelligence, computer vision, robotics, and visual effects. Taichi
|
||||
is embedded in Python and uses just-in-time (JIT) compilation frameworks like
|
||||
LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility
|
||||
of Taichi in enabling complex simulations and numerical algorithms, making
|
||||
it ideal for developers working on compute-intensive tasks. Developers are
|
||||
encouraged to follow recommended coding patterns and utilize Taichi decorators
|
||||
for performance optimization, with examples available in the `https://github.com/ROCm/taichi_examples
|
||||
<https://github.com/ROCm/taichi_examples>`_ repository. Prebuilt Docker images
|
||||
integrating ROCm, PyTorch, and Taichi are provided for simplified installation
|
||||
and deployment, making it easier to leverage Taichi for advanced computational workloads.
|
||||
|
||||
.. _taichi-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest Taichi version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Taichi
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b1_rocm6.3.2_ubuntu22.04_py3.10.12/images/sha256-e016964a751e6a92199032d23e70fa3a564fff8555afe85cd718f8aa63f11fc6"><i class="fab fa-docker fa-lg"></i> rocm/taichi</a>
|
||||
- `6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_
|
||||
- `1.8.0b1 <https://github.com/taichi-dev/taichi>`_
|
||||
- 22.04
|
||||
- `3.10.12 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
@@ -31,21 +31,70 @@ Support overview
|
||||
- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`,
|
||||
which includes ROCm, verl, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
.. _verl-docker-compat:
|
||||
|
||||
verl is supported on `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
Supported devices
|
||||
--------------------------------------------------------------------------------
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest verl version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- verl
|
||||
- Ubuntu
|
||||
- PyTorch
|
||||
- Python
|
||||
- vllm
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.6.0.amd0_rocm7.0_vllm0.11.0.dev/images/sha256-f70a3ebc94c1f66de42a2fcc3f8a6a8d6d0881eb0e65b6958d7d6d24b3eecb0d"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `0.6.0 <https://github.com/volcengine/verl/releases/tag/v0.6.0>`__
|
||||
- 22.04
|
||||
- `2.9.0 <https://github.com/ROCm/pytorch/tree/release/2.9-rocm7.x-gfx115x>`__
|
||||
- `3.12.11 <https://www.python.org/downloads/release/python-31211/>`__
|
||||
- `0.11.0 <https://github.com/vllm-project/vllm/releases/tag/v0.11.0>`__
|
||||
- MI300X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__
|
||||
- `0.3.0.post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`__
|
||||
- 20.04
|
||||
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
|
||||
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`__
|
||||
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`__
|
||||
- MI300X
|
||||
|
||||
.. _verl-supported_features:
|
||||
|
||||
Supported modules with verl on ROCm
|
||||
===============================================================================
|
||||
|
||||
The following GPU-accelerated modules are supported with verl on ROCm:
|
||||
|
||||
- ``FSDP``: Training engine
|
||||
- ``vllm``: Inference engine
|
||||
|
||||
.. _verl-recommendations:
|
||||
|
||||
@@ -57,66 +106,13 @@ Use cases and recommendations
|
||||
GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog. The blog post outlines how the Volcano Engine Reinforcement Learning
|
||||
(verl) framework integrates with the AMD ROCm platform to optimize training on
|
||||
Instinct™ MI300X GPUs. The guide details the process of building a Docker image,
|
||||
AMD Instinct™ GPUs. The guide details the process of building a Docker image,
|
||||
setting up single-node and multi-node training environments, and highlights
|
||||
performance benchmarks demonstrating improved throughput and convergence accuracy.
|
||||
This resource serves as a comprehensive starting point for deploying verl on AMD GPUs,
|
||||
facilitating efficient RLHF training workflows.
|
||||
|
||||
.. _verl-supported_features:
|
||||
|
||||
Supported features
|
||||
Previous versions
|
||||
===============================================================================
|
||||
|
||||
The following table shows verl on ROCm support for GPU-accelerated modules.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Description
|
||||
- verl version
|
||||
- ROCm version
|
||||
* - ``FSDP``
|
||||
- Training engine
|
||||
- 0.3.0.post0
|
||||
- 6.2.0
|
||||
* - ``vllm``
|
||||
- Inference engine
|
||||
- 0.3.0.post0
|
||||
- 6.2.0
|
||||
|
||||
.. _verl-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest verl version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- verl
|
||||
- Ubuntu
|
||||
- Pytorch
|
||||
- Python
|
||||
- vllm
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`_
|
||||
- `0.3.0post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`_
|
||||
- 20.04
|
||||
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
|
||||
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`_
|
||||
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`_
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/verl-history` to find documentation for previous releases
|
||||
of the ``ROCm/verl`` Docker image.
|
||||
|
||||
28
docs/conf.py
28
docs/conf.py
@@ -111,7 +111,6 @@ article_pages = [
|
||||
{"file": "compatibility/ml-compatibility/stanford-megatron-lm-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/flashinfer-compatibility", "os": ["linux"]},
|
||||
@@ -136,9 +135,15 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
|
||||
@@ -146,13 +151,19 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
|
||||
@@ -177,8 +188,16 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.2-20251006", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
@@ -249,3 +268,6 @@ html_context = {
|
||||
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
|
||||
"scope_type" : [('Device', 'device'), ('System', 'system')]
|
||||
}
|
||||
|
||||
# Disable figure and table numbering
|
||||
numfig = False
|
||||
|
||||
@@ -0,0 +1,316 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_seqs: 1024
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
@@ -0,0 +1,55 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e
|
||||
ROCm: 7.9.0
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
@@ -0,0 +1,109 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
- version: v25-11
|
||||
pull_tag: rocm/pytorch-xdit:v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216
|
||||
ROCm: 7.10.0
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
models:
|
||||
- Hunyuan Video
|
||||
- group: Wan-AI
|
||||
models:
|
||||
- Wan2.1
|
||||
- Wan2.2
|
||||
- group: FLUX
|
||||
models:
|
||||
- FLUX.1
|
||||
whats_new:
|
||||
- "Minor bug fixes and clarifications to READMEs."
|
||||
- "Bumps TheRock, AITER, Diffusers, xDiT versions."
|
||||
- "Changes Aiter rounding mode for faster gfx942 FWD Attention."
|
||||
components:
|
||||
TheRock: 3e3f834
|
||||
rccl: d23d18f
|
||||
composable_kernel: 2570462
|
||||
rocm-libraries: 0588f07
|
||||
rocm-systems: 473025a
|
||||
torch: 73adac
|
||||
torchvision: f5c6c2e
|
||||
triton: 7416ffc
|
||||
accelerate: 34c1779
|
||||
aiter: de14bec
|
||||
diffusers: 40528e9
|
||||
xfuser: 83978b5
|
||||
yunchang: 2c9b712
|
||||
|
||||
- version: v25-10
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
|
||||
ROCm: 7.9.0
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
models:
|
||||
- Hunyuan Video
|
||||
- group: Wan-AI
|
||||
models:
|
||||
- Wan2.1
|
||||
- Wan2.2
|
||||
- group: FLUX
|
||||
models:
|
||||
- FLUX.1
|
||||
whats_new:
|
||||
- "First official xDiT Docker Release for Diffusion Inference."
|
||||
- "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
|
||||
- "Supports Wan 2.1, Wan 2.2, HunyuanVideo, and Flux workloads."
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
page_tag: hunyuan_tag
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
page_tag: wan_21_tag
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
page_tag: wan_22_tag
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
page_tag: flux_1_tag
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
@@ -0,0 +1,91 @@
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.12
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256
|
||||
ROCm: 7.10.0
|
||||
whats_new:
|
||||
- "Adds T2V and TI2V support for Wan models."
|
||||
- "Adds support for SD-3.5 T2I model."
|
||||
components:
|
||||
TheRock:
|
||||
version: 3e3f834
|
||||
url: https://github.com/ROCm/TheRock
|
||||
rccl:
|
||||
version: d23d18f
|
||||
url: https://github.com/ROCm/rccl
|
||||
composable_kernel:
|
||||
version: 2570462
|
||||
url: https://github.com/ROCm/composable_kernel
|
||||
rocm-libraries:
|
||||
version: 0588f07
|
||||
url: https://github.com/ROCm/rocm-libraries
|
||||
rocm-systems:
|
||||
version: 473025a
|
||||
url: https://github.com/ROCm/rocm-systems
|
||||
torch:
|
||||
version: 73adac
|
||||
url: https://github.com/pytorch/pytorch
|
||||
torchvision:
|
||||
version: f5c6c2e
|
||||
url: https://github.com/pytorch/vision
|
||||
triton:
|
||||
version: 7416ffc
|
||||
url: https://github.com/triton-lang/triton
|
||||
accelerate:
|
||||
version: 34c1779
|
||||
url: https://github.com/huggingface/accelerate
|
||||
aiter:
|
||||
version: de14bec
|
||||
url: https://github.com/ROCm/aiter
|
||||
diffusers:
|
||||
version: 40528e9
|
||||
url: https://github.com/huggingface/diffusers
|
||||
xfuser:
|
||||
version: ccba9d5
|
||||
url: https://github.com/xdit-project/xDiT
|
||||
yunchang:
|
||||
version: 2c9b712
|
||||
url: https://github.com/feifeibear/long-context-attention
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
js_tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
js_tag: hunyuan_tag
|
||||
- group: Wan-AI
|
||||
js_tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
js_tag: wan_21_tag
|
||||
- model: Wan2.2
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
js_tag: wan_22_tag
|
||||
- group: FLUX
|
||||
js_tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
js_tag: flux_1_tag
|
||||
- group: Stable Diffusion
|
||||
js_tag: stablediffusion
|
||||
models:
|
||||
- model: stable-diffusion-3.5-large
|
||||
model_repo: stabilityai/stable-diffusion-3.5-large
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
|
||||
github: https://github.com/Stability-AI/sd3.5
|
||||
mad_tag: pyt_xdit_sd_3_5
|
||||
js_tag: stable_diffusion_3_5_large_tag
|
||||
@@ -1,13 +1,13 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.2_20251210/images/sha256-e7f02dd2ce3824959658bc0391296f6158638e3ebce164f6c019c4eca8150ec7
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
|
||||
vLLM: 0.11.2 (0.11.2.dev673+g839868462.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
|
||||
commit: 8398684622109c806a35d660647060b0b9910663
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -0,0 +1,105 @@
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.13
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
|
||||
ROCm: 7.11.0
|
||||
whats_new:
|
||||
- "Flux.1 Kontext support"
|
||||
- "Flux.2 Dev support"
|
||||
- "Flux FP8 GEMM support"
|
||||
- "Hybrid FP8 attention support for Wan models"
|
||||
components:
|
||||
TheRock:
|
||||
version: 1728a81
|
||||
url: https://github.com/ROCm/TheRock
|
||||
rccl:
|
||||
version: d23d18f
|
||||
url: https://github.com/ROCm/rccl
|
||||
composable_kernel:
|
||||
version: ab0101c
|
||||
url: https://github.com/ROCm/composable_kernel
|
||||
rocm-libraries:
|
||||
version: a2f7c35
|
||||
url: https://github.com/ROCm/rocm-libraries
|
||||
rocm-systems:
|
||||
version: 659737c
|
||||
url: https://github.com/ROCm/rocm-systems
|
||||
torch:
|
||||
version: 91be249
|
||||
url: https://github.com/ROCm/pytorch
|
||||
torchvision:
|
||||
version: b919bd0
|
||||
url: https://github.com/pytorch/vision
|
||||
triton:
|
||||
version: a272dfa
|
||||
url: https://github.com/ROCm/triton
|
||||
accelerate:
|
||||
version: b521400f
|
||||
url: https://github.com/huggingface/accelerate
|
||||
aiter:
|
||||
version: de14bec0
|
||||
url: https://github.com/ROCm/aiter
|
||||
diffusers:
|
||||
version: a1f36ee3e
|
||||
url: https://github.com/huggingface/diffusers
|
||||
xfuser:
|
||||
version: adf2681
|
||||
url: https://github.com/xdit-project/xDiT
|
||||
yunchang:
|
||||
version: 2c9b712
|
||||
url: https://github.com/feifeibear/long-context-attention
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
js_tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
js_tag: hunyuan_tag
|
||||
- group: Wan-AI
|
||||
js_tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
js_tag: wan_21_tag
|
||||
- model: Wan2.2
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
js_tag: wan_22_tag
|
||||
- group: FLUX
|
||||
js_tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
js_tag: flux_1_tag
|
||||
- model: FLUX.1 Kontext
|
||||
model_repo: black-forest-labs/FLUX.1-Kontext-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux_kontext
|
||||
js_tag: flux_1_kontext_tag
|
||||
- model: FLUX.2
|
||||
model_repo: black-forest-labs/FLUX.2-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.2-dev
|
||||
github: https://github.com/black-forest-labs/flux2
|
||||
mad_tag: pyt_xdit_flux_2
|
||||
js_tag: flux_2_tag
|
||||
- group: StableDiffusion
|
||||
js_tag: stablediffusion
|
||||
models:
|
||||
- model: stable-diffusion-3.5-large
|
||||
model_repo: stabilityai/stable-diffusion-3.5-large
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
|
||||
github: https://github.com/Stability-AI/sd3.5
|
||||
mad_tag: pyt_xdit_sd_3_5
|
||||
js_tag: stable_diffusion_3_5_large_tag
|
||||
@@ -1,12 +1,12 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.9
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.11/images/sha256-18e4d8f0b8ce7a7422c58046940dd5f32249960449fca09a562b65fb8eb1562a
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
JAX: 0.6.2
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+c91bac54
|
||||
hipBLASLt: 1.x.x
|
||||
ROCm: 7.1.0
|
||||
JAX: 0.7.1
|
||||
Python: 3.12
|
||||
Transformer Engine: 2.4.0.dev0+281042de
|
||||
hipBLASLt: 1.2.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -1,21 +1,17 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.9.1
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9.1/images/sha256-60946cfbd470f6ee361fc9da740233a4fb2e892727f01719145b1f7627a1cff6
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
JAX: 0.6.2
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+c91bac54
|
||||
hipBLASLt: 1.x.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 7B
|
||||
mad_tag: jax_maxtext_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_7b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: jax_maxtext_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_70b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 3 8B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_training_script: llama3_8b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_training_script: llama3_70b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V2-Lite (16B)
|
||||
mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
|
||||
model_repo: DeepSeek-V2-lite
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: jax_maxtext_train_mixtral-8x7b
|
||||
model_repo: Mixtral-8x7B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
@@ -0,0 +1,49 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||
@@ -0,0 +1,53 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||
@@ -0,0 +1,58 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
config_name: llama3.3_70B-pretrain.yaml
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
config_name: llama3.1_70B-pretrain.yaml
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
config_name: llama3.1_8B-pretrain.yaml
|
||||
- model: Llama 2 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
|
||||
config_name: llama2_7B-pretrain.yaml
|
||||
- model: Llama 2 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
|
||||
config_name: llama2_70B-pretrain.yaml
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
config_name: deepseek_v3-pretrain.yaml
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
config_name: deepseek_v2_lite-pretrain.yaml
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
config_name: mixtral_8x7B_v0.1-pretrain.yaml
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
config_name: mixtral_8x22B_v0.1-pretrain.yaml
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||
config_name: primus_qwen2.5_7B-pretrain.yaml
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
config_name: qwen2.5_72B-pretrain.yaml
|
||||
@@ -0,0 +1,65 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
config_name: llama3.3_70B-pretrain.yaml
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
config_name: llama3.1_70B-pretrain.yaml
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
config_name: llama3.1_8B-pretrain.yaml
|
||||
- model: Llama 2 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
|
||||
config_name: llama2_7B-pretrain.yaml
|
||||
- model: Llama 2 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
|
||||
config_name: llama2_70B-pretrain.yaml
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
config_name: deepseek_v3-pretrain.yaml
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
config_name: deepseek_v2_lite-pretrain.yaml
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
config_name: mixtral_8x7B_v0.1-pretrain.yaml
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
config_name: mixtral_8x22B_v0.1-pretrain.yaml
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||
config_name: primus_qwen2.5_7B-pretrain.yaml
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
config_name: qwen2.5_72B-pretrain.yaml
|
||||
@@ -0,0 +1,32 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V2 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v2
|
||||
model_repo: DeepSeek-V2
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
|
||||
precision: BF16
|
||||
@@ -0,0 +1,39 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_8b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_8b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_70b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_70b_fsdp_fp8.toml"
|
||||
@@ -0,0 +1,197 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 4 Scout 17B-16E
|
||||
mad_tag: pyt_train_llama-4-scout-17b-16e
|
||||
model_repo: Llama-4-17B_16E
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 3.2 1B
|
||||
mad_tag: pyt_train_llama-3.2-1b
|
||||
model_repo: Llama-3.2-1B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 3B
|
||||
mad_tag: pyt_train_llama-3.2-3b
|
||||
model_repo: Llama-3.2-3B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-3B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 Vision 11B
|
||||
mad_tag: pyt_train_llama-3.2-vision-11b
|
||||
model_repo: Llama-3.2-Vision-11B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.2 Vision 90B
|
||||
mad_tag: pyt_train_llama-3.2-vision-90b
|
||||
model_repo: Llama-3.2-Vision-90B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora]
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_train_llama-3.1-405b
|
||||
model_repo: Llama-3.1-405B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B
|
||||
precision: BF16
|
||||
training_modes: [finetune_qlora]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: pyt_train_llama-3-8b
|
||||
model_repo: Llama-3-8B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: pyt_train_llama-3-70b
|
||||
model_repo: Llama-3-70B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 2 13B
|
||||
mad_tag: pyt_train_llama-2-13b
|
||||
model_repo: Llama-2-13B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora, finetune_qlora]
|
||||
- group: OpenAI
|
||||
tag: openai
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_train_gpt_oss_20b
|
||||
model_repo: GPT-OSS-20B
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_train_gpt_oss_120b
|
||||
model_repo: GPT-OSS-120B
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V2 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v2
|
||||
model_repo: DeepSeek-V2
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
|
||||
precision: BF16
|
||||
training_modes: [pretrain]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 3 8B
|
||||
mad_tag: pyt_train_qwen3-8b
|
||||
model_repo: Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 3 32B
|
||||
mad_tag: pyt_train_qwen3-32b
|
||||
model_repo: Qwen3-32
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 32B
|
||||
mad_tag: pyt_train_qwen2.5-32b
|
||||
model_repo: Qwen2.5-32B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_train_qwen2.5-72b
|
||||
model_repo: Qwen2.5-72B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-72B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2 1.5B
|
||||
mad_tag: pyt_train_qwen2-1.5b
|
||||
model_repo: Qwen2-1.5B
|
||||
url: https://huggingface.co/Qwen/Qwen2-1.5B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 2 7B
|
||||
mad_tag: pyt_train_qwen2-7b
|
||||
model_repo: Qwen2-7B
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- group: Stable Diffusion
|
||||
tag: sd
|
||||
models:
|
||||
- model: Stable Diffusion XL
|
||||
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1-dev
|
||||
mad_tag: pyt_train_flux
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/ROCm/FluxBenchmark
|
||||
precision: FP32
|
||||
- group: DLRM
|
||||
tag: dlrm
|
||||
models:
|
||||
- model: DLRM v2
|
||||
mad_tag: pyt_train_dlrm
|
||||
model_repo: DLRM
|
||||
url: https://github.com/AMD-AGI/DLRMBenchmark
|
||||
training_modes: [pretrain]
|
||||
@@ -0,0 +1,186 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 4 Scout 17B-16E
|
||||
mad_tag: pyt_train_llama-4-scout-17b-16e
|
||||
model_repo: Llama-4-17B_16E
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 3.2 1B
|
||||
mad_tag: pyt_train_llama-3.2-1b
|
||||
model_repo: Llama-3.2-1B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 3B
|
||||
mad_tag: pyt_train_llama-3.2-3b
|
||||
model_repo: Llama-3.2-3B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-3B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 Vision 11B
|
||||
mad_tag: pyt_train_llama-3.2-vision-11b
|
||||
model_repo: Llama-3.2-Vision-11B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.2 Vision 90B
|
||||
mad_tag: pyt_train_llama-3.2-vision-90b
|
||||
model_repo: Llama-3.2-Vision-90B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora]
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_train_llama-3.1-405b
|
||||
model_repo: Llama-3.1-405B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B
|
||||
precision: BF16
|
||||
training_modes: [finetune_qlora]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: pyt_train_llama-3-8b
|
||||
model_repo: Llama-3-8B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: pyt_train_llama-3-70b
|
||||
model_repo: Llama-3-70B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 2 13B
|
||||
mad_tag: pyt_train_llama-2-13b
|
||||
model_repo: Llama-2-13B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora, finetune_qlora]
|
||||
- group: OpenAI
|
||||
tag: openai
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_train_gpt_oss_20b
|
||||
model_repo: GPT-OSS-20B
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_train_gpt_oss_120b
|
||||
model_repo: GPT-OSS-120B
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 3 8B
|
||||
mad_tag: pyt_train_qwen3-8b
|
||||
model_repo: Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 3 32B
|
||||
mad_tag: pyt_train_qwen3-32b
|
||||
model_repo: Qwen3-32
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 32B
|
||||
mad_tag: pyt_train_qwen2.5-32b
|
||||
model_repo: Qwen2.5-32B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_train_qwen2.5-72b
|
||||
model_repo: Qwen2.5-72B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-72B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2 1.5B
|
||||
mad_tag: pyt_train_qwen2-1.5b
|
||||
model_repo: Qwen2-1.5B
|
||||
url: https://huggingface.co/Qwen/Qwen2-1.5B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 2 7B
|
||||
mad_tag: pyt_train_qwen2-7b
|
||||
model_repo: Qwen2-7B
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- group: Stable Diffusion
|
||||
tag: sd
|
||||
models:
|
||||
- model: Stable Diffusion XL
|
||||
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1-dev
|
||||
mad_tag: pyt_train_flux
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
|
||||
precision: FP32
|
||||
@@ -1,22 +1,15 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -1,39 +1,32 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_8b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_8b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_70b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_70b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V3 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v3-16b
|
||||
model_repo: DeepSeek-V3
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V3
|
||||
precision: BF16
|
||||
|
||||
@@ -1,21 +1,15 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
@@ -119,6 +113,15 @@ model_groups:
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V2 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v2
|
||||
model_repo: DeepSeek-V2
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
|
||||
precision: BF16
|
||||
training_modes: [pretrain]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
@@ -166,7 +169,7 @@ model_groups:
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
training_modes: [posttrain]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
@@ -175,12 +178,20 @@ model_groups:
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
training_modes: [posttrain]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
|
||||
url: https://github.com/ROCm/FluxBenchmark
|
||||
precision: FP32
|
||||
- group: DLRM
|
||||
tag: dlrm
|
||||
models:
|
||||
- model: DLRM v2
|
||||
mad_tag: pyt_train_dlrm
|
||||
model_repo: DLRM
|
||||
url: https://github.com/AMD-AGI/DLRMBenchmark
|
||||
training_modes: [pretrain]
|
||||
|
||||
@@ -32,7 +32,7 @@ library_groups:
|
||||
|
||||
- name: "MIGraphX"
|
||||
tag: "migraphx"
|
||||
doc_link: "amdmigraphx:reference/cpp"
|
||||
doc_link: "amdmigraphx:reference/MIGraphX-cpp"
|
||||
data_types:
|
||||
- type: "int8"
|
||||
support: "⚠️"
|
||||
@@ -290,7 +290,7 @@ library_groups:
|
||||
|
||||
- name: "Tensile"
|
||||
tag: "tensile"
|
||||
doc_link: "tensile:reference/precision-support"
|
||||
doc_link: "tensile:src/reference/precision-support"
|
||||
data_types:
|
||||
- type: "int8"
|
||||
support: "✅"
|
||||
|
||||
@@ -100,18 +100,6 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
||||
|
||||
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ After loading the model in this way, the model is fully ready to use the resourc
|
||||
torchtune for fine-tuning and inference
|
||||
=============================================
|
||||
|
||||
`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
|
||||
`torchtune <https://meta-pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
|
||||
model fine-tuning and inference with LLMs.
|
||||
|
||||
#. Install torchtune using pip.
|
||||
|
||||
136
docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
Normal file → Executable file
136
docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
Normal file → Executable file
@@ -24,94 +24,102 @@ performance.
|
||||
:alt: Attention module of a large language module utilizing tiling
|
||||
:align: center
|
||||
|
||||
Installation prerequisites
|
||||
----------------------------
|
||||
|
||||
Before installing Flash Attention 2, ensure the following are available:
|
||||
|
||||
* ROCm-enabled PyTorch
|
||||
* Triton
|
||||
|
||||
These can be installed by following the official
|
||||
`PyTorch installation guide <https://pytorch.org/get-started/locally/>`_. Alternatively, for a simpler setup, you can use a preconfigured
|
||||
:ref:`ROCm PyTorch Docker image <using-docker-with-pytorch-pre-installed>`, which already includes the required libraries.
|
||||
|
||||
Installing Flash Attention 2
|
||||
----------------------------
|
||||
|
||||
ROCm provides two different implementations of Flash Attention 2 modules. They can be deployed interchangeably:
|
||||
`Flash Attention <https://github.com/Dao-AILab/flash-attention>`_ supports two backend implementations on AMD GPUs.
|
||||
|
||||
* ROCm `Composable Kernel <https://github.com/ROCm/composable_kernel/tree/develop/example/01_gemm>`_
|
||||
(CK) Flash Attention 2
|
||||
* `Composable Kernel (CK) <https://github.com/ROCm/composable_kernel>`__ - the default backend
|
||||
* `OpenAI Triton <https://github.com/triton-lang/triton>`__ - an alternative backend
|
||||
|
||||
* `OpenAI Triton <https://triton-lang.org/main/index.html>`_ Flash Attention 2
|
||||
You can switch between these backends using the environment variable ``FLASH_ATTENTION_TRITON_AMD_ENABLE``:
|
||||
|
||||
.. tab-set::
|
||||
``FLASH_ATTENTION_TRITON_AMD_ENABLE="FALSE"``
|
||||
→ Use Composable Kernel (CK) backend (Flash Attention 2)
|
||||
|
||||
.. tab-item:: CK Flash Attention 2
|
||||
``FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"``
|
||||
→ Use OpenAI Triton backend (Flash Attention 2)
|
||||
|
||||
To install CK Flash Attention 2, use the following commands.
|
||||
To install Flash Attention 2, use the following commands:
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
# Install from source
|
||||
git clone https://github.com/ROCm/flash-attention.git
|
||||
cd flash-attention/
|
||||
GPU_ARCHS=gfx942 python setup.py install #MI300 Series
|
||||
git clone https://github.com/Dao-AILab/flash-attention.git
|
||||
cd flash-attention/
|
||||
pip install ninja
|
||||
|
||||
Hugging Face Transformers can easily deploy the CK Flash Attention 2 module by passing an argument
|
||||
``attn_implementation="flash_attention_2"`` in the ``from_pretrained`` class.
|
||||
# To install the CK backend flash attention
|
||||
python setup.py install
|
||||
|
||||
.. code-block:: python
|
||||
# To install the Triton backend flash attention
|
||||
FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" python setup.py install
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
model_name = "NousResearch/Meta-Llama-3-8B"
|
||||
# To install both CK and Triton backend flash attention
|
||||
FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE FLASH_ATTENTION_SKIP_CK_BUILD=FALSE python setup.py install
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype=torch.float16, use_fast=False)
|
||||
inputs = tokenizer('Today is', return_tensors='pt').to(device)
|
||||
For detailed installation instructions, see `Flash Attention <https://github.com/Dao-AILab/flash-attention>`_.
|
||||
|
||||
model_eager = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="eager").cuda(device)
|
||||
model_ckFAv2 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda(device)
|
||||
Benchmarking Flash Attention 2
|
||||
------------------------------
|
||||
|
||||
print("eager GQA: ", tokenizer.decode(model_eager.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
|
||||
print("ckFAv2 GQA: ", tokenizer.decode(model_ckFAv2.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
|
||||
Benchmark scripts to evaluate the performance of Flash Attention 2 are stored in the ``flash-attention/benchmarks/`` directory.
|
||||
|
||||
# eager GQA: Today is the day of the Lord, and we are the
|
||||
# ckFAv2 GQA: Today is the day of the Lord, and we are the
|
||||
To benchmark the CK backend
|
||||
|
||||
.. tab-item:: Triton Flash Attention 2
|
||||
.. code-block:: shell
|
||||
|
||||
The Triton Flash Attention 2 module is implemented in Python and uses OpenAI’s JIT compiler. This module has been
|
||||
upstreamed into the vLLM serving toolkit, discussed in :doc:'llm-inference-frameworks'.
|
||||
cd flash-attention/benchmarks
|
||||
pip install transformers einops ninja
|
||||
|
||||
1. To install Triton Flash Attention 2 and run the benchmark, use the following commands.
|
||||
python3 benchmark_flash_attention.py
|
||||
|
||||
.. code-block:: shell
|
||||
To benchmark the Triton backend
|
||||
|
||||
# Install from the source
|
||||
pip uninstall pytorch-triton-rocm triton -y
|
||||
git clone https://github.com/ROCm/triton.git
|
||||
cd triton/python
|
||||
GPU_ARCHS=gfx942 python setup.py install #MI300 series
|
||||
pip install matplotlib pandas
|
||||
.. code-block:: shell
|
||||
|
||||
2. To test, run the Triton Flash Attention 2 performance benchmark.
|
||||
FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" python3 benchmark_flash_attention.py
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Test the triton FA v2 kernel
|
||||
python https://github.com/ROCm/triton/blob/triton-mlir/python/perf-kernels/flash-attention.py
|
||||
# Results (Okay to release TFLOPS number ???)
|
||||
fused-attention-fwd-d128:
|
||||
BATCH HQ HK N_CTX_Q N_CTX_K TFLOPS
|
||||
0 16.0 16.0 16.0 1024.0 1024.0 287.528411
|
||||
1 8.0 16.0 16.0 2048.0 2048.0 287.490806
|
||||
2 4.0 16.0 16.0 4096.0 4096.0 345.966031
|
||||
3 2.0 16.0 16.0 8192.0 8192.0 361.369510
|
||||
4 1.0 16.0 16.0 16384.0 16384.0 356.873720
|
||||
5 2.0 48.0 48.0 1024.0 1024.0 216.916235
|
||||
6 2.0 48.0 48.0 2048.0 1024.0 271.027578
|
||||
7 2.0 48.0 48.0 4096.0 8192.0 337.367372
|
||||
8 2.0 48.0 48.0 8192.0 4096.0 363.481649
|
||||
9 2.0 48.0 48.0 16384.0 8192.0 375.013622
|
||||
10 8.0 16.0 16.0 1989.0 15344.0 321.791333
|
||||
11 4.0 16.0 16.0 4097.0 163.0 122.104888
|
||||
12 2.0 16.0 16.0 8122.0 2159.0 337.060283
|
||||
13 1.0 16.0 16.0 16281.0 7.0 5.234012
|
||||
14 2.0 48.0 48.0 1021.0 1020.0 214.657425
|
||||
15 2.0 48.0 48.0 2001.0 2048.0 314.429118
|
||||
16 2.0 48.0 48.0 3996.0 9639.0 330.411368
|
||||
17 2.0 48.0 48.0 8181.0 1021.0 324.614980
|
||||
Using Flash Attention 2
|
||||
-----------------------
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
model_name = "NousResearch/Llama-3.2-1B"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, dtype=torch.bfloat16, use_fast=False)
|
||||
inputs = tokenizer('Today is', return_tensors='pt').to(device)
|
||||
|
||||
model_eager = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, attn_implementation="eager").cuda(device)
|
||||
model_ckFAv2 = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, attn_implementation="flash_attention_2").cuda(device)
|
||||
model_eager.generation_config.pad_token_id = model_eager.generation_config.eos_token_id
|
||||
model_ckFAv2.generation_config.pad_token_id = model_ckFAv2.generation_config.eos_token_id
|
||||
|
||||
print("eager\n GQA: ", tokenizer.decode(model_eager.generate(**inputs, max_new_tokens=22, do_sample=False, num_beams=1)[0], skip_special_tokens=True))
|
||||
print("ckFAv2\n GQA: ", tokenizer.decode(model_ckFAv2.generate(**inputs, max_new_tokens=22, do_sample=False, num_beams=1)[0], skip_special_tokens=True))
|
||||
|
||||
The outputs from eager mode and FlashAttention-2 are identical, although their performance behavior differs.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
eager
|
||||
GQA: Today is the 10th anniversary of the 9/11 attacks. I remember that day like it was yesterday.
|
||||
ckFAv2
|
||||
GQA: Today is the 10th anniversary of the 9/11 attacks. I remember that day like it was yesterday.
|
||||
|
||||
xFormers
|
||||
========
|
||||
|
||||
@@ -0,0 +1,472 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker-1103:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
|
||||
prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X
|
||||
GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
|
||||
specifically for AMD data center GPUs and includes the following components:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-1103>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||
|
||||
* Enabled :ref:`AITER <vllm-optimization-aiter-switches>` by default.
|
||||
|
||||
* Fixed ``rms_norm`` segfault issue with Qwen 3 235B.
|
||||
|
||||
* Known performance degradation on Llama 4 models due to `an upstream vLLM issue <https://github.com/vllm-project/vllm/issues/26320>`_.
|
||||
|
||||
.. _vllm-benchmark-supported-models-1103:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-1103:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
documentation might vary by model -- select one to get started. MXFP4 models
|
||||
are only supported on MI355X and MI350X GPUs.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-1103:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
|
||||
{% if model.precision == "float4" %}
|
||||
.. important::
|
||||
|
||||
MXFP4 is supported only on MI355X and MI350X GPUs.
|
||||
{% endif %}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
{% if model.precision == "float4" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-1103:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before running inference.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-1103:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-1103` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node with the
|
||||
:literal:`{{model.precision}}` data type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-1103>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
|
||||
{% if model.tunableop %}
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||
operators to find the fastest one for your hardware.
|
||||
|
||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||
the ``--tunableop on`` argument in your run.
|
||||
|
||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||
performance-collection run.
|
||||
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-1103` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
For more information on configuration, see the `config files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
|
||||
in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
|
||||
for descriptions of available configuration options
|
||||
and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
|
||||
additional benchmarking information.
|
||||
|
||||
.. rubric:: Launch the container
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ docker.docker_hub_url }}>`_ as shown
|
||||
in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. rubric:: Throughput command
|
||||
|
||||
Use the following command to start the throughput benchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
num_prompts={{ model.config.num_prompts | default(1024) }}
|
||||
in={{ model.config.in | default(128) }}
|
||||
out={{ model.config.out | default(128) }}
|
||||
dtype={{ model.config.dtype | default("auto") }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm bench throughput --model $model \
|
||||
-tp $tp \
|
||||
--num-prompts $num_prompts \
|
||||
--input-len $in \
|
||||
--output-len $out \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--trust-remote-code \
|
||||
--output-json ${model}_throughput.json \
|
||||
--gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
|
||||
|
||||
.. rubric:: Serving command
|
||||
|
||||
1. Start the server using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
dtype={{ model.config.dtype }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs=256
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm serve $model \
|
||||
-tp $tp \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--no-enable-prefix-caching \
|
||||
--swap-space 16 \
|
||||
--disable-log-requests \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9
|
||||
|
||||
Wait until the model has loaded and the server is ready to accept requests.
|
||||
|
||||
2. On another terminal on the same machine, run the benchmark:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Connect to the container
|
||||
docker exec -it test bash
|
||||
|
||||
# Wait for the server to start
|
||||
until curl -s http://localhost:8000/v1/models; do sleep 30; done
|
||||
|
||||
# Run the benchmark
|
||||
model={{ model.model_repo }}
|
||||
max_concurrency=1
|
||||
num_prompts=10
|
||||
in=128
|
||||
out=128
|
||||
vllm bench serve --model $model \
|
||||
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||
--dataset-name random \
|
||||
--ignore-eos \
|
||||
--max-concurrency $max_concurrency \
|
||||
--num-prompts $num_prompts \
|
||||
--random-input-len $in \
|
||||
--random-output-len $out \
|
||||
--trust-remote-code \
|
||||
--save-result \
|
||||
--result-filename ${model}_serving.json
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
|
||||
try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
|
||||
|
||||
If you encounter the following error, pass your access-authorized Hugging
|
||||
Face token to the gated models.
|
||||
|
||||
.. code-block::
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Advanced usage
|
||||
==============
|
||||
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
If you’re using this Docker image on other AMD GPUs such as the AMD Instinct MI200 Series or Radeon, add ``export VLLM_ROCM_USE_AITER=0`` to your command, since AITER is only supported on gfx942 and gfx950 architectures.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
|
||||
To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
|
||||
|
||||
1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
|
||||
2. Use the following command to build the image directly from the specified commit.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
.. code-block:: shell
|
||||
|
||||
docker build -f docker/Dockerfile.rocm \
|
||||
--build-arg REMOTE_VLLM=1 \
|
||||
--build-arg VLLM_REPO=https://github.com/ROCm/vllm \
|
||||
--build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
|
||||
-t vllm-rocm .
|
||||
|
||||
.. tip::
|
||||
|
||||
Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
a brief introduction to vLLM and optimization strategies.
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||
@@ -16,15 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251024``
|
||||
(latest)
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210``
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.11.2
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <../vllm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.2_20251210/images/sha256-e7f02dd2ce3824959658bc0391296f6158638e3ebce164f6c019c4eca8150ec7>`__
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.11.1
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <../vllm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5>`__
|
||||
* :doc:`Documentation <vllm-0.11.1-20251103>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506>`__
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``
|
||||
-
|
||||
|
||||
@@ -0,0 +1,398 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
|
||||
prebuilt and optimized Docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
|
||||
xDiT diffusion inference
|
||||
************************
|
||||
|
||||
.. _xdit-video-diffusion-2510:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
|
||||
a prebuilt, optimized inference environment based on `xDiT
|
||||
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion-based
|
||||
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
|
||||
and MI300X (gfx942) GPUs.
|
||||
This image is based on ROCm {{docker.ROCm}} preview release via `TheRock <https://github.com/ROCm/TheRock>`_
|
||||
and includes the following software components:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
- Initial ROCm-enabled xDiT Docker release for diffusion inference.
|
||||
- Supported architectures: gfx942 and gfx950 (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X).
|
||||
- Supported workloads: Wan 2.1, Wan 2.2, HunyuanVideo, and Flux models.
|
||||
|
||||
.. _xdit-video-diffusion-supported-models-2510:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length == 1 %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
|
||||
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
|
||||
external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA
|
||||
auto-balancing, you can skip this step. Otherwise, complete the procedures in
|
||||
the `System validation and optimization
|
||||
<https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.html>`__
|
||||
guide to properly configure your system settings before starting.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
|
||||
======================
|
||||
|
||||
Once the image has been downloaded, you can follow these steps to
|
||||
run benchmarks and generate outputs.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups %}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models-2510` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-setup-2510:
|
||||
|
||||
Prepare the model
|
||||
-----------------
|
||||
|
||||
.. note::
|
||||
|
||||
If you're using ROCm MAD to :ref:`run your model
|
||||
<xdit-video-diffusion-run-2510>`, you can skip this section. MAD will handle
|
||||
starting the container and downloading required models inside the container.
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-run-2510:
|
||||
|
||||
Run inference
|
||||
=============
|
||||
|
||||
You can benchmark models through `MAD <https://github.com/ROCm/MAD>`__-integrated automation or standalone
|
||||
torchrun commands.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model tencent/HunyuanVideo \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd Wan2.1
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
|
||||
--image "/app/Wan2.1/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd Wan2.2
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-A14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
|
||||
--image "/app/Wan2.2/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model black-forest-labs/FLUX.1-dev \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 1 \
|
||||
--benchmark_output_directory results
|
||||
|
||||
{% endif %}
|
||||
|
||||
The generated output will be stored under the ``results`` directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``.{% elif model.model == "FLUX.1" %}``results/timing.json``.{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}You can also use ``run_usp.py``, which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see `AMD
|
||||
Infinity Hub
|
||||
<https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`__.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`xdit-history` to find documentation for previous releases
|
||||
of xDiT diffusion inference performance testing.
|
||||
@@ -0,0 +1,389 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
|
||||
prebuilt and optimized Docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
|
||||
xDiT diffusion inference
|
||||
************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm xDiT diffusion
|
||||
inference performance documentation. See
|
||||
:doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
|
||||
version.
|
||||
|
||||
.. _xdit-video-diffusion-2511:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
|
||||
benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
|
||||
The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
|
||||
and includes the following components:
|
||||
|
||||
.. dropdown:: Software components
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for item in docker.whats_new %}
|
||||
* {{ item }}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-supported-models-2511:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups %}
|
||||
|
||||
{# Create a lookup for supported models #}
|
||||
{% set supported_lookup = {} %}
|
||||
{% for supported in docker.supported_models %}
|
||||
{% set _ = supported_lookup.update({supported.group: supported.models}) %}
|
||||
{% endfor %}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% if model_group.group in supported_lookup %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% if model_group.group in supported_lookup %}
|
||||
{% set supported_models = supported_lookup[model_group.group] %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.model in supported_models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.page_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
|
||||
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
|
||||
external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
|
||||
======================
|
||||
|
||||
Once the image has been downloaded, you can follow these steps to
|
||||
run benchmarks and generate outputs.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% for model_group in data.xdit_diffusion_inference.model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.page_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models-2511` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Choose your setup method
|
||||
------------------------
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.page_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Run inference
|
||||
=============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.page_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model tencent/HunyuanVideo \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd Wan2.1
|
||||
mkdir results
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
|
||||
--image "/app/Wan2.1/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd Wan2.2
|
||||
mkdir results
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-A14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
|
||||
--image "/app/Wan2.2/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd Flux
|
||||
mkdir results
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model black-forest-labs/FLUX.1-dev \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 1 \
|
||||
--benchmark_output_directory results
|
||||
{% endif %}
|
||||
The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See
|
||||
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
|
||||
to find documentation for previous releases of xDiT diffusion inference
|
||||
performance testing.
|
||||
@@ -0,0 +1,411 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
|
||||
prebuilt and optimized Docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
|
||||
xDiT diffusion inference
|
||||
************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of the ROCm xDiT diffusion
|
||||
inference performance documentation. See
|
||||
:doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
|
||||
version.
|
||||
|
||||
.. _xdit-video-diffusion-2512:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
|
||||
a prebuilt, optimized environment based on `xDiT
|
||||
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
|
||||
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
|
||||
and MI300X (gfx942) GPUs.
|
||||
|
||||
The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
|
||||
and includes the following components:
|
||||
|
||||
.. dropdown:: Software components
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_data in docker.components.items() %}
|
||||
* - `{{ component_name }} <{{ component_data.url }}>`_
|
||||
- {{ component_data.version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for item in docker.whats_new %}
|
||||
* {{ item }}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-supported-models-2512:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in docker.supported_models %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.js_tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.js_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
|
||||
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
|
||||
external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
|
||||
======================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
Once the image has been downloaded, you can follow these steps to
|
||||
run benchmarks and generate outputs.
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.js_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models-2512` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Choose your setup method
|
||||
------------------------
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{model.js_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Run inference
|
||||
=============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.js_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd Wan
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Wan/run.py \
|
||||
--task i2v \
|
||||
--height 720 \
|
||||
--width 1280 \
|
||||
--model {{ model.model_repo }} \
|
||||
--img_file_path /app/Wan/i2v_input.JPG \
|
||||
--ulysses_degree 8 \
|
||||
--seed 42 \
|
||||
--num_frames 81 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--num_repetitions 1 \
|
||||
--num_inference_steps 40 \
|
||||
--use_torch_compile
|
||||
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd Wan
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Wan/run.py \
|
||||
--task i2v \
|
||||
--height 720 \
|
||||
--width 1280 \
|
||||
--model {{ model.model_repo }} \
|
||||
--img_file_path /app/Wan/i2v_input.JPG \
|
||||
--ulysses_degree 8 \
|
||||
--seed 42 \
|
||||
--num_frames 81 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--num_repetitions 1 \
|
||||
--num_inference_steps 40 \
|
||||
--use_torch_compile
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 50
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "stable-diffusion-3.5-large" %}
|
||||
cd StableDiffusion3.5
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--num_inference_steps 28 \
|
||||
--prompt "A capybara holding a sign that reads Hello World" \
|
||||
--use_torch_compile \
|
||||
--pipefusion_parallel_degree 4 \
|
||||
--use_cfg_parallel \
|
||||
--num_repetitions 50 \
|
||||
--dtype torch.float16 \
|
||||
--output_path results
|
||||
|
||||
{% endif %}
|
||||
|
||||
The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See
|
||||
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
|
||||
to find documentation for previous releases of xDiT diffusion inference
|
||||
performance testing.
|
||||
@@ -0,0 +1,47 @@
|
||||
:orphan:
|
||||
|
||||
************************************************************
|
||||
xDiT diffusion inference performance testing version history
|
||||
************************************************************
|
||||
|
||||
This table lists previous versions of the ROCm xDiT diffusion inference performance
|
||||
testing environment. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image tag
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.13`` (latest)
|
||||
-
|
||||
* TheRock 1728a81
|
||||
-
|
||||
* :doc:`Documentation <../../xdit-diffusion-inference>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef>`__
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.12``
|
||||
-
|
||||
* `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
|
||||
* TheRock 3e3f834
|
||||
-
|
||||
* :doc:`Documentation <xdit-25.12>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256>`__
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.11``
|
||||
-
|
||||
* `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
|
||||
* TheRock 3e3f834
|
||||
-
|
||||
* :doc:`Documentation <xdit-25.11>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216>`__
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.10``
|
||||
-
|
||||
* `ROCm 7.9.0 preview <https://rocm.docs.amd.com/en/7.9.0-preview/about/release-notes.html>`__
|
||||
* TheRock 7afbe45
|
||||
-
|
||||
* :doc:`Documentation <xdit-25.10>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e>`__
|
||||
@@ -6,7 +6,7 @@
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. _vllm-benchmark-unified-docker-1024:
|
||||
.. _vllm-benchmark-unified-docker-1210:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
@@ -34,21 +34,18 @@ vLLM inference performance testing
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-1024>` for
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-1210>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM
|
||||
Docker release <previous-versions/vllm-history>`.
|
||||
|
||||
* Enabled :ref:`AITER <vllm-optimization-aiter-switches>` by default.
|
||||
- Improved performance on Llama 3 MXFP4 through AITER optimizations and improved kernel fusion.
|
||||
|
||||
* Fixed ``rms_norm`` segfault issue with Qwen 3 235B.
|
||||
|
||||
* Known performance degradation on Llama 4 models due to `an upstream vLLM issue <https://github.com/vllm-project/vllm/issues/26320>`_.
|
||||
|
||||
.. _vllm-benchmark-supported-models-1024:
|
||||
.. _vllm-benchmark-supported-models-1210:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -58,7 +55,7 @@ Supported models
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-1024:
|
||||
.. _vllm-benchmark-available-models-1210:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
@@ -94,7 +91,7 @@ Supported models
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-1024:
|
||||
.. _vllm-benchmark-vllm-1210:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -108,6 +105,15 @@ Supported models
|
||||
MXFP4 is supported only on MI355X and MI350X GPUs.
|
||||
{% endif %}
|
||||
|
||||
{% if model.mad_tag in ["pyt_vllm_mixtral-8x7b", "pyt_vllm_mixtral-8x7b_fp8", "pyt_vllm_mixtral-8x22b", "pyt_vllm_mixtral-8x22b_fp8", "pyt_vllm_deepseek-r1"] %}
|
||||
.. caution::
|
||||
|
||||
There is a known regression with AITER for MoE models such as Mixtral and
|
||||
DeepSeek-R1. Consider using the :doc:`previous release
|
||||
<previous-versions/vllm-0.11.1-20251103>`
|
||||
``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103`` for better performance.
|
||||
{% endif %}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
@@ -122,7 +128,7 @@ Supported models
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-1024:
|
||||
.. _vllm-benchmark-performance-measurements-1210:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -178,7 +184,7 @@ Benchmarking
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-1024:
|
||||
.. _vllm-benchmark-mad-1210:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -190,7 +196,7 @@ Benchmarking
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model.
|
||||
See :ref:`vllm-benchmark-supported-models-1210` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -219,7 +225,7 @@ Benchmarking
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-1024>` are preconfigured to collect
|
||||
<vllm-benchmark-available-models-1210>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
@@ -244,7 +250,7 @@ Benchmarking
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model.
|
||||
See :ref:`vllm-benchmark-supported-models-1210` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
@@ -438,6 +444,14 @@ To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
|
||||
|
||||
Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
There is a known regression with AITER for MoE models such as Mixtral and
|
||||
DeepSeek-R1. Consider using the :doc:`previous release
|
||||
<previous-versions/vllm-0.11.1-20251103>`
|
||||
(``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``) for better performance.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -26,4 +26,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
|
||||
|
||||
- :doc:`SGLang inference performance testing <benchmark-docker/sglang>`
|
||||
|
||||
- :doc:`xDiT diffusion inference <xdit-diffusion-inference>`
|
||||
|
||||
- :doc:`Deploying your model <deploy-your-model>`
|
||||
|
||||
462
docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
Normal file
462
docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
Normal file
@@ -0,0 +1,462 @@
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
|
||||
prebuilt and optimized docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
|
||||
xDiT diffusion inference
|
||||
************************
|
||||
|
||||
.. _xdit-video-diffusion:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
|
||||
a prebuilt, optimized environment based on `xDiT
|
||||
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
|
||||
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
|
||||
and MI300X (gfx942) GPUs.
|
||||
|
||||
The image runs a preview version of ROCm using the new `TheRock
|
||||
<https://github.com/ROCm/TheRock>`__ build system and includes the following
|
||||
components:
|
||||
|
||||
.. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_data in docker.components.items() %}
|
||||
* - `{{ component_name }} <{{ component_data.url }}>`_
|
||||
- {{ component_data.version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
|
||||
==========
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for item in docker.whats_new %}
|
||||
* {{ item }}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-supported-models:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in docker.supported_models %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.js_tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.js_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
|
||||
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
|
||||
external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the `Performance results with AMD ROCm software
|
||||
<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in `Performance results with AMD ROCm
|
||||
software
|
||||
<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance
|
||||
achievable by AMD Instinct GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
|
||||
======================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
Once the image has been downloaded, you can follow these steps to
|
||||
run benchmarks and generate outputs.
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.js_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Choose your setup method
|
||||
------------------------
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{model.js_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Run inference
|
||||
=============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.js_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd /app/Wan
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Wan/run.py \
|
||||
--task i2v \
|
||||
--height 720 \
|
||||
--width 1280 \
|
||||
--model {{ model.model_repo }} \
|
||||
--img_file_path /app/Wan/i2v_input.JPG \
|
||||
--ulysses_degree 8 \
|
||||
--seed 42 \
|
||||
--num_frames 81 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--num_repetitions 1 \
|
||||
--num_inference_steps 40 \
|
||||
--use_torch_compile
|
||||
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd /app/Wan
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Wan/run.py \
|
||||
--task i2v \
|
||||
--height 720 \
|
||||
--width 1280 \
|
||||
--model {{ model.model_repo }} \
|
||||
--img_file_path /app/Wan/i2v_input.JPG \
|
||||
--ulysses_degree 8 \
|
||||
--seed 42 \
|
||||
--num_frames 81 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--num_repetitions 1 \
|
||||
--num_inference_steps 40 \
|
||||
--use_torch_compile
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd /app/Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 50
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1 Kontext" %}
|
||||
cd /app/Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--seed 42 \
|
||||
--prompt "Add a cool hat to the cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 30 \
|
||||
--max_sequence_length 512 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--img_file_path /app/Flux/cat.png \
|
||||
--model_type flux_kontext \
|
||||
--guidance_scale 2.5 \
|
||||
--num_repetitions 25
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.2" %}
|
||||
cd /app/Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--seed 42 \
|
||||
--prompt "Add a cool hat to the cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 50 \
|
||||
--max_sequence_length 512 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--img_file_paths /app/Flux/cat.png \
|
||||
--model_type flux2 \
|
||||
--guidance_scale 4.0 \
|
||||
--num_repetitions 25
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "stable-diffusion-3.5-large" %}
|
||||
cd /app/StableDiffusion3.5
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--num_inference_steps 28 \
|
||||
--prompt "A capybara holding a sign that reads Hello World" \
|
||||
--use_torch_compile \
|
||||
--pipefusion_parallel_degree 4 \
|
||||
--use_cfg_parallel \
|
||||
--num_repetitions 50 \
|
||||
--dtype torch.float16 \
|
||||
--output_path results
|
||||
|
||||
{% endif %}
|
||||
|
||||
The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large"%}benchmark_results.csv{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py``, which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases
|
||||
of xDiT diffusion inference performance testing.
|
||||
@@ -33,18 +33,15 @@ It includes the following software components:
|
||||
- {{ component_version }}
|
||||
|
||||
{% endfor %}
|
||||
{% if jax_version == "0.6.0" %}
|
||||
.. note::
|
||||
|
||||
Shardy is a new config in JAX 0.6.0. You might get related errors if it's
|
||||
not configured correctly. For now you can turn it off by setting
|
||||
``shardy=False`` during the training run. You can also follow the `migration
|
||||
guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
|
||||
it.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
The ``rocm/jax-training:maxtext-v25.9`` has been updated to
|
||||
``rocm/jax-training:maxtext-v25.9.1``. This revision should include
|
||||
a fix to address segmentation fault issues during launch. See the
|
||||
:doc:`versioned documentation <previous-versions/jax-maxtext-v25.9>`.
|
||||
|
||||
MaxText on ROCm provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
@@ -57,7 +54,7 @@ MaxText with on ROCm provides the following key features to train large language
|
||||
|
||||
- NANOO FP8 (for MI300X series GPUs) and FP8 (for MI355X and MI350X) quantization support
|
||||
|
||||
.. _amd-maxtext-model-support-v259:
|
||||
.. _amd-maxtext-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -139,7 +136,7 @@ Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
.. _amd-maxtext-multi-node-setup-v259:
|
||||
.. _amd-maxtext-multi-node-setup-v25.11:
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
@@ -147,7 +144,7 @@ Multi-node configuration
|
||||
See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
|
||||
environment for multi-node training.
|
||||
|
||||
.. _amd-maxtext-get-started-v259:
|
||||
.. _amd-maxtext-get-started-v25.11:
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
@@ -172,7 +169,7 @@ benchmark results:
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-maxtext-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-maxtext-model-support-v25.11` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -203,7 +200,7 @@ benchmark results:
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}. See
|
||||
:ref:`amd-maxtext-model-support-v259` to switch to another
|
||||
:ref:`amd-maxtext-model-support-v25.11` to switch to another
|
||||
available model. Some instructions and resources might not be
|
||||
available for all models and configurations.
|
||||
|
||||
@@ -325,15 +322,67 @@ benchmark results:
|
||||
|
||||
sbatch -N <num_nodes> {{ model.multinode_training_script }}
|
||||
|
||||
.. rubric:: Profiling with rocprofv3
|
||||
|
||||
If you need to collect a trace and the JAX profiler isn't working, use ``rocprofv3`` provided by the :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>` as a workaround. For example:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
rocprofv3 \
|
||||
--hip-trace \
|
||||
--kernel-trace \
|
||||
--memory-copy-trace \
|
||||
--rccl-trace \
|
||||
--output-format pftrace \
|
||||
-d ./v3_traces \
|
||||
-- ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} # or desired command
|
||||
|
||||
You can set the directory where you want the .json traces to be
|
||||
saved using ``-d <TRACE_DIRECTORY>``. The resulting traces can be
|
||||
opened in Perfetto: `<https://ui.perfetto.dev/>`__.
|
||||
|
||||
{% else %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v259`
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v25.11`
|
||||
with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
- Minor performance regression (< 4%) for BF16 quantization in Llama models and Mixtral 8x7b.
|
||||
|
||||
- You might see minor loss spikes, or the loss curve may have slightly higher
|
||||
convergence end values compared to the previous ``jax-training`` image.
|
||||
|
||||
- For FP8 training on MI355, many models will display a warning message like:
|
||||
``Warning: Latency not found for MI_M=16, MI_N=16, MI_K=128,
|
||||
mi_input_type=BFloat8Float8_fnuz. Returning latency value of 32 (really
|
||||
slow).`` The compile step may take longer than usual, but training will run.
|
||||
This will be fixed in a future release.
|
||||
|
||||
- The built-in JAX profiler isn't working. If you need to collect a trace and
|
||||
the JAX profiler isn't working, use ``rocprofv3`` provided by the
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>` as a workaround. For example:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
rocprofv3 \
|
||||
--hip-trace \
|
||||
--kernel-trace \
|
||||
--memory-copy-trace \
|
||||
--rccl-trace \
|
||||
--output-format pftrace \
|
||||
-d ./v3_traces \
|
||||
-- ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} # or desired command
|
||||
|
||||
You can set the directory where you want the .json traces to be
|
||||
saved using ``-d <TRACE_DIRECTORY>``. The resulting traces can be
|
||||
opened in Perfetto: `<https://ui.perfetto.dev/>`__.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -36,12 +36,10 @@ accelerate training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -49,12 +47,12 @@ accelerate training workloads:
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
.. _amd-megatron-lm-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -99,7 +97,7 @@ accelerate training workloads:
|
||||
Some models, such as Llama, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
.. _amd-megatron-lm-performance-measurements:
|
||||
.. _amd-megatron-lm-performance-measurements-v25.11:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -131,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
.. _mi300x-amd-megatron-lm-training-v25.11:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
@@ -140,52 +138,38 @@ Use the following instructions to set up the environment, configure the script t
|
||||
reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements:
|
||||
.. _amd-megatron-lm-requirements-v25.11:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = data.docker %}
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 128G \
|
||||
--name megatron_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 128G \
|
||||
--name megatron_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||
|
||||
@@ -206,7 +190,7 @@ Download the Docker image
|
||||
The Docker container hosts a verified commit of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
.. _amd-megatron-lm-environment-setup-v25.11:
|
||||
|
||||
Configuration
|
||||
=============
|
||||
@@ -216,39 +200,39 @@ Configuration
|
||||
Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
|
||||
Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
|
||||
Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. note::
|
||||
|
||||
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars>` for more information on configuration options.
|
||||
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v25.11>` for more information on configuration options.
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
@@ -256,7 +240,7 @@ Multi-node configuration
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.
|
||||
|
||||
.. _amd-megatron-lm-tokenizer:
|
||||
.. _amd-megatron-lm-tokenizer-v25.11:
|
||||
|
||||
Tokenizer
|
||||
---------
|
||||
@@ -393,7 +377,7 @@ Download the dataset
|
||||
|
||||
``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
|
||||
Remember to either pre-download the tokenizer or set up Hugging Face access
|
||||
otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer>` section.
|
||||
otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v25.11>` section.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -495,15 +479,38 @@ Download the dataset
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. _amd-megatron-lm-run-training:
|
||||
.. _amd-megatron-lm-run-training-v25.11:
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Use the following example commands to set up the environment, configure
|
||||
:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
|
||||
:ref:`key options <amd-megatron-lm-benchmark-test-vars-v25.11>`, and run training on
|
||||
MI300X Series GPUs with the AMD Megatron-LM environment.
|
||||
|
||||
Before starting training, export the following environment variables.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
|
||||
.. tab-item:: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
|
||||
# Set this on MI325X/MI300X only
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
Single node training
|
||||
--------------------
|
||||
|
||||
@@ -913,7 +920,7 @@ Single node training
|
||||
RECOMPUTE_ACTIVATIONS=full \
|
||||
CKPT_FORMAT=torch_dist
|
||||
|
||||
.. _amd-megatron-lm-multi-node-examples:
|
||||
.. _amd-megatron-lm-multi-node-examples-v25.11:
|
||||
|
||||
Multi-node training examples
|
||||
----------------------------
|
||||
@@ -964,7 +971,7 @@ training on 16 nodes, try the following command:
|
||||
|
||||
sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
.. _amd-megatron-lm-benchmark-test-vars-v25.11:
|
||||
|
||||
Key options
|
||||
-----------
|
||||
@@ -1029,11 +1036,6 @@ The benchmark tests support the following sets of variables.
|
||||
``RECOMPUTE_NUM_LAYERS``
|
||||
Number of layers used for checkpointing recompute.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
|
||||
@@ -17,13 +17,22 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - 25.9 (latest)
|
||||
* - 25.11
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* JAX 0.7.1
|
||||
-
|
||||
* :doc:`Documentation <../jax-maxtext>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.11/images/sha256-18e4d8f0b8ce7a7422c58046940dd5f32249960449fca09a562b65fb8eb1562a>`__
|
||||
|
||||
* - 25.9.1
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* JAX 0.6.2
|
||||
-
|
||||
* :doc:`Documentation <../jax-maxtext>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
|
||||
* :doc:`Documentation <jax-maxtext-v25.9>`
|
||||
* `Docker Hub (25.9.1) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9.1/images/sha256-60946cfbd470f6ee361fc9da740233a4fb2e892727f01719145b1f7627a1cff6>`__
|
||||
* `Docker Hub (25.9) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9/images/sha256-4bb16ab58279ef09cb7a5e362c38e3fe3f901de44d8dbac5d0cb3bac5686441e>`__
|
||||
|
||||
* - 25.7
|
||||
-
|
||||
|
||||
@@ -24,7 +24,7 @@ provides a prebuilt environment for training on AMD Instinct MI300X and MI325X G
|
||||
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
||||
It includes the following software components:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
@@ -80,7 +80,7 @@ series GPUs. Some instructions, commands, and available training
|
||||
configurations in this documentation might vary by model -- select one to get
|
||||
started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
@@ -144,7 +144,7 @@ Pull the Docker image
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
@@ -177,7 +177,7 @@ Benchmarking
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
|
||||
|
||||
.. _vllm-benchmark-mad:
|
||||
|
||||
|
||||
@@ -0,0 +1,365 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using JAX MaxText for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
|
||||
|
||||
******************************************
|
||||
Training a model with JAX MaxText on ROCm
|
||||
******************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm JAX MaxText
|
||||
training performance documentation. See :doc:`../jax-maxtext` for the latest version.
|
||||
|
||||
.. note::
|
||||
|
||||
We have refreshed the ``rocm/jax-training:maxtext-v25.9`` image as
|
||||
``rocm/jax-training:maxtext-v25.9.1``. This should include a fix to address
|
||||
segmentation fault issues during launch.
|
||||
|
||||
The MaxText for ROCm training Docker image
|
||||
provides a prebuilt environment for training on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs,
|
||||
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
||||
It includes the following software components:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
|
||||
{% endfor %}
|
||||
{% if jax_version == "0.6.0" %}
|
||||
.. note::
|
||||
|
||||
Shardy is a new config in JAX 0.6.0. You might get related errors if it's
|
||||
not configured correctly. For now you can turn it off by setting
|
||||
``shardy=False`` during the training run. You can also follow the `migration
|
||||
guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
|
||||
it.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
MaxText on ROCm provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- Flash Attention (FA) 3 -- with or without sequence input packing
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Multi-node support
|
||||
|
||||
- NANOO FP8 (for MI300X series GPUs) and FP8 (for MI355X and MI350X) quantization support
|
||||
|
||||
.. _amd-maxtext-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on AMD Instinct
|
||||
GPUs. Some instructions, commands, and available training
|
||||
configurations in this documentation might vary by model -- select one to get
|
||||
started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
as follows. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
Pull the Docker image
|
||||
---------------------
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
.. _amd-maxtext-multi-node-setup-v259:
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
|
||||
See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
|
||||
environment for multi-node training.
|
||||
|
||||
.. _amd-maxtext-get-started-v259:
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
|
||||
|
||||
.. _vllm-benchmark-mad:
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% if model.mad_tag and "single-node" in model.doc_options %}
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-maxtext-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. Use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/perf.csv``.
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}. See
|
||||
:ref:`amd-maxtext-model-support-v259` to switch to another
|
||||
available model. Some instructions and resources might not be
|
||||
available for all models and configurations.
|
||||
|
||||
.. rubric:: Download the Docker image and required scripts
|
||||
|
||||
Run the JAX MaxText benchmark tool independently by starting the
|
||||
Docker container as shown in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
{% if model.model_repo and "single-node" in model.doc_options %}
|
||||
.. rubric:: Single node training
|
||||
|
||||
1. Set up environment variables.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
|
||||
export HF_HOME=<Location of saved/cached Hugging Face models>
|
||||
|
||||
``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
|
||||
See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
|
||||
|
||||
``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
|
||||
If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
|
||||
Downloaded files typically get cached to ``~/.cache/huggingface``.
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device=/dev/dri \
|
||||
--device=/dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
-v $HF_HOME:/hf_cache \
|
||||
-e HF_HOME=/hf_cache \
|
||||
-e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
3. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/jax-maxtext
|
||||
|
||||
4. Run the setup scripts to install libraries and datasets needed
|
||||
for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
|
||||
|
||||
5. To run the training benchmark without quantization, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
|
||||
|
||||
For quantized training, run the script with the appropriate option for your Instinct GPU.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
|
||||
For ``fp8`` quantized training on MI355X and MI350X GPUs, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q fp8
|
||||
|
||||
{% if model.model_repo not in ["Llama-3.1-70B", "Llama-3.3-70B"] %}
|
||||
.. tab-item:: MI325X and MI300X
|
||||
|
||||
For ``nanoo_fp8`` quantized training on MI300X series GPUs, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
|
||||
{% endif %}
|
||||
|
||||
{% endif %}
|
||||
{% if model.multinode_training_script and "multi-node" in model.doc_options %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
The following examples use SLURM to run on multiple nodes.
|
||||
|
||||
.. note::
|
||||
|
||||
The following scripts will launch the Docker container and run the
|
||||
benchmark. Run them outside of any Docker container.
|
||||
|
||||
1. Make sure ``$HF_HOME`` is set before running the test. See
|
||||
`ROCm benchmarking <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/readme.md>`__
|
||||
for more details on downloading the Llama models before running the
|
||||
benchmark.
|
||||
|
||||
2. To run multi-node training for {{ model.model }},
|
||||
use the
|
||||
`multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
|
||||
under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
|
||||
|
||||
3. Run the multi-node training benchmark script.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sbatch -N <num_nodes> {{ model.multinode_training_script }}
|
||||
|
||||
{% else %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v259`
|
||||
with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`jax-maxtext-history` to find documentation for previous releases
|
||||
of the ``ROCm/jax-training`` Docker image.
|
||||
@@ -16,14 +16,32 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - v25.9 (latest)
|
||||
* - v25.11
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <../primus-megatron>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.10
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <primus-megatron-v25.10>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.10>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.9
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* Primus 0.3.0
|
||||
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <../primus-megatron>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
|
||||
* :doc:`Primus Megatron documentation <primus-megatron-v25.9>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.9>`
|
||||
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
|
||||
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,448 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
****************************************
|
||||
Training a model with Primus and PyTorch
|
||||
****************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm Primus PyTorch training
|
||||
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
|
||||
|
||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
||||
LLM training framework designed to streamline training. It streamlines LLM
|
||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
||||
Primus now supports the PyTorch torchtitan backend.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <pytorch-training>` workflow. See
|
||||
:doc:`pytorch-training` to see steps to run workloads without Primus.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
|
||||
MI300X GPUs containing essential components for Primus and PyTorch training
|
||||
with Primus Turbo optimizations.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v2510:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. seealso::
|
||||
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the documentation :doc:`pytorch-training` (without Primus).
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v2510:
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ data.docker.pull_tag }}
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between the following two workflows to start benchmarking training.
|
||||
For fine-tuning workloads and multi-node training examples, see :doc:`pytorch-training` (without Primus).
|
||||
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
|
||||
tweak some configurations (such as batch sizes).
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Primus benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To get started, navigate to the ``Primus`` directory in your container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/Primus
|
||||
|
||||
Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
|
||||
included with Primus with the appropriate options.
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-8b
|
||||
|
||||
Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 4
|
||||
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 5
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 4
|
||||
|
||||
To train Llama 3.1 70B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 3
|
||||
|
||||
.. container:: model-doc primus_pyt_train_deepseek-v2
|
||||
|
||||
Use the following command to train DeepSeek V2 16B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 16
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 10
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
|
||||
To train DeepSeek V2 16B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 16
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
|
||||
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -0,0 +1,574 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
****************************************
|
||||
Training a model with Primus and PyTorch
|
||||
****************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm Primus PyTorch training
|
||||
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
|
||||
|
||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
||||
LLM training framework designed to streamline training. It streamlines LLM
|
||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
||||
Primus now supports the PyTorch torchtitan backend.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
|
||||
:doc:`../pytorch-training` for steps to run workloads without Primus.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
|
||||
MI300X GPUs containing essential components for Primus and PyTorch training
|
||||
with Primus Turbo optimizations.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. seealso::
|
||||
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the documentation :doc:`../pytorch-training` (without Primus).
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v259:
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Once the setup is complete, choose one of the following workflows to start benchmarking training.
|
||||
For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
|
||||
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
|
||||
tweak some configurations (such as batch sizes).
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, Primus torchtitan models are run with Primus Turbo
|
||||
enabled for enhanced performance. To disable Primus Turbo,
|
||||
modify the respective configuration file
|
||||
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Primus benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To get started, navigate to the ``Primus`` directory in your container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/Primus
|
||||
|
||||
Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
|
||||
included with Primus with the appropriate options.
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-8b
|
||||
|
||||
Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
To train Llama 3.1 70B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 3
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone torchtitan benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. Navigate to the ``torchtitan`` workspace directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/torchtitan
|
||||
|
||||
.. rubric:: Download the tokenizer
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Download the tokenizer for your model.
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 scripts/download_tokenizer.py \
|
||||
--repo_id {{ model.model_repo }} \
|
||||
--tokenizer_path "original" \
|
||||
--hf_token=${HF_TOKEN}
|
||||
|
||||
.. rubric:: Pretraining examples
|
||||
|
||||
Run the training script with the appropriate configuration file.
|
||||
|
||||
To train with BF16 precision, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.bf16 }} \
|
||||
./run_train.sh
|
||||
|
||||
To train with FP8 precision, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.fp8 }} \
|
||||
./run_train.sh
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
|
||||
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -16,14 +16,32 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - v25.9 (latest)
|
||||
* - v25.11
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
|
||||
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.10
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.10>`
|
||||
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.10>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.9
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* Primus 0.3.0
|
||||
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
|
||||
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
|
||||
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.9>`
|
||||
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.9>`
|
||||
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
|
||||
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
|
||||
|
||||
|
||||
@@ -0,0 +1,669 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch on ROCm
|
||||
**************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of the ROCm PyTorch training
|
||||
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
||||
|
||||
See :doc:`../primus-pytorch` for details.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
The PyTorch for ROCm training Docker image provides a prebuilt optimized
|
||||
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
|
||||
and MI300X GPUs. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support-v2510:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct
|
||||
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
|
||||
training recommendations in this documentation might vary by model -- select
|
||||
one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes-v2510:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. dropdown:: Supported training modes
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- Supported training modes
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.training_modes %}
|
||||
* - {{ model.model }}
|
||||
- ``{{ model.training_modes | join('``, ``') }}``
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
Some model and fine-tuning combinations are not listed. This is
|
||||
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
|
||||
doesn't provide default YAML configurations for them.
|
||||
For advanced usage, you can create a custom configuration to enable
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements-v2510:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. note::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn't test configurations and run conditions outside those described.
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`__
|
||||
|
||||
* - ``tiktoken``
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`__
|
||||
|
||||
* - ``blobfile``
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`__
|
||||
|
||||
* - ``tabulate``
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`__
|
||||
|
||||
* - ``wandb``
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`__
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
.. container:: model-doc pyt_train_flux
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||
|
||||
* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"pretrain": "Benchmark pre-training.",
|
||||
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To start the pre-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
{% if model.mad_tag == "pyt_train_dlrm" %}
|
||||
|
||||
1. Go to the DLRM directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/DLRMBenchmark
|
||||
|
||||
2. To run the single node training benchmark for DLRM-v2 with TF32 precision,
|
||||
run the following script.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./launch_training_single_node.sh
|
||||
|
||||
To run with MAD within the Docker container, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -m DLRM
|
||||
|
||||
{% else %}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
{% if model.mad_tag == "pyt_train_flux" %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, FLUX models are not supported out of the box in this Docker image.
|
||||
To use FLUX, refer to ``rocm/pytorch-training`` Docker: :doc:`pytorch-training-v25.6`
|
||||
|
||||
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||
error, manually download it from Hugging Face at
|
||||
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
|
||||
the required dataset.
|
||||
{% endif %}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
- Sequence length for the language model.
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"posttrain": "Benchmark post-training.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Post-training
|
||||
|
||||
To start the post-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
- Sequence length for the language model.
|
||||
{% endif %}
|
||||
|
||||
{% set training_mode_descs = {
|
||||
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
|
||||
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
|
||||
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
|
||||
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
|
||||
{% if available_modes %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Fine-tuning
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v2510>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
|
||||
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 16384.
|
||||
- Sequence length for the language model.
|
||||
|
||||
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
|
||||
.. note::
|
||||
|
||||
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
|
||||
use the following torchtune commit for compatibility:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
|
||||
|
||||
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
|
||||
.. note::
|
||||
|
||||
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
|
||||
input tensor should be smaller than max_seq_len (4096)``.
|
||||
This error indicates that an input sequence is longer than the model's maximum context window.
|
||||
|
||||
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
|
||||
tokens in this case). You can resolve this by truncating the input or splitting
|
||||
it into smaller chunks before passing it to the model.
|
||||
|
||||
Note on reproducibility: The results in this guide are based on
|
||||
commit ``b4c98ac`` from the upstream
|
||||
`<https://github.com/pytorch/torchtune>`__ repository. For the
|
||||
latest updates, you can use the main branch.
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples-v2510:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
|
||||
|
||||
Pre-training
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch run_slurm_train.sh
|
||||
|
||||
Fine-tuning
|
||||
~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli login # Get access to HF Llama model space
|
||||
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch Torchtune_Multinode.sh
|
||||
|
||||
.. note::
|
||||
|
||||
Information regarding benchmark setup:
|
||||
|
||||
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
|
||||
* You can adjust the torchtune `YAML configuration file
|
||||
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
|
||||
if you're using a different model.
|
||||
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
|
||||
* Set the ``mounting_paths`` inside the SLURM script.
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -240,7 +240,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html>`_
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||
|
||||
@@ -0,0 +1,667 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch on ROCm
|
||||
**************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of the ROCm PyTorch training
|
||||
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
||||
|
||||
See :doc:`../primus-pytorch` for details.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
The PyTorch for ROCm training Docker image provides a prebuilt optimized
|
||||
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
|
||||
and MI300X GPUs. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct
|
||||
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
|
||||
training recommendations in this documentation might vary by model -- select
|
||||
one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes-v259:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. dropdown:: Supported training modes
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- Supported training modes
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.training_modes %}
|
||||
* - {{ model.model }}
|
||||
- ``{{ model.training_modes | join('``, ``') }}``
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
Some model and fine-tuning combinations are not listed. This is
|
||||
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
|
||||
doesn't provide default YAML configurations for them.
|
||||
For advanced usage, you can create a custom configuration to enable
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements-v259:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. note::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn't test configurations and run conditions outside those described.
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`__
|
||||
|
||||
* - ``tiktoken``
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`__
|
||||
|
||||
* - ``blobfile``
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`__
|
||||
|
||||
* - ``tabulate``
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`__
|
||||
|
||||
* - ``wandb``
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`__
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
.. container:: model-doc pyt_train_flux
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||
|
||||
* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"pretrain": "Benchmark pre-training.",
|
||||
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Pre-training
|
||||
|
||||
To start the pre-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
{% if model.mad_tag == "pyt_train_flux" %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, FLUX models are not supported out-of-the-box on this Docker image.
|
||||
To use FLUX, refer to the ``rocm/pytorch-training`` Docker image documentation: :doc:`previous-versions/pytorch-training-v25.6`.
|
||||
|
||||
Occasionally, downloading the FLUX dataset might fail. In the event of this
|
||||
error, manually download it from Hugging Face at
|
||||
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
|
||||
the required dataset.
|
||||
{% endif %}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
- Sequence length for the language model.
|
||||
{% endif %}
|
||||
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"posttrain": "Benchmark post-training.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Post-training
|
||||
|
||||
To start the post-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
- Sequence length for the language model.
|
||||
{% endif %}
|
||||
|
||||
{% set training_mode_descs = {
|
||||
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
|
||||
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
|
||||
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
|
||||
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
|
||||
{% if available_modes %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Fine-tuning
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
|
||||
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 16384.
|
||||
- Sequence length for the language model.
|
||||
|
||||
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
|
||||
.. note::
|
||||
|
||||
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
|
||||
use the following torchtune commit for compatibility:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
|
||||
|
||||
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
|
||||
.. note::
|
||||
|
||||
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
|
||||
input tensor should be smaller than max_seq_len (4096)``.
|
||||
This error indicates that an input sequence is longer than the model's maximum context window.
|
||||
|
||||
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
|
||||
tokens in this case). You can resolve this by truncating the input or splitting
|
||||
it into smaller chunks before passing it to the model.
|
||||
|
||||
Note on reproducibility: The results in this guide are based on
|
||||
commit ``b4c98ac`` from the upstream
|
||||
`<https://github.com/pytorch/torchtune>`__ repository. For the
|
||||
latest updates, you can use the main branch.
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples-v259:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
|
||||
|
||||
Pre-training
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch run_slurm_train.sh
|
||||
|
||||
Fine-tuning
|
||||
~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli login # Get access to HF Llama model space
|
||||
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch Torchtune_Multinode.sh
|
||||
|
||||
.. note::
|
||||
|
||||
Information regarding benchmark setup:
|
||||
|
||||
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
|
||||
* You can adjust the torchtune `YAML configuration file
|
||||
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
|
||||
if you're using a different model.
|
||||
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
|
||||
* Set the ``mounting_paths`` inside the SLURM script.
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``rocm/pytorch-training`` Docker image.
|
||||
@@ -31,12 +31,10 @@ Megatron-LM.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -44,13 +42,12 @@ Megatron-LM.
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-megatron-lm-model-support-v259:
|
||||
.. _amd-primus-megatron-lm-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -111,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-primus-megatron-lm-training-v259:
|
||||
.. _mi300x-amd-primus-megatron-lm-training-v25.11:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
@@ -121,69 +118,55 @@ Environment setup
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on AMD Instinct GPUs.
|
||||
|
||||
.. _amd-primus-megatron-lm-requirements-v259:
|
||||
.. _amd-primus-megatron-lm-requirements-v25.11:
|
||||
|
||||
Pull the Docker image
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = data.docker %}
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
--shm-size 128G \
|
||||
--name primus_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
--shm-size 128G \
|
||||
--name primus_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
.. code-block:: shell
|
||||
|
||||
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||
docker start primus_training_env
|
||||
docker exec -it primus_training_env bash
|
||||
|
||||
.. code-block:: shell
|
||||
The Docker container hosts verified commit ``c4c083de`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.
|
||||
|
||||
docker start primus_training_env
|
||||
docker exec -it primus_training_env bash
|
||||
|
||||
The Docker container hosts verified commit ``e16b27b`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/e16b27b>`__ repository.
|
||||
|
||||
.. _amd-primus-megatron-lm-environment-setup-v259:
|
||||
.. _amd-primus-megatron-lm-environment-setup-v25.11:
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
Primus defines a training configuration in YAML for each model in
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/examples/megatron/configs>`__.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
@@ -224,7 +207,7 @@ You can use either mock data or real data for training.
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. _amd-primus-megatron-lm-tokenizer-v259:
|
||||
.. _amd-primus-megatron-lm-tokenizer-v25.11:
|
||||
|
||||
Tokenizer
|
||||
---------
|
||||
@@ -245,7 +228,7 @@ right permissions to access the tokenizer for each model.
|
||||
<https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
|
||||
definition.
|
||||
|
||||
.. _amd-primus-megatron-lm-run-training-v259:
|
||||
.. _amd-primus-megatron-lm-run-training-v25.11:
|
||||
|
||||
Run training
|
||||
============
|
||||
@@ -269,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.3 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.3 70B BF16, run:
|
||||
|
||||
@@ -280,28 +263,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 6 \
|
||||
--global_batch_size 48 \
|
||||
EXP=examples/megatron/configs/MI355X/llama3.3_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 16
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 8B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.1 8B FP8, run:
|
||||
|
||||
@@ -312,22 +294,21 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 512 \
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
For Llama 3.1 8B BF16, use the following command:
|
||||
|
||||
@@ -338,26 +319,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 512 \
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.1 70B BF16, run:
|
||||
|
||||
@@ -368,20 +350,21 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 32
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
To run the training on a single node for Llama 3.1 70B FP8, use the following command.
|
||||
|
||||
@@ -398,20 +381,20 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--micro_batch_size 3 \
|
||||
--global_batch_size 24
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 40 \
|
||||
@@ -422,7 +405,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 2 7B FP8, run:
|
||||
|
||||
@@ -433,22 +416,21 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
--micro_batch_size 13 \
|
||||
--global_batch_size 416
|
||||
EXP=examples/megatron/configs/MI355X/llama2_7B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
To run pre-training for Llama 2 7B BF16, run:
|
||||
|
||||
@@ -459,26 +441,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 10 \
|
||||
--global_batch_size 640
|
||||
EXP=examples/megatron/configs/MI355X/llama2_7B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_7B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 2 70B BF16, run:
|
||||
|
||||
@@ -489,26 +472,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 17 \
|
||||
--global_batch_size 272
|
||||
EXP=examples/megatron/configs/MI355X/llama2_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to DeepSeek-V3.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
|
||||
use the following command:
|
||||
@@ -520,7 +504,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--num_layers 3 \
|
||||
--moe_layer_freq 1 \
|
||||
@@ -533,7 +517,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/deepseek_v3-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--num_layers 3 \
|
||||
--moe_layer_freq 1 \
|
||||
@@ -543,7 +532,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to DeepSeek-V2-Lite.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
|
||||
use the following command:
|
||||
@@ -555,27 +544,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 12 \
|
||||
--global_batch_size 768
|
||||
EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--global_batch_size 256
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||
use the following command:
|
||||
@@ -587,18 +576,20 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 256
|
||||
EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
@@ -606,7 +597,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x22B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
|
||||
use the following command:
|
||||
@@ -618,20 +609,20 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 4 \
|
||||
--pipeline_model_parallel_size 1 \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 16
|
||||
EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 4 \
|
||||
@@ -643,7 +634,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for Qwen 2.5 7B BF16, use the following
|
||||
command:
|
||||
@@ -655,20 +646,21 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 16 \
|
||||
--global_batch_size 768
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_7B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_7B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
For FP8, use the following command.
|
||||
|
||||
@@ -679,28 +671,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
--micro_batch_size 20 \
|
||||
--global_batch_size 800
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_7B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_7B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 72B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
|
||||
|
||||
@@ -711,7 +702,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 16 \
|
||||
@@ -722,11 +713,15 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
.. _amd-primus-megatron-multi-node-examples-v259:
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_72B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. _amd-primus-megatron-multi-node-examples-v25.11:
|
||||
|
||||
Multi-node training examples
|
||||
----------------------------
|
||||
@@ -740,28 +735,27 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
{% set docker = data.docker %}
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
|
||||
cd Primus
|
||||
git checkout c4c083de64ba3e8f19ccc9629411267108931f9e
|
||||
git submodule update --init --recursive
|
||||
|
||||
.. code-block:: shell
|
||||
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||
export HF_TOKEN=<your_HF_token>
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
|
||||
|
||||
git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
|
||||
cd Primus
|
||||
git checkout e16b27b
|
||||
|
||||
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||
export HF_TOKEN=<your_HF_token>
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
|
||||
{% endfor %}
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -769,13 +763,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
* If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across different clusters, it is encouraged to explicitly export your NCCL parameters for the cluster.
|
||||
* To find your network interface, you can use ``ip a``.
|
||||
* To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
|
||||
* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v259`) as appropriate.
|
||||
* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v25.11`) as appropriate.
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 8B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 3.1 8B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -784,16 +778,15 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_slurm_pretrain.sh \
|
||||
--global_batch_size 1024 \
|
||||
--fp8 hybrid
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 2 7B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -802,16 +795,15 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_slurm_pretrain.sh \
|
||||
--global_batch_size 2048 \
|
||||
--fp8 hybrid
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 3.1 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -820,20 +812,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 80 \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 3.1 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 256 \
|
||||
@@ -843,7 +833,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 2 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -852,20 +842,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama2_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 10 \
|
||||
--global_batch_size 640 \
|
||||
--recompute_num_layers 80 \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 2 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 1536 \
|
||||
@@ -875,7 +863,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.3 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 3.3 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -884,20 +872,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.3_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 80 \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 3.3 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 256 \
|
||||
@@ -907,7 +893,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Mixtral 8x7B BF16 on 8 nodes, run:
|
||||
|
||||
@@ -916,7 +902,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 256
|
||||
@@ -925,7 +911,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 72B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Qwen2.5 72B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -934,15 +920,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-FP8-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 8 \
|
||||
--global_batch_size 512 \
|
||||
--recompute_num_layers 80 \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
.. _amd-primus-megatron-lm-benchmark-test-vars-v259:
|
||||
.. _amd-primus-megatron-lm-benchmark-test-vars-v25.11:
|
||||
|
||||
Key options
|
||||
-----------
|
||||
@@ -987,7 +971,10 @@ num_layers
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
DeepSeekV3 proxy model and Mixtral 8x22B proxy model may exit with an error
|
||||
due to a memory free issue. However, this does not impact training runs. All
|
||||
iterations, in this case 50, should have been completed before the exit and
|
||||
the results should be available at the end.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
@@ -29,12 +29,10 @@ with Primus Turbo optimizations.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -42,13 +40,12 @@ with Primus Turbo optimizations.
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v259:
|
||||
.. _amd-primus-pytorch-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -67,7 +64,7 @@ vary by model -- select one to get started.
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
@@ -94,7 +91,7 @@ vary by model -- select one to get started.
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the documentation :doc:`pytorch-training` (without Primus)
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v259:
|
||||
.. _amd-primus-pytorch-performance-measurements-v25.11:
|
||||
|
||||
System validation
|
||||
=================
|
||||
@@ -120,20 +117,11 @@ Pull the Docker image
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ data.docker.pull_tag }}
|
||||
|
||||
Run training
|
||||
============
|
||||
@@ -145,7 +133,7 @@ tweak some configurations (such as batch sizes).
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = data.docker %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. tab-set::
|
||||
@@ -158,7 +146,7 @@ tweak some configurations (such as batch sizes).
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -185,13 +173,6 @@ tweak some configurations (such as batch sizes).
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, Primus torchtitan models are run with Primus Turbo
|
||||
enabled for enhanced performance. To disable Primus Turbo,
|
||||
modify respective configuration file
|
||||
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
@@ -203,48 +184,34 @@ tweak some configurations (such as batch sizes).
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
@@ -253,6 +220,9 @@ tweak some configurations (such as batch sizes).
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
The Docker container hosts verified commit ``c4c083de`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
@@ -283,75 +253,56 @@ tweak some configurations (such as batch sizes).
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 7
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
@@ -364,36 +315,57 @@ tweak some configurations (such as batch sizes).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
To train Llama 3.1 70B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_train_deepseek-v3-16b
|
||||
|
||||
Use the following command to train DeepSeek V3 16B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
@@ -401,151 +373,27 @@ tweak some configurations (such as batch sizes).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 10
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 3
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone torchtitan benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. Navigate to the ``torchtitan`` workspace directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/torchtitan
|
||||
|
||||
.. rubric:: Download the tokenizer
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Download the tokenizer for your model.
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 scripts/download_tokenizer.py \
|
||||
--repo_id {{ model.model_repo }} \
|
||||
--tokenizer_path "original" \
|
||||
--hf_token=${HF_TOKEN}
|
||||
|
||||
.. rubric:: Pretraining examples
|
||||
|
||||
Run the training script with the appropriate configuration file.
|
||||
|
||||
To train with BF16 precision, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.bf16 }} \
|
||||
./run_train.sh
|
||||
|
||||
To train with FP8 precision, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.fp8 }} \
|
||||
./run_train.sh
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -27,12 +27,10 @@ training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -40,13 +38,12 @@ training workloads:
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support-v259:
|
||||
.. _amd-pytorch-training-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -88,7 +85,7 @@ one to get started.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes-v259:
|
||||
.. _amd-pytorch-training-supported-training-modes-v25.11:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
@@ -123,7 +120,7 @@ The following table lists supported training modes per model.
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements-v259:
|
||||
.. _amd-pytorch-training-performance-measurements-v25.11:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -164,7 +161,7 @@ Run training
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = data.docker %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
@@ -179,7 +176,7 @@ Run training
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -217,7 +214,7 @@ Run training
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
@@ -226,42 +223,28 @@ Run training
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
@@ -419,11 +402,34 @@ Run training
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Pre-training
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To start the pre-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
{% if model.mad_tag == "pyt_train_dlrm" %}
|
||||
|
||||
1. Go to the DLRM directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/DLRMBenchmark
|
||||
|
||||
2. To run the single-node training benchmark for DLRM-v2 with TF32 precision,
|
||||
run the following script.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./launch_training_single_node.sh
|
||||
|
||||
To run with MAD within the Docker container, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -m DLRM
|
||||
|
||||
{% else %}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
@@ -466,6 +472,7 @@ Run training
|
||||
* - ``$sequence_length``
|
||||
- Sequence length for the language model.
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{% set training_modes = model.training_modes %}
|
||||
@@ -525,7 +532,7 @@ Run training
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v25.11>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
@@ -590,7 +597,7 @@ Run training
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples-v259:
|
||||
.. _amd-pytorch-training-multinode-examples-v25.11:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
@@ -639,11 +646,6 @@ To launch the training job on a SLURM cluster for Llama 3.3 70B, run the followi
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -43,8 +43,6 @@ subtrees:
|
||||
title: DGL compatibility
|
||||
- file: compatibility/ml-compatibility/megablocks-compatibility.rst
|
||||
title: Megablocks compatibility
|
||||
- file: compatibility/ml-compatibility/taichi-compatibility.rst
|
||||
title: Taichi compatibility
|
||||
- file: compatibility/ml-compatibility/ray-compatibility.rst
|
||||
title: Ray compatibility
|
||||
- file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
|
||||
@@ -77,8 +75,14 @@ subtrees:
|
||||
- entries:
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
|
||||
title: Train a model with Primus and Megatron-LM
|
||||
entries:
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
|
||||
title: Train a model with Megatron-LM
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
|
||||
title: Train a model with Primus and PyTorch
|
||||
entries:
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
|
||||
title: Train a model with PyTorch
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
|
||||
title: Train a model with JAX MaxText
|
||||
- file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
|
||||
@@ -117,6 +121,8 @@ subtrees:
|
||||
title: SGLang inference performance testing
|
||||
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
|
||||
title: SGLang distributed inference with Mooncake
|
||||
- file: how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
|
||||
title: xDiT diffusion inference
|
||||
- file: how-to/rocm-for-ai/inference/deploy-your-model.rst
|
||||
title: Deploy your model
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
rocm-docs-core==1.29.0
|
||||
rocm-docs-core==1.31.1
|
||||
sphinx-reredirects
|
||||
sphinx-sitemap
|
||||
sphinxcontrib.datatemplates==0.11.0
|
||||
|
||||
@@ -132,6 +132,7 @@ nest-asyncio==1.6.0
|
||||
packaging==25.0
|
||||
# via
|
||||
# ipykernel
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
parso==0.8.5
|
||||
# via jedi
|
||||
@@ -149,7 +150,7 @@ pure-eval==0.2.3
|
||||
# via stack-data
|
||||
pycparser==2.23
|
||||
# via cffi
|
||||
pydata-sphinx-theme==0.16.1
|
||||
pydata-sphinx-theme==0.15.4
|
||||
# via
|
||||
# rocm-docs-core
|
||||
# sphinx-book-theme
|
||||
@@ -163,7 +164,7 @@ pygments==2.19.2
|
||||
# sphinx
|
||||
pyjwt[crypto]==2.10.1
|
||||
# via pygithub
|
||||
pynacl==1.6.1
|
||||
pynacl==1.6.2
|
||||
# via pygithub
|
||||
python-dateutil==2.9.0.post0
|
||||
# via jupyter-client
|
||||
@@ -187,7 +188,7 @@ requests==2.32.5
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.29.0
|
||||
rocm-docs-core==1.31.1
|
||||
# via -r requirements.in
|
||||
rpds-py==0.29.0
|
||||
# via
|
||||
@@ -217,7 +218,7 @@ sphinx==8.1.3
|
||||
# sphinx-reredirects
|
||||
# sphinxcontrib-datatemplates
|
||||
# sphinxcontrib-runcmd
|
||||
sphinx-book-theme==1.1.3
|
||||
sphinx-book-theme==1.1.4
|
||||
# via rocm-docs-core
|
||||
sphinx-copybutton==0.5.2
|
||||
# via rocm-docs-core
|
||||
@@ -281,7 +282,7 @@ typing-extensions==4.15.0
|
||||
# pygithub
|
||||
# referencing
|
||||
# sqlalchemy
|
||||
urllib3==2.5.0
|
||||
urllib3==2.6.3
|
||||
# via
|
||||
# pygithub
|
||||
# requests
|
||||
|
||||
@@ -123,8 +123,7 @@ Performance
|
||||
|
||||
.. note::
|
||||
|
||||
`ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`.
|
||||
Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
|
||||
`ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is a tool for visualizing and analyzing GPU thread trace data collected using :doc:`rocprofv3 <rocprofiler-sdk:index>`. Note that `ROCprof Compute Viewer <https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/amd-mainline/>`_ is in an early access state. Running production workloads is not recommended.
|
||||
|
||||
Development
|
||||
^^^^^^^^^^^
|
||||
|
||||
Reference in New Issue
Block a user