diff --git a/.azuredevops/components/hipSPARSELt.yml b/.azuredevops/components/hipSPARSELt.yml
index 02e258f78..9f918db62 100644
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -40,6 +40,7 @@ parameters:
- gfortran
- libgfortran5
- libopenblas-dev
+ - liblapack-dev
- name: pipModules
type: object
default:
@@ -125,10 +126,13 @@ jobs:
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+ # NOTE: content between `---` is for transition support between old/new build systems
+ # and should be removed once transition is complete.
+ # -----------------------------
# Build and install gtest and lapack
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
- - script: mkdir $(Pipeline.Workspace)/deps
+ - script: mkdir -p $(Pipeline.Workspace)/deps
displayName: Create temp folder for external dependencies
# hipSPARSELt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipSPARSELt/blob/develop/deps/CMakeLists.txt
@@ -144,22 +148,35 @@ jobs:
- script: sudo make install
displayName: Install hipSPARSELt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
+ # -----------------------------
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
+ # NOTE: the following options are old build only
+ # and can be removed after full transition to new build
+ # -DAMDGPU_TARGETS=${{ job.target }}
+ # -DCMAKE_Fortran_COMPILER=f95
+ # -DTensile_LOGIC=
+ # -DTensile_CPU_THREADS=
+ # -DTensile_LIBRARY_FORMAT=msgpack
+ # -DROCM_PATH=$(Agent.BuildDirectory)/rocm
+ # -DBUILD_CLIENTS_TESTS=ON
+ # -DBUILD_USE_LOCAL_TENSILE=OFF
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
- -DCMAKE_Fortran_COMPILER=f95
+ -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
+ -DGPU_TARGETS=${{ job.target }}
-DAMDGPU_TARGETS=${{ job.target }}
+ -DCMAKE_Fortran_COMPILER=f95
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_LIBRARY_FORMAT=msgpack
- -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_USE_LOCAL_TENSILE=OFF
+ -DHIPSPARSELT_ENABLE_FETCH=ON
-GNinja
${{ if ne(parameters.sparseCheckoutDir, '') }}:
cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
diff --git a/.wordlist.txt b/.wordlist.txt
index c9b592658..982320da9 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -72,6 +72,7 @@ CU
CUDA
CUs
CXX
+CX
Cavium
CentOS
ChatGPT
@@ -118,6 +119,8 @@ Dependabot
Deprecations
DevCap
DirectX
+Disaggregated
+disaggregated
Dockerfile
Dockerized
Doxygen
@@ -127,6 +130,7 @@ ENDPGM
EPYC
ESXi
EoS
+etcd
fas
FBGEMM
FIFOs
@@ -178,6 +182,7 @@ GPUs
Graphbolt
GraphSage
GRBM
+GRE
GenAI
GenZ
GitHub
@@ -301,6 +306,7 @@ MirroredStrategy
Mixtral
MosaicML
MoEs
+Mooncake
Mpops
Multicore
Multithreaded
@@ -445,6 +451,7 @@ SKU
SKUs
SLES
SLURM
+Slurm
SMEM
SMFMA
SMI
@@ -615,6 +622,7 @@ coalescable
codename
collater
comgr
+compat
completers
composable
concretization
@@ -776,6 +784,7 @@ lossy
macOS
matchers
maxtext
+megablocks
megatron
microarchitecture
migraphx
@@ -934,6 +943,7 @@ softmax
spack
spmm
src
+stanford
stochastically
strided
subcommand
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cab6aca80..09d290c16 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -764,7 +764,7 @@ HIP runtime has the following functional improvements which improves runtime per
#### Changed
-* VX_RPP extension : Version 3.1.0 release.
+* VX_RPP extension: Version 3.1.0 release.
* Update the parameters and kernel API of Blur, Fog, Jitter, LensCorrection, Rain, Pixelate, Vignette and ResizeCrop wrt tensor kernels replacing the legacy BatchPD API calls in VX_RPP extensions.
#### Known issues
@@ -1144,7 +1144,7 @@ Review the [README](https://github.com/ROCm/rocm_bandwidth_test/blob/amd-mainlin
#### Changed
* Completed migration from legacy [ROCProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/) to [ROCprofiler-SDK](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/).
-* Reorganized the configuration files internally and improved [README/installation](https://github.com/ROCm/rdc/blob/amd-staging/README.md) instructions.
+* Reorganized the configuration files internally and improved [README/installation](https://github.com/ROCm/rdc/blob/release/rocm-rel-7.0/README.md) instructions.
* Updated metrics and monitoring support for the latest AMD data center GPUs.
#### Optimized
@@ -1346,7 +1346,7 @@ The previous default accumulator types could lead to situations in which unexpec
- ROCprof Trace Decoder as experimental API:
- Requires [ROCprof Trace Decoder plugin](https://github.com/rocm/rocprof-trace-decoder).
- Thread trace option in the `rocprofv3` tool under the `--att` parameters:
- - See [using thread trace with rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/amd-mainline/how-to/using-thread-trace.html)
+ - See [using thread trace with rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/docs-7.0.0/how-to/using-thread-trace.html)
- Requires [ROCprof Trace Decoder plugin](https://github.com/rocm/rocprof-trace-decoder).
- `rocpd` output format documentation:
- Requires [ROCprof Trace Decoder plugin](https://github.com/rocm/rocprof-trace-decoder).
diff --git a/RELEASE.md b/RELEASE.md
index 5b0ede98c..7f9ec1c23 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -461,8 +461,9 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
benchmarking guides have been updated with expanded model coverage and
optimized Docker environments. Highlights include:
- * The [Training a model with Primus and Megatron](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.html) benchmarking guide
- now leverages the unified AMD Primus framework with the Megatron backend. See [Primus: A Lightweight, Unified Training Framework for Large Models on AMD
+ * The [Training a model with Primus and Megatron](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.html)
+ and [Training a model with Primus and PyTorch](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.html) benchmarking guides
+ now leverage the unified AMD Primus framework with the Megatron and torchtitan backends. See [Primus: A Lightweight, Unified Training Framework for Large Models on AMD
GPUs](https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html) for an introduction to Primus.
* The [Training a model with PyTorch](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html) benchmarking guide
@@ -471,6 +472,9 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
* The [Training a model with JAX MaxText](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html) benchmarking guide
now supports [MAD](https://github.com/ROCm/MAD)-integrated benchmarking. The MaxText training environment now uses JAX 0.6.0 or 0.5.0. FP8 quantized training is supported with JAX 0.5.0.
+ * The [SGLang distributed inference](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.html?model=llama-3.1-8b-instruct) guide
+ provides a recipe to get started with disaggregated prefill/decode inference.
+
* The [vLLM inference performance testing](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/benchmark-docker/vllm.html) documentation
now features clearer serving and throughput benchmarking commands -- for improved transparency of model benchmarking configurations. The vLLM inference
environment now uses vLLM 0.10.1 and includes improved default configurations.
@@ -534,17 +538,16 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
### User space, driver, and firmware dependent changes
-The software for AMD Datacenter GPU products requires maintaining a hardware
-and software stack with interdependencies between the GPU and baseboard
-firmware, AMD GPU drivers, and the ROCm user space software.
-
+Running GPU software on AMD data center GPUs requires maintaining a coordinated
+hardware and software stack. This stack has interdependencies between the GPU
+and baseboard firmware, AMD GPU drivers, and the ROCm user-space software.
As of the ROCm 7.0.0 release, these interdependencies are publicly documented.
-Note that while AMD publishes drivers and ROCm user space, your server or
+While AMD publishes drivers and ROCm user space components, your server or
infrastructure provider publishes the GPU and baseboard firmware by bundling
AMD’s firmware releases via AMD’s Platform Level Data Model (PLDM) bundle,
which includes Integrated Firmware Image (IFWI).
-GPU and baseboard firmware versioning might differ across GPU families. With the
+GPU and baseboard firmware versioning might differ across GPU families. Note that with the
ROCm 7.0.0 release, the AMD GPU driver (amdgpu) is now versioned separately
from ROCm. See [AMD GPU Driver/ROCm packaging separation](#amd-gpu-driver-rocm-packaging-separation).
diff --git a/docs/about/license.md b/docs/about/license.md
index f4e4b8776..8c2c36f22 100644
--- a/docs/about/license.md
+++ b/docs/about/license.md
@@ -29,7 +29,7 @@ additional licenses. Please review individual repositories for more information.
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
-| [AQLprofile] | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
+| [AQLprofile](https://github.com/rocm/aqlprofile/) | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
diff --git a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
index 037fb9a89..8dfededdb 100644
--- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
@@ -72,7 +72,7 @@ the |docker-icon| icon to view the image on Docker Hub.
rocm/tensorflow
- - `tensorflow-rocm 2.18.1 `__
+ - `tensorflow-rocm 2.18.1 `__
- 24.04
- `Python 3.12 `__
- `TensorBoard 2.18.0 `__
@@ -81,7 +81,7 @@ the |docker-icon| icon to view the image on Docker Hub.
rocm/tensorflow
- - `tensorflow-rocm 2.18.1 `__
+ - `tensorflow-rocm 2.18.1 `__
- 22.04
- `Python 3.10 `__
- `TensorBoard 2.18.0 `__
@@ -90,7 +90,7 @@ the |docker-icon| icon to view the image on Docker Hub.
rocm/tensorflow
- - `tensorflow-rocm 2.17.1 `__
+ - `tensorflow-rocm 2.17.1 `__
- 24.04
- `Python 3.12 `__
- `TensorBoard 2.17.1 `__
@@ -99,7 +99,7 @@ the |docker-icon| icon to view the image on Docker Hub.
rocm/tensorflow
- - `tensorflow-rocm 2.17.1 `__
+ - `tensorflow-rocm 2.17.1 `__
- 22.04
- `Python 3.10 `__
- `TensorBoard 2.17.1 `__
@@ -108,7 +108,7 @@ the |docker-icon| icon to view the image on Docker Hub.
rocm/tensorflow
- - `tensorflow-rocm 2.16.2 `__
+ - `tensorflow-rocm 2.16.2 `__
- 24.04
- `Python 3.12 `__
- `TensorBoard 2.16.2 `__
@@ -117,7 +117,7 @@ the |docker-icon| icon to view the image on Docker Hub.
rocm/tensorflow
- - `tensorflow-rocm 2.16.2 `__
+ - `tensorflow-rocm 2.16.2 `__
- 22.04
- `Python 3.10 `__
- `TensorBoard 2.16.2 `__
diff --git a/docs/conf.py b/docs/conf.py
index 3b25cf9d6..fcc679678 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -135,9 +135,13 @@ article_pages = [
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
+ {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
+ {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
+ {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
+ {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
@@ -162,6 +166,8 @@ article_pages = [
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
+ {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
+ {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
diff --git a/docs/data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
new file mode 100644
index 000000000..b0ba6a549
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
@@ -0,0 +1,32 @@
+dockers:
+ - pull_tag: lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
+ docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729
+ components:
+ ROCm: 7.0.0
+ SGLang: v0.5.2rc1
+ pytorch-triton-rocm: 3.4.0+rocm7.0.0.gitf9e5bf54
+model_groups:
+ - group: Dense models
+ tag: dense-models
+ models:
+ - model: Llama 3.1 8B Instruct
+ model_repo: Llama-3.1-8B-Instruct
+ url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+ - model: Llama 3.1 405B FP8 KV
+ model_repo: Llama-3.1-405B-Instruct-FP8-KV
+ url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+ - model: Llama 3.3 70B FP8 KV
+ model_repo: amd-Llama-3.3-70B-Instruct-FP8-KV
+ url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
+ - model: Qwen3 32B
+ model_repo: Qwen3-32B
+ url: https://huggingface.co/Qwen/Qwen3-32B
+ - group: Small experts models
+ tag: small-experts-models
+ models:
+ - model: DeepSeek V3
+ model_repo: DeepSeek-V3
+ url: https://huggingface.co/deepseek-ai/DeepSeek-V3
+ - model: Mixtral 8x7B v0.1
+ model_repo: Mixtral-8x7B-v0.1
+ url: https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
new file mode 100644
index 000000000..dc19843be
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
@@ -0,0 +1,162 @@
+dockers:
+ - pull_tag: rocm/pytorch-training:v25.7
+ docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+ components:
+ ROCm: 6.4.2
+ PyTorch: 2.8.0a0+gitd06a406
+ Python: 3.10.18
+ Transformer Engine: 2.2.0.dev0+94e53dd8
+ Flash Attention: 3.0.0.post1
+ hipBLASLt: 1.1.0-4b9a52edfc
+ Triton: 3.3.0
+model_groups:
+ - group: Meta Llama
+ tag: llama
+ models:
+ - model: Llama 4 Scout 17B-16E
+ mad_tag: pyt_train_llama-4-scout-17b-16e
+ model_repo: Llama-4-17B_16E
+ url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - model: Llama 3.3 70B
+ mad_tag: pyt_train_llama-3.3-70b
+ model_repo: Llama-3.3-70B
+ url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+ - model: Llama 3.2 1B
+ mad_tag: pyt_train_llama-3.2-1b
+ model_repo: Llama-3.2-1B
+ url: https://huggingface.co/meta-llama/Llama-3.2-1B
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - model: Llama 3.2 3B
+ mad_tag: pyt_train_llama-3.2-3b
+ model_repo: Llama-3.2-3B
+ url: https://huggingface.co/meta-llama/Llama-3.2-3B
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - model: Llama 3.2 Vision 11B
+ mad_tag: pyt_train_llama-3.2-vision-11b
+ model_repo: Llama-3.2-Vision-11B
+ url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+ precision: BF16
+ training_modes: [finetune_fw]
+ - model: Llama 3.2 Vision 90B
+ mad_tag: pyt_train_llama-3.2-vision-90b
+ model_repo: Llama-3.2-Vision-90B
+ url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+ precision: BF16
+ training_modes: [finetune_fw]
+ - model: Llama 3.1 8B
+ mad_tag: pyt_train_llama-3.1-8b
+ model_repo: Llama-3.1-8B
+ url: https://huggingface.co/meta-llama/Llama-3.1-8B
+ precision: BF16
+ training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
+ - model: Llama 3.1 70B
+ mad_tag: pyt_train_llama-3.1-70b
+ model_repo: Llama-3.1-70B
+ url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+ precision: BF16
+ training_modes: [pretrain, finetune_fw, finetune_lora]
+ - model: Llama 3.1 405B
+ mad_tag: pyt_train_llama-3.1-405b
+ model_repo: Llama-3.1-405B
+ url: https://huggingface.co/meta-llama/Llama-3.1-405B
+ precision: BF16
+ training_modes: [finetune_qlora]
+ - model: Llama 3 8B
+ mad_tag: pyt_train_llama-3-8b
+ model_repo: Llama-3-8B
+ url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - model: Llama 3 70B
+ mad_tag: pyt_train_llama-3-70b
+ model_repo: Llama-3-70B
+ url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - model: Llama 2 7B
+ mad_tag: pyt_train_llama-2-7b
+ model_repo: Llama-2-7B
+ url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+ - model: Llama 2 13B
+ mad_tag: pyt_train_llama-2-13b
+ model_repo: Llama-2-13B
+ url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - model: Llama 2 70B
+ mad_tag: pyt_train_llama-2-70b
+ model_repo: Llama-2-70B
+ url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+ precision: BF16
+ training_modes: [finetune_lora, finetune_qlora]
+ - group: OpenAI
+ tag: openai
+ models:
+ - model: GPT OSS 20B
+ mad_tag: pyt_train_gpt_oss_20b
+ model_repo: GPT-OSS-20B
+ url: https://huggingface.co/openai/gpt-oss-20b
+ precision: BF16
+ training_modes: [HF_finetune_lora]
+ - model: GPT OSS 120B
+ mad_tag: pyt_train_gpt_oss_120b
+ model_repo: GPT-OSS-120B
+ url: https://huggingface.co/openai/gpt-oss-120b
+ precision: BF16
+ training_modes: [HF_finetune_lora]
+ - group: Qwen
+ tag: qwen
+ models:
+ - model: Qwen 3 8B
+ mad_tag: pyt_train_qwen3-8b
+ model_repo: Qwen3-8B
+ url: https://huggingface.co/Qwen/Qwen3-8B
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - model: Qwen 3 32B
+ mad_tag: pyt_train_qwen3-32b
+ model_repo: Qwen3-32
+ url: https://huggingface.co/Qwen/Qwen3-32B
+ precision: BF16
+ training_modes: [finetune_lora]
+ - model: Qwen 2.5 32B
+ mad_tag: pyt_train_qwen2.5-32b
+ model_repo: Qwen2.5-32B
+ url: https://huggingface.co/Qwen/Qwen2.5-32B
+ precision: BF16
+ training_modes: [finetune_lora]
+ - model: Qwen 2.5 72B
+ mad_tag: pyt_train_qwen2.5-72b
+ model_repo: Qwen2.5-72B
+ url: https://huggingface.co/Qwen/Qwen2.5-72B
+ precision: BF16
+ training_modes: [finetune_lora]
+ - model: Qwen 2 1.5B
+ mad_tag: pyt_train_qwen2-1.5b
+ model_repo: Qwen2-1.5B
+ url: https://huggingface.co/Qwen/Qwen2-1.5B
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - model: Qwen 2 7B
+ mad_tag: pyt_train_qwen2-7b
+ model_repo: Qwen2-7B
+ url: https://huggingface.co/Qwen/Qwen2-7B
+ precision: BF16
+ training_modes: [finetune_fw, finetune_lora]
+ - group: Flux
+ tag: flux
+ models:
+ - model: FLUX.1-dev
+ mad_tag: pyt_train_flux
+ model_repo: Flux
+ url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+ precision: BF16
+ training_modes: [pretrain]
diff --git a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
new file mode 100644
index 000000000..59cf25471
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
@@ -0,0 +1,24 @@
+dockers:
+ - pull_tag: rocm/pytorch-training:v25.8
+ docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
+ components:
+ ROCm: 6.4.3
+ PyTorch: 2.8.0a0+gitd06a406
+ Python: 3.10.18
+ Transformer Engine: 2.2.0.dev0+a1e66aae
+ Flash Attention: 3.0.0.post1
+ hipBLASLt: 1.1.0-d1b517fc7a
+model_groups:
+ - group: Meta Llama
+ tag: llama
+ models:
+ - model: Llama 3.1 8B
+ mad_tag: primus_pyt_train_llama-3.1-8b
+ model_repo: Llama-3.1-8B
+ url: https://huggingface.co/meta-llama/Llama-3.1-8B
+ precision: BF16
+ - model: Llama 3.1 70B
+ mad_tag: primus_pyt_train_llama-3.1-70b
+ model_repo: Llama-3.1-70B
+ url: https://huggingface.co/meta-llama/Llama-3.1-70B
+ precision: BF16
diff --git a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
index dc19843be..ba45d16ee 100644
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,14 +1,13 @@
dockers:
- - pull_tag: rocm/pytorch-training:v25.7
- docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+ - pull_tag: rocm/pytorch-training:v25.8
+ docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
components:
- ROCm: 6.4.2
+ ROCm: 6.4.3
PyTorch: 2.8.0a0+gitd06a406
Python: 3.10.18
- Transformer Engine: 2.2.0.dev0+94e53dd8
+ Transformer Engine: 2.2.0.dev0+a1e66aae
Flash Attention: 3.0.0.post1
- hipBLASLt: 1.1.0-4b9a52edfc
- Triton: 3.3.0
+ hipBLASLt: 1.1.0-d1b517fc7a
model_groups:
- group: Meta Llama
tag: llama
@@ -151,6 +150,15 @@ model_groups:
url: https://huggingface.co/Qwen/Qwen2-7B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
+ - group: Stable Diffusion
+ tag: sd
+ models:
+ - model: Stable Diffusion XL
+ mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
+ model_repo: SDXL
+ url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
+ precision: BF16
+ training_modes: [finetune_lora]
- group: Flux
tag: flux
models:
@@ -160,3 +168,11 @@ model_groups:
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [pretrain]
+ - group: NCF
+ tag: ncf
+ models:
+ - model: NCF
+ mad_tag: pyt_ncf_training
+ model_repo:
+ url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
+ precision: FP32
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
index 857a1ee0b..2fbd21002 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -16,103 +16,112 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub `
+ * `Docker Hub `__
+
+ * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
+ -
* ROCm 6.4.1
* vLLM 0.10.0
* PyTorch 2.7.0
- -
- * :doc:`Documentation <../vllm>`
+ -
+ * :doc:`Documentation <vllm-0.10.0-20250812>`
* `Docker Hub `__
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
- -
+ -
* ROCm 6.4.1
* vLLM 0.9.1
* PyTorch 2.7.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
- -
+ -
* ROCm 6.4.1
* vLLM 0.9.1
* PyTorch 2.7.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605``
- -
+ -
* ROCm 6.4.1
* vLLM 0.9.0.1
* PyTorch 2.7.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521``
- -
+ -
* ROCm 6.3.1
* 0.8.5 vLLM (0.8.6.dev)
* PyTorch 2.7.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513``
- -
+ -
* ROCm 6.3.1
* vLLM 0.8.5
* PyTorch 2.7.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415``
- -
+ -
* ROCm 6.3.1
* vLLM 0.8.3
* PyTorch 2.7.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325``
- -
+ -
* ROCm 6.3.1
* vLLM 0.7.3
* PyTorch 2.7.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6``
- -
+ -
* ROCm 6.3.1
* vLLM 0.6.6
* PyTorch 2.7.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4``
- -
+ -
* ROCm 6.2.1
* vLLM 0.6.4
* PyTorch 2.5.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
* - ``rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50``
- -
+ -
* ROCm 6.2.0
* vLLM 0.4.3
* PyTorch 2.4.0
- -
+ -
* :doc:`Documentation `
* `Docker Hub `__
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
new file mode 100644
index 000000000..723dedb03
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
@@ -0,0 +1,257 @@
+.. meta::
+ :description: SGLang multi-node disaggregated distributed inference using Mooncake
+ :keywords: model, sglang, mooncake, disagg, disaggregated, distributed, multi-node, docker
+
+******************************************
+SGLang distributed inference with Mooncake
+******************************************
+
+As LLM inference increasingly demands handling massive models and dynamic workloads, efficient
+distributed inference becomes essential. Traditional co-located architectures face bottlenecks due
+to tightly coupled memory and compute resources, which limits scalability and flexibility.
+Disaggregated inference splits LLM inference into distinct prefill and decode
+phases. This architecture, facilitated by libraries like Mooncake, uses high-bandwidth
+RDMA to transfer the Key-Value (KV) cache between prefill and decode nodes.
+This allows for independent resource scaling and optimization, resulting in
+improved efficiency and throughput.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
+
+ {% set docker = data.dockers[0] %}
+
+ `SGLang `__ is a high-performance inference and
+ serving engine for large language models (LLMs) and vision models. The
+ ROCm-enabled `SGLang base Docker image <{{ docker.docker_hub_url }}>`__
+ bundles SGLang with PyTorch, which is optimized for AMD Instinct MI300X series
+ accelerators. It includes the following software components:
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Software component
+ - Version
+
+ {% for component_name, component_version in docker.components.items() %}
+ * - {{ component_name }}
+ - {{ component_version }}
+ {% endfor %}
+
+The following sections guide you through setting up and running SGLang and Mooncake
+for disaggregated distributed inference on a Slurm cluster using AMD Instinct MI300X
+series accelerators backed by Mellanox CX-7 NICs.
+
+Prerequisites
+=============
+
+Before starting, ensure you have:
+
+* A Slurm cluster with at least three nodes: one for the proxy, one for prefill (``xP``), and one for decode (``yD``).
+
+ ``Nodes = xP + yD + 1``
+
+ For example, a 2P2D deployment (two prefill and two decode nodes) requires five nodes in total.
+
+* A Dockerized environment with SGLang, Mooncake, etcd, and NIC drivers built in. See :ref:`sglang-disagg-inf-build-docker-image` for instructions.
+
+* A shared filesystem for storing models, scripts, and logs (cluster-specific).
+
+Supported models
+================
+
+The following models are supported for SGLang disaggregated prefill/decode
+inference. Some instructions, commands, and recommendations in this
+documentation might vary by selected model.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
+
+ {% set model_groups = data.model_groups %}
+ .. raw:: html
+
+
+
+
Model type
+
+ {% for model_group in model_groups %}
+
{{ model_group.group }}
+ {% endfor %}
+
+
+
+
+
Model
+
+ {% for model_group in model_groups %}
+ {% set models = model_group.models %}
+ {% for model in models %}
+ {% if models|length % 3 == 0 %}
+
{{ model.model }}
+ {% else %}
+
{{ model.model }}
+ {% endif %}
+ {% endfor %}
+ {% endfor %}
+
+
+
+
+ {% for model_group in model_groups %}
+ {% for model in model_group.models %}
+
+ .. container:: model-doc {{ model.model_repo }}
+
+ .. note::
+
+ See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`__ to learn more about this model.
+ Some models require access authorization prior to use through an external license agreement with a third party.
+
+ {% endfor %}
+ {% endfor %}
+
+.. _sglang-disagg-inf-build-docker-image:
+
+Build the Docker image
+----------------------
+
+Get the Dockerfile located in
+``__.
+It uses `lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
+`__
+as the base Docker image and installs the necessary components for Mooncake, etcd, and Mellanox network
+drivers.
+
+.. code-block:: shell
+
+ git clone https://github.com/ROCm/MAD.git
+ cd MAD/docker
+ docker build \
+ -t sglang_disagg_pd_image \
+ -f sglang_disagg_inference.ubuntu.amd.Dockerfile .
+
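+Once the image builds successfully, a quick sanity check -- not part of the
+official workflow -- is to confirm it exists locally before submitting Slurm jobs:
+
+.. code-block:: shell
+
+   # List the freshly built image; the tag matches the -t flag used above
+   docker images sglang_disagg_pd_image
+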
+Benchmarking
+============
+
+The ``__
+repository contains scripts to launch SGLang inference with prefill/decode
+disaggregation via Mooncake for supported models.
+
+* `scripts/sglang_disagg/run_xPyD_models.slurm `__
+ -- the main Slurm batch script to launch Docker containers on all nodes using ``sbatch`` or ``salloc``.
+
+* `scripts/sglang_disagg/sglang_disagg_server.sh `__
+ -- the entrypoint script that runs inside each container to start the correct service -- proxy, prefill, or decode.
+
+* `scripts/sglang_disagg/benchmark_xPyD.sh `__
+ -- the benchmark script that runs the GSM8K accuracy benchmark and the SGLang benchmarking tool for performance measurement.
+
+* `scripts/sglang_disagg/benchmark_parser.py `__
+ -- the log parser script to run on the concurrency benchmark log file and generate tabulated data.
+
+Launch the service
+------------------
+
+The service is deployed using a Slurm batch script that orchestrates the containers across the
+allocated nodes.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-distributed-benchmark-models.yaml
+
+ {% set model_groups = data.model_groups %}
+ {% for model_group in model_groups %}
+ {% for model in model_group.models %}
+
+ .. container:: model-doc {{ model.model_repo }}
+
+ .. code-block:: shell
+
+ # Clone the MAD repo if you haven't already and
+ # navigate to the scripts directory
+ git clone https://github.com/ROCm/MAD.git
+ cd MAD/scripts/sglang_disagg/
+
+ # Slurm sbatch run command
+ export DOCKER_IMAGE_NAME=sglang_disagg_pd_image
+ export xP=<number_of_prefill_nodes>
+ export yD=<number_of_decode_nodes>
+ export MODEL_NAME={{ model.model_repo }}
+ # num_nodes = xP + yD + 1
+ sbatch -N <num_nodes> -n <num_nodes> --nodelist=<node_list> run_xPyD_models.slurm
+
+ {% endfor %}
+ {% endfor %}
+
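+For illustration, a hypothetical 3P1D run of Llama 3.1 8B Instruct (three
+prefill nodes, one decode node, plus the proxy) on nodes named ``node[01-05]``
+could look like the following sketch. The node names are placeholders;
+substitute your cluster's hostnames.
+
+.. code-block:: shell
+
+   export DOCKER_IMAGE_NAME=sglang_disagg_pd_image
+   export xP=3
+   export yD=1
+   export MODEL_NAME=Llama-3.1-8B-Instruct
+   # num_nodes = xP + yD + 1 = 5
+   sbatch -N 5 -n 5 --nodelist=node[01-05] run_xPyD_models.slurm
+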
+Post-run logs and testing
+-------------------------
+
+Logs are stored in your shared filesystem in the directory specified by the ``LOG_PATH`` variable in the Slurm script.
+A new directory named after the Slurm job ID is created for each run.
+
+Inside that directory, you can access various logs:
+
+* ``pd_sglang_bench_serving.sh_NODE<...>.log`` -- the main log for each server node.
+
+* ``etcd_NODE<...>.log`` -- logs for etcd services.
+
+* ``prefill_NODE<...>.log`` -- logs for the prefill services.
+
+* ``decode_NODE<...>.log`` -- logs for the decode services.
+
+Use the benchmark parser script on the concurrency benchmark log file to tabulate the results.
+
+.. code-block:: shell
+
+ python3 benchmark_parser.py <path_to_concurrency_log_file>
+
+To verify the service is responsive, you can try sending a ``curl`` request to test the launched
+server from the Docker container on the proxy node. For example:
+
+.. code-block:: shell
+
+ curl -X POST http://127.0.0.1:30000/generate \
+ -H "Content-Type: application/json" \
+ -d '{ "text": "Let me tell you a story ", "sampling_params": { "temperature": 0.3 } }'
+
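+A healthy deployment returns a JSON body. The exact fields vary by SGLang
+version, but the response generally includes the generated text and request
+metadata, along the lines of this abbreviated sketch:
+
+.. code-block:: shell-session
+
+   {"text": " about a brave knight...", "meta_info": {...}}
+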
+Known issues
+============
+
+When running larger models, such as DeepSeek-V3 and Llama-3.1-405B-Instruct-FP8-KV, at
+higher concurrency levels (512+), the following error might occur:
+
+.. code-block:: shell-session
+
+   ...
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+ see ``__.
+
+- See the base upstream Docker image on `Docker Hub `__.
+
+- To learn more about system settings and management practices to configure your system for
+ MI300X series accelerators, see `AMD Instinct MI300X system optimization `__.
+
+- For application performance optimization strategies for HPC and AI workloads,
+ including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+ :doc:`Running models from Hugging Face `.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+ :doc:`Fine-tuning LLMs and inference optimization `.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+ `AMD Infinity Hub `_.
+
+Previous versions
+=================
+
+See :doc:`previous-versions/sglang-history` to find documentation for previous releases
+of SGLang inference performance testing.
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
index f2b060ebd..0802441d8 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -1,6 +1,5 @@
.. meta::
- :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
- ROCm vLLM Docker image.
+ :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
index 76c3582e7..1b6a404fc 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -3,7 +3,7 @@
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
******************************************
-Training a model with JAX MaxText for ROCm
+Training a model with JAX MaxText on ROCm
******************************************
MaxText is a high-performance, open-source framework built on the Google JAX
@@ -406,8 +406,6 @@ benchmark results:
Further reading
===============
-- See the ROCm/maxtext benchmarking README at ``__.
-
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__.
- To learn more about system settings and management practices to configure your system for
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
index 4df1da960..5a2f610d4 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -5,15 +5,13 @@
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
******************************************
-Training a model with Megatron-LM for ROCm
+Training a model with Megatron-LM on ROCm
******************************************
.. caution::
- The ROCm Megatron-LM framework now has limited support with this Docker
- environment; it now focuses on Primus with Megatron-Core. See :doc:`primus-megatron`.
-
- To learn how to migrate your existing workloads to Primus with Megatron-Core,
+ Primus with Megatron supersedes this ROCm Megatron-LM training workflow.
+ To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
The `Megatron-LM framework for ROCm `_ is
@@ -807,9 +805,16 @@ Single node training
AC=none \
SEQ_LEN=4096 \
PAD_LEN=4096 \
- TRAIN_ITERS=50 \
+ TRAIN_ITERS=20 \
bash examples/deepseek_v2/train_deepseekv2.sh
+ .. note::
+
+ DeepSeek-V2-Lite training can be unstable due to GPU memory access faults at
+ high iteration counts. For stability, use Primus for this workload instead.
+ See :doc:`primus-megatron`.
+
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
index c86508103..de9e44a8c 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.rst
@@ -3,7 +3,7 @@
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
******************************************
-Training MPT-30B with LLM Foundry and ROCm
+Training MPT-30B with LLM Foundry on ROCm
******************************************
MPT-30B is a 30-billion parameter decoder-style transformer-based model from
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
index 07d640159..16d5e3f9d 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -4,7 +4,7 @@
PyTorch training performance testing version history
****************************************************
-This table lists previous versions of the ROCm Megatron-LM training Docker image for
-inference performance testing. For detailed information about available models
+This table lists previous versions of the ROCm PyTorch training Docker image for
+training performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation. You can find tagged
previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub `_.
@@ -16,12 +16,21 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
- Components
- Resources
+ * - v25.8 (latest)
+ -
+ * ROCm 6.4.3
+ * PyTorch 2.8.0a0+gitd06a406
+ -
+ * :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
+ * :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
+ * `Docker Hub `__
+
* - v25.7
-
* ROCm 6.4.2
* PyTorch 2.8.0a0+gitd06a406
-
- * :doc:`Documentation <../pytorch-training>`
+ * :doc:`Documentation <pytorch-training-v25.7>`
* `Docker Hub `__
* - v25.6
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7.rst
new file mode 100644
index 000000000..43b9a02e5
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7.rst
@@ -0,0 +1,567 @@
+:orphan:
+
+.. meta::
+ :description: How to train a model using PyTorch for ROCm.
+ :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+ This documentation does not reflect the latest version of the ROCm PyTorch
+ training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
+
+ {% set dockers = data.dockers %}
+ {% set docker = dockers[0] %}
+ The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
+ (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+ model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+ training workloads:
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Software component
+ - Version
+
+ {% for component_name, component_version in docker.components.items() %}
+ * - {{ component_name }}
+ - {{ component_version }}
+ {% endfor %}
+
+.. _amd-pytorch-training-model-support-v257:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
+
+ {% set unified_docker = data.dockers[0] %}
+ {% set model_groups = data.model_groups %}
+ .. raw:: html
+
+
+
+
Model
+
+ {% for model_group in model_groups %}
+
{{ model_group.group }}
+ {% endfor %}
+
+
+
+
+
Variant
+
+ {% for model_group in model_groups %}
+ {% set models = model_group.models %}
+ {% for model in models %}
+ {% if models|length % 3 == 0 %}
+
{{ model.model }}
+ {% else %}
+
{{ model.model }}
+ {% endif %}
+ {% endfor %}
+ {% endfor %}
+
+
+
+
+
+ .. _amd-pytorch-training-supported-training-modes-v257:
+
+ The following table lists supported training modes per model.
+
+ .. dropdown:: Supported training modes
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Model
+ - Supported training modes
+
+ {% for model_group in model_groups %}
+ {% set models = model_group.models %}
+ {% for model in models %}
+ * - {{ model.model }}
+ - ``{{ model.training_modes | join('``, ``') }}``
+
+ {% endfor %}
+ {% endfor %}
+
+ .. note::
+
+ Some model and fine-tuning combinations are not listed. This is
+ because the `upstream torchtune repository `__
+ doesn't provide default YAML configurations for them.
+ For advanced usage, you can create a custom configuration to enable
+ unlisted fine-tuning methods by using an existing file in the
+ ``/workspace/torchtune/recipes/configs`` directory as a template.
+
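+ For example, a custom LoRA fine-tuning run might look like the following
+ sketch. The config file name is illustrative -- use ``tune ls`` to list the
+ recipes and configs available in your container and adapt accordingly:
+
+ .. code-block:: shell
+
+ # Copy an existing recipe config as a starting point (path is illustrative)
+ cp /workspace/torchtune/recipes/configs/llama3_1/8B_lora.yaml my_custom_lora.yaml
+ # Edit my_custom_lora.yaml as needed, then launch the matching recipe
+ tune run lora_finetune_distributed --config my_custom_lora.yaml
+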
+.. _amd-pytorch-training-performance-measurements-v257:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software `_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+ The performance data presented in
+ `Performance results with AMD ROCm software `_
+ should not be interpreted as the peak performance achievable by AMD
+ Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization ` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.
+
+Run training
+============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.7-benchmark-models.yaml
+
+ {% set unified_docker = data.dockers[0] %}
+ {% set model_groups = data.model_groups %}
+
+ Once the setup is complete, choose between two options to start benchmarking training:
+
+ .. tab-set::
+
+ .. tab-item:: MAD-integrated benchmarking
+
+ 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local
+ directory and install the required packages on the host machine.
+
+ .. code-block:: shell
+
+ git clone https://github.com/ROCm/MAD
+ cd MAD
+ pip install -r requirements.txt
+
+ {% for model_group in model_groups %}
+ {% for model in model_group.models %}
+
+ .. container:: model-doc {{ model.mad_tag }}
+
+ 2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+ using one node with the {{ model.precision }} data type on the host machine.
+
+ .. code-block:: shell
+
+ export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+ madengine run \
+ --tags {{ model.mad_tag }} \
+ --keep-model-dir \
+ --live-output \
+ --timeout 28800
+
+ MAD launches a Docker container with the name
+ ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+ model are collected in ``~/MAD/perf.csv``.
+
+ {% endfor %}
+ {% endfor %}
+
+ .. tab-item:: Standalone benchmarking
+
+ .. rubric:: Download the Docker image and required packages
+
+ 1. Use the following command to pull the Docker image from Docker Hub.
+
+ .. code-block:: shell
+
+ docker pull {{ unified_docker.pull_tag }}
+
+ 2. Run the Docker container.
+
+ .. code-block:: shell
+
+ docker run -it \
+ --device /dev/dri \
+ --device /dev/kfd \
+ --network host \
+ --ipc host \
+ --group-add video \
+ --cap-add SYS_PTRACE \
+ --security-opt seccomp=unconfined \
+ --privileged \
+ -v $HOME:$HOME \
+ -v $HOME/.ssh:/root/.ssh \
+ --shm-size 64G \
+ --name training_env \
+ {{ unified_docker.pull_tag }}
+
+ Use these commands if you exit the ``training_env`` container and need to return to it.
+
+ .. code-block:: shell
+
+ docker start training_env
+ docker exec -it training_env bash
+
+ 3. In the Docker container, clone the ``__
+ repository and navigate to the benchmark scripts directory
+ ``/workspace/MAD/scripts/pytorch_train``.
+
+ .. code-block:: shell
+
+ git clone https://github.com/ROCm/MAD
+ cd MAD/scripts/pytorch_train
+
+ .. rubric:: Prepare training datasets and dependencies
+
+ 1. The following benchmarking examples require downloading models and datasets
+ from Hugging Face. To ensure successful access to gated repos, set your
+ ``HF_TOKEN``.
+
+ .. code-block:: shell
+
+ export HF_TOKEN=$your_personal_hugging_face_access_token
+
+ 2. Run the setup script to install libraries and datasets needed for benchmarking.
+
+ .. code-block:: shell
+
+ ./pytorch_benchmark_setup.sh
+
+ .. container:: model-doc pyt_train_llama-3.1-8b
+
+ ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Library
+ - Reference
+
+ * - ``accelerate``
+ - `Hugging Face Accelerate `_
+
+ * - ``datasets``
+ - `Hugging Face Datasets `_ 3.2.0
+
+ .. container:: model-doc pyt_train_llama-3.1-70b
+
+ ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Library
+ - Reference
+
+ * - ``datasets``
+ - `Hugging Face Datasets `_ 3.2.0
+
+ * - ``torchdata``
+ - `TorchData `_
+
+ * - ``tomli``
+ - `Tomli `_
+
+ * - ``tiktoken``
+ - `tiktoken `_
+
+ * - ``blobfile``
+ - `blobfile `_
+
+ * - ``tabulate``
+ - `tabulate `_
+
+ * - ``wandb``
+ - `Weights & Biases `_
+
+ * - ``sentencepiece``
+ - `SentencePiece `_ 0.2.0
+
+ * - ``tensorboard``
+ - `TensorBoard `_ 2.18.0
+
+ .. container:: model-doc pyt_train_flux
+
+ ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Library
+ - Reference
+
+ * - ``accelerate``
+ - `Hugging Face Accelerate `_
+
+ * - ``datasets``
+ - `Hugging Face Datasets `_ 3.2.0
+
+ * - ``sentencepiece``
+ - `SentencePiece `_ 0.2.0
+
+ * - ``tensorboard``
+ - `TensorBoard `_ 2.18.0
+
+ * - ``csvkit``
+ - `csvkit `_ 2.0.1
+
+ * - ``deepspeed``
+ - `DeepSpeed `_ 0.16.2
+
+ * - ``diffusers``
+ - `Hugging Face Diffusers `_ 0.31.0
+
+ * - ``GitPython``
+ - `GitPython `_ 3.1.44
+
+ * - ``opencv-python-headless``
+ - `opencv-python-headless `_ 4.10.0.84
+
+ * - ``peft``
+ - `PEFT `_ 0.14.0
+
+ * - ``protobuf``
+ - `Protocol Buffers `_ 5.29.2
+
+ * - ``pytest``
+ - `PyTest `_ 8.3.4
+
+ * - ``python-dotenv``
+ - `python-dotenv `_ 1.0.1
+
+ * - ``seaborn``
+ - `Seaborn `_ 0.13.2
+
+ * - ``transformers``
+ - `Transformers `_ 4.47.0
+
+ ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+
+ * `bghira/pseudo-camera-10k `_
+
+ {% for model_group in model_groups %}
+ {% for model in model_group.models %}
+ {% set training_modes = model.training_modes %}
+ {% set training_mode_descs = {
+ "pretrain": "Benchmark pre-training.",
+ "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
+ } %}
+ {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
+ {% if available_modes %}
+
+ .. container:: model-doc {{ model.mad_tag }}
+
+ .. rubric:: Pre-training
+
+ To start the pre-training benchmark, use the following command with the
+ appropriate options. See the following list of options and their descriptions.
+
+ .. code-block:: shell
+
+ ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+ -m {{ model.model_repo }} \
+ -p $datatype \
+ -s $sequence_length
+
+ {% if model.mad_tag == "pyt_train_flux" %}
+ .. container:: model-doc {{ model.mad_tag }}
+
+ .. note::
+
+ Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
+ To use FLUX, refer to the previous version of the ``pytorch-training`` Docker image: :doc:`pytorch-training-v25.6`.
+
+ Occasionally, downloading the FLUX dataset might fail. In that event,
+ manually download it from Hugging Face at
+ `black-forest-labs/FLUX.1-dev `_
+ and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+ the required dataset.
+ {% endif %}
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Name
+ - Options
+ - Description
+
+ {% for mode in available_modes %}
+ * - {% if loop.first %}``$training_mode``{% endif %}
+ - ``{{ mode }}``
+ - {{ training_mode_descs[mode] }}
+ {% endfor %}
+
+ * - ``$datatype``
+ - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+ - Only Llama 3.1 8B supports FP8 precision.
+
+ * - ``$sequence_length``
+ - Between 2048 and 8192. 8192 by default.
+ - Sequence length for the language model.
+ {% endif %}
+
+ {% set training_mode_descs = {
+ "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+ "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+ "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+ "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+ } %}
+ {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+ {% if available_modes %}
+ .. container:: model-doc {{ model.mad_tag }}
+
+ .. rubric:: Fine-tuning
+
+ To start the fine-tuning benchmark, use the following command with the
+ appropriate options. See the following list of options and their descriptions.
+ See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v257>`.
+
+ .. code-block:: shell
+
+ ./pytorch_benchmark_report.sh -t $training_mode \
+ -m {{ model.model_repo }} \
+ -p $datatype \
+ -s $sequence_length
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Name
+ - Options
+ - Description
+
+ {% for mode in available_modes %}
+ * - {% if loop.first %}``$training_mode``{% endif %}
+ - ``{{ mode }}``
+ - {{ training_mode_descs[mode] }}
+ {% endfor %}
+
+ * - ``$datatype``
+ - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+ - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
+
+ * - ``$sequence_length``
+ - Between 2048 and 16384.
+ - Sequence length for the language model.
+
+ {% if model.mad_tag in ["pyt_train_llama-3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
+ .. note::
+
+ For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+ use the following torchtune commit for compatibility:
+
+ .. code-block:: shell
+
+ git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+ {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+ .. note::
+
+ You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+ input tensor should be smaller than max_seq_len (4096)``.
+ This error indicates that an input sequence is longer than the model's maximum context window.
+
+ Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+ tokens in this case). You can resolve this by truncating the input or splitting
+ it into smaller chunks before passing it to the model.
+
+ Note on reproducibility: The results in this guide are based on
+ commit ``b4c98ac`` from the upstream
+ ``__ repository. For the
+ latest updates, you can use the main branch.
+
+ {% endif %}
+ {% endif %}
+ {% endfor %}
+ {% endfor %}
+
+ .. rubric:: Benchmarking examples
+
+ For examples of benchmarking commands, see ``__.
+
+Multi-node training
+-------------------
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+ # In the MAD repository
+ cd scripts/pytorch_train
+ sbatch run_slurm_train.sh
+
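+You can monitor the submitted job with standard Slurm tooling, for example:
+
+.. code-block:: shell
+
+   # Check the job queue, then follow Slurm's default output file
+   # (unless the script redirects its logs elsewhere)
+   squeue -u $USER
+   tail -f slurm-<job_id>.out
+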
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+ huggingface-cli login # Get access to HF Llama model space
+ huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
+ # In the MAD repository
+ cd scripts/pytorch_train
+ sbatch Torchtune_Multinode.sh
+
+.. note::
+
+ Information regarding benchmark setup:
+
+ * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
+ * You can adjust the torchtune `YAML configuration file
+ `__
+ if you're using a different model.
+ * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
+ * Set the ``mounting_paths`` inside the SLURM script.
+
+Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the MAD usage guide.
+
+- To learn more about system settings and management practices to configure your system for
+ AMD Instinct MI300X series accelerators, see the AMD Instinct MI300X system optimization guide.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+ AMD Infinity Hub.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
new file mode 100644
index 000000000..51f1ce57e
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
@@ -0,0 +1,305 @@
+.. meta::
+ :description: How to train a model using PyTorch for ROCm.
+ :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+****************************************
+Training a model with Primus and PyTorch
+****************************************
+
+Primus is a unified, flexible LLM training framework that streamlines LLM
+training on AMD Instinct accelerators using a modular, reproducible
+configuration paradigm. Primus now supports the PyTorch torchtitan backend.
+
+.. note::
+
+ Primus with the PyTorch torchtitan backend is intended to supersede the
+ :doc:`ROCm PyTorch training <pytorch-training>` workflow. See
+ :doc:`pytorch-training` for steps to run workloads without Primus.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+ {% set dockers = data.dockers %}
+ {% set docker = dockers[0] %}
+ For ease of use, AMD provides a ready-to-use Docker image -- ``{{
+ docker.pull_tag }}`` -- for MI300X series accelerators containing essential
+ components for Primus and PyTorch training with
+ Primus Turbo optimizations.
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Software component
+ - Version
+
+ {% for component_name, component_version in docker.components.items() %}
+ * - {{ component_name }}
+ - {{ component_version }}
+ {% endfor %}
+
+.. _amd-primus-pytorch-model-support-v258:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+ {% set unified_docker = data.dockers[0] %}
+ {% set model_groups = data.model_groups %}
+ .. raw:: html
+
+ <!-- Model-selection grid: a "Model" header with one button per model
+ group, followed by a grid of buttons for each model in the selected
+ group (three columns when the group's model count is divisible by three). -->
+
+.. seealso::
+
+ For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
+ see :doc:`pytorch-training` (without Primus).
+
+.. _amd-primus-pytorch-performance-measurements-v258:
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the System validation and
+optimization guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended System health
+benchmarks. This suite of tests will help you verify and fine-tune your
+system's configuration.
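+
+As a quick sanity check before launching a training run, you can confirm that
+all accelerators are visible and idle.
+
+.. code-block:: shell
+
+ rocm-smi # Lists each GPU with its utilization, temperature, and VRAM usage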
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+ {% set unified_docker = data.dockers[0] %}
+
+ Pull the Docker image
+ =====================
+
+ Use the following command to pull the `Docker image <{{ unified_docker.docker_hub_url }}>`_ from Docker Hub.
+
+ .. code-block:: shell
+
+ docker pull {{ unified_docker.pull_tag }}
+
+ Run training
+ ============
+
+ {% set model_groups = data.model_groups %}
+
+ Once the setup is complete, choose between the following two workflows to start benchmarking training.
+ For fine-tuning workloads and multi-node training examples, see :doc:`pytorch-training` (without Primus).
+
+ .. tab-set::
+
+ .. tab-item:: MAD-integrated benchmarking
+
+ {% for model_group in model_groups %}
+ {% for model in model_group.models %}
+
+ .. container:: model-doc {{ model.mad_tag }}
+
+ The following run command is tailored to {{ model.model }}.
+ See :ref:`amd-primus-pytorch-model-support-v258` to switch to another available model.
+
+ 1. Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
+ directory and install the required packages on the host machine.
+
+ .. code-block:: shell
+
+ git clone https://github.com/ROCm/MAD
+ cd MAD
+ pip install -r requirements.txt
+
+ 2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+ using one node with the {{ model.precision }} data type on the host machine.
+
+ .. code-block:: shell
+
+ export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+ madengine run \
+ --tags {{ model.mad_tag }} \
+ --keep-model-dir \
+ --live-output \
+ --timeout 28800
+
+ MAD launches a Docker container with the name
+ ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+ model are collected in ``~/MAD/perf.csv``.
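+
+ For a quick look at the results from the host, you can align the CSV
+ columns for readability.
+
+ .. code-block:: shell
+
+ column -s, -t ~/MAD/perf.csv | less -S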
+
+ {% endfor %}
+ {% endfor %}
+
+ .. tab-item:: Standalone benchmarking
+
+ {% for model_group in model_groups %}
+ {% for model in model_group.models %}
+
+ .. container:: model-doc {{ model.mad_tag }}
+
+ The following run commands are tailored to {{ model.model }}.
+ See :ref:`amd-primus-pytorch-model-support-v258` to switch to another available model.
+
+ .. rubric:: Download the Docker image and required packages
+
+ 1. Use the following command to pull the Docker image from Docker Hub.
+
+ .. code-block:: shell
+
+ docker pull {{ unified_docker.pull_tag }}
+
+ 2. Run the Docker container.
+
+ .. code-block:: shell
+
+ docker run -it \
+ --device /dev/dri \
+ --device /dev/kfd \
+ --network host \
+ --ipc host \
+ --group-add video \
+ --cap-add SYS_PTRACE \
+ --security-opt seccomp=unconfined \
+ --privileged \
+ -v $HOME:$HOME \
+ -v $HOME/.ssh:/root/.ssh \
+ --shm-size 64G \
+ --name training_env \
+ {{ unified_docker.pull_tag }}
+
+ Use these commands if you exit the ``training_env`` container and need to return to it.
+
+ .. code-block:: shell
+
+ docker start training_env
+ docker exec -it training_env bash
+
+ 3. In the Docker container, clone the `MAD <https://github.com/ROCm/MAD>`__
+ repository and navigate to the benchmark scripts directory
+ ``/workspace/MAD/scripts/pytorch_train``.
+
+ .. code-block:: shell
+
+ git clone https://github.com/ROCm/MAD
+ cd MAD/scripts/pytorch_train
+
+ .. rubric:: Prepare training datasets and dependencies
+
+ 1. The following benchmarking examples require downloading models and datasets
+ from Hugging Face. To ensure successful access to gated repos, set your
+ ``HF_TOKEN``.
+
+ .. code-block:: shell
+
+ export HF_TOKEN=$your_personal_hugging_face_access_token
+
+ 2. Run the setup script to install libraries and datasets needed for benchmarking.
+
+ .. code-block:: shell
+
+ ./pytorch_benchmark_setup.sh
+
+ .. rubric:: Pretraining
+
+ To start the pretraining benchmark, use the following command with the
+ appropriate options. See the following list of options and their descriptions.
+
+ .. code-block:: shell
+
+ ./pytorch_benchmark_report.sh -t pretrain \
+ -m {{ model.model_repo }} \
+ -p $datatype \
+ -s $sequence_length
+
+
+ .. list-table::
+ :header-rows: 1
+
+ * - Name
+ - Options
+ - Description
+
+ {% for mode in available_modes %}
+ * - {% if loop.first %}``$training_mode``{% endif %}
+ - ``{{ mode }}``
+ - {{ training_mode_descs[mode] }}
+ {% endfor %}
+
+ * - ``$datatype``
+ - ``BF16``{% if model.mad_tag == "primus_pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+ - All models support BF16. Currently, only Llama 3.1 8B supports FP8 precision.
+
+ * - ``$sequence_length``
+ - Between 2048 and 8192. The default is 8192.
+ - Sequence length for the language model.
+
+ .. rubric:: Benchmarking examples
+
+ Use the following command to train {{ model.model }} with BF16 precision using Primus torchtitan.
+
+ .. code-block:: shell
+
+ ./pytorch_benchmark_report.sh -m {{ model.model_repo }}
+
+ To train {{ model.model }} with FP8 precision, use the following command.
+
+ .. code-block:: shell
+
+ ./pytorch_benchmark_report.sh -m {{ model.model_repo }} -p FP8
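+
+ To override the default sequence length, pass ``-s`` with a value in the
+ supported range. For example, the following trains with BF16 and a
+ 4096-token sequence length.
+
+ .. code-block:: shell
+
+ ./pytorch_benchmark_report.sh -m {{ model.model_repo }} -p BF16 -s 4096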
+ {% endfor %}
+ {% endfor %}
+
+Further reading
+===============
+
+- For an introduction to Primus, see the blog post *Primus: A Lightweight, Unified
+ Training Framework for Large Models on AMD GPUs*.
+
+- To learn more about MAD and the ``madengine`` CLI, see the MAD usage guide.
+
+- To learn more about system settings and management practices to configure your system for
+ AMD Instinct MI300X series accelerators, see the AMD Instinct MI300X system optimization guide.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+ AMD Infinity Hub.
+
+Previous versions
+=================
+
+See :doc:`previous-versions/pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
index d8ab01318..f2e52fc65 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -1,11 +1,18 @@
+:orphan:
+
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
-Training a model with PyTorch for ROCm
+Training a model with PyTorch on ROCm
**************************************
+.. note::
+
+ Primus with the PyTorch torchtitan backend is intended to supersede this ROCm PyTorch training workflow.
+ See :doc:`primus-pytorch` for details.
+
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
@@ -49,7 +56,7 @@ vary by model -- select one to get started.
Model
{% for model_group in model_groups %}
-
{{ model_group.group }}
+
{{ model_group.group }}
{% endfor %}
@@ -87,9 +94,11 @@ vary by model -- select one to get started.
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
+ {% if model.training_modes %}
* - {{ model.model }}
- ``{{ model.training_modes | join('``, ``') }}``
+ {% endif %}
{% endfor %}
{% endfor %}
@@ -152,20 +161,23 @@ Run training
.. tab-item:: MAD-integrated benchmarking
- 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local
- directory and install the required packages on the host machine.
-
- .. code-block:: shell
-
- git clone https://github.com/ROCm/MAD
- cd MAD
- pip install -r requirements.txt
-
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
+ The following run command is tailored to {{ model.model }}.
+ See :ref:`amd-pytorch-training-model-support` to switch to another available model.
+
+ 1. Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
+ directory and install the required packages on the host machine.
+
+ .. code-block:: shell
+
+ git clone https://github.com/ROCm/MAD
+ cd MAD
+ pip install -r requirements.txt
+
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one node with the {{ model.precision }} data type on the host machine.
@@ -187,6 +199,17 @@ Run training
.. tab-item:: Standalone benchmarking
+ {% for model_group in model_groups %}
+ {% for model in model_group.models %}
+
+ .. container:: model-doc {{ model.mad_tag }}
+
+ The following commands are tailored to {{ model.model }}.
+ See :ref:`amd-pytorch-training-model-support` to switch to another available model.
+
+ {% endfor %}
+ {% endfor %}
+
.. rubric:: Download the Docker image and required packages
1. Use the following command to pull the Docker image from Docker Hub.
@@ -388,7 +411,7 @@ Run training
.. note::
Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
- To use FLUX, refer to the previous version of the ``pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`
+ To use FLUX, refer to the ``rocm/pytorch-training`` Docker image documented in :doc:`previous-versions/pytorch-training-v25.6`.
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index b8cbd0e31..e15c1039f 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -12,14 +12,14 @@ subtrees:
- file: compatibility/compatibility-matrix.rst
title: Compatibility matrix
entries:
- - url: https://rocm.docs.amd.com/projects/install-on-linux-internal/en/latest/reference/system-requirements.html
+ - url: https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html
title: Linux system requirements
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
title: Windows system requirements
- caption: Install
entries:
- - url: https://rocm.docs.amd.com/projects/install-on-linux-internal/en/latest/
+ - url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/
title: ROCm on Linux
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/latest/
title: HIP SDK on Windows
@@ -67,9 +67,9 @@ subtrees:
subtrees:
- entries:
- file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
- title: Train a model with Primus and Megatron-Core
- - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
- title: Train a model with PyTorch
+ title: Train a model with Primus and Megatron-LM
+ - file: how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
+ title: Train a model with Primus and PyTorch
- file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
title: Train a model with JAX MaxText
- file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
@@ -106,6 +106,8 @@ subtrees:
title: PyTorch inference performance testing
- file: how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
title: SGLang inference performance testing
+ - file: how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
+ title: SGLang distributed inference with Mooncake
- file: how-to/rocm-for-ai/inference/deploy-your-model.rst
title: Deploy your model
diff --git a/docs/sphinx/static/css/vllm-benchmark.css b/docs/sphinx/static/css/vllm-benchmark.css
index 231bb2cac..7fde027ea 100644
--- a/docs/sphinx/static/css/vllm-benchmark.css
+++ b/docs/sphinx/static/css/vllm-benchmark.css
@@ -60,7 +60,7 @@ div[data-param-k="model-group"][data-param-state="disabled"] {
.model-param-head {
background-color: var(--compat-head-color);
padding: 0.15rem 0.15rem 0.15rem 0.67rem;
- border-right: solid 4px var(--compat-accent-color);
+ border-right: solid 3px var(--compat-accent-color);
font-weight: 600;
}
diff --git a/manifest_700.xml b/manifest_700.xml
deleted file mode 100644
index 4f7d505d8..000000000
--- a/manifest_700.xml
+++ /dev/null
@@ -1,80 +0,0 @@
diff --git a/tools/rocm-build/rocm-7.0.0.xml b/tools/rocm-build/rocm-7.0.0.xml
new file mode 100644
index 000000000..bc23b4e58
--- /dev/null
+++ b/tools/rocm-build/rocm-7.0.0.xml
@@ -0,0 +1,70 @@