mirror of https://github.com/ROCm/ROCm.git (synced 2026-02-01 09:55:00 -05:00)

Compare commits: 1 commit

| Author | SHA1 | Date |
|---|---|---|
|  | 69360270f5 |  |

@@ -39,6 +39,7 @@ additional licenses. Please review individual repositories for more information.
 | [hipBLASLt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/LICENSE.md) |
 | [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
 | [hipCUB](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub/) | [Custom](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipcub/LICENSE.txt) |
+| [hipDNN](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipdnn/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipdnn/LICENSE.md) |
 | [hipFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/LICENSE.md) |
 | [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
 | [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |

@@ -138,14 +138,12 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.8", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.11", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.8", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.11", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},

@@ -156,12 +154,10 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.8", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.11", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.8", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.11", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},

@@ -1,13 +1,15 @@
 docker:
-  pull_tag: rocm/primus:v26.1
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
 components:
   ROCm: 7.1.0
+  Primus: 0.3.0
+  Primus Turbo: 0.1.1
   PyTorch: 2.10.0.dev20251112+rocm7.1
   Python: "3.10"
-  Transformer Engine: 2.6.0.dev0+f141f34b
+  Transformer Engine: 2.4.0.dev0+32e2d1d4
   Flash Attention: 2.8.3
-  hipBLASLt: 34459f66ea
+  hipBLASLt: 1.2.0-09ab7153e2
   Triton: 3.4.0
   RCCL: 2.27.7
 model_groups:

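The hunk above swaps the pinned image and component set for this data file. A quick way to check that a pulled image actually matches the pinned components is to query versions inside the container. This is a minimal sketch, not part of the diff, assuming the `rocm/primus:v25.10` tag from the new side of the hunk and that `python` is on the container's PATH:

```shell
# Illustrative spot-check of pinned components (assumptions noted per line).
docker run --rm rocm/primus:v25.10 /bin/bash -c '
  python -c "import torch; print(\"PyTorch:\", torch.__version__)"  # expect 2.10.0.dev20251112+rocm7.1
  python --version                                                  # expect Python 3.10
'
```
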
@@ -1,47 +0,0 @@
-docker:
-  pull_tag: rocm/primus:v25.11
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3
-components:
-  ROCm: 7.1.0
-  PyTorch: 2.10.0.dev20251112+rocm7.1
-  Python: "3.10"
-  Transformer Engine: 2.4.0.dev0+32e2d1d4
-  Flash Attention: 2.8.3
-  hipBLASLt: 1.2.0-09ab7153e2
-  Triton: 3.4.0
-  RCCL: 2.27.7
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
-      - model: Llama 3.1 8B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
-      - model: Llama 3.1 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
-      - model: Llama 2 7B
-        mad_tag: pyt_megatron_lm_train_llama-2-7b
-      - model: Llama 2 70B
-        mad_tag: pyt_megatron_lm_train_llama-2-70b
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
-      - model: DeepSeek-V2-Lite
-        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-72b

@@ -1,58 +0,0 @@
-docker:
-  pull_tag: rocm/primus:v25.11
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3
-components:
-  ROCm: 7.1.0
-  PyTorch: 2.10.0.dev20251112+rocm7.1
-  Python: "3.10"
-  Transformer Engine: 2.4.0.dev0+32e2d1d4
-  Flash Attention: 2.8.3
-  hipBLASLt: 1.2.0-09ab7153e2
-  Triton: 3.4.0
-  RCCL: 2.27.7
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
-        config_name: llama3.3_70B-pretrain.yaml
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
-        config_name: llama3.1_70B-pretrain.yaml
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
-        config_name: llama3.1_8B-pretrain.yaml
-      - model: Llama 2 7B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
-        config_name: llama2_7B-pretrain.yaml
-      - model: Llama 2 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
-        config_name: llama2_70B-pretrain.yaml
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
-        config_name: deepseek_v3-pretrain.yaml
-      - model: DeepSeek-V2-Lite
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
-        config_name: deepseek_v2_lite-pretrain.yaml
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
-        config_name: mixtral_8x7B_v0.1-pretrain.yaml
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
-        config_name: mixtral_8x22B_v0.1-pretrain.yaml
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
-        config_name: primus_qwen2.5_7B-pretrain.yaml
-      - model: Qwen 2.5 72B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
-        config_name: qwen2.5_72B-pretrain.yaml

@@ -1,32 +0,0 @@
-docker:
-  pull_tag: rocm/primus:v25.11
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3
-components:
-  ROCm: 7.1.0
-  PyTorch: 2.10.0.dev20251112+rocm7.1
-  Python: "3.10"
-  Transformer Engine: 2.4.0.dev0+32e2d1d4
-  Flash Attention: 2.8.3
-  hipBLASLt: 1.2.0-09ab7153e2
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: BF16
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B
-        precision: BF16
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek V3 16B
-        mad_tag: primus_pyt_train_deepseek-v3-16b
-        model_repo: DeepSeek-V3
-        url: https://huggingface.co/deepseek-ai/DeepSeek-V3
-        precision: BF16

@@ -1,195 +0,0 @@
-docker:
-  pull_tag: rocm/primus:v25.11
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3
-components:
-  ROCm: 7.1.0
-  PyTorch: 2.10.0.dev20251112+rocm7.1
-  Python: "3.10"
-  Transformer Engine: 2.4.0.dev0+32e2d1d4
-  Flash Attention: 2.8.3
-  hipBLASLt: 1.2.0-09ab7153e2
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 4 Scout 17B-16E
-        mad_tag: pyt_train_llama-4-scout-17b-16e
-        model_repo: Llama-4-17B_16E
-        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 3.3 70B
-        mad_tag: pyt_train_llama-3.3-70b
-        model_repo: Llama-3.3-70B
-        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-      - model: Llama 3.2 1B
-        mad_tag: pyt_train_llama-3.2-1b
-        model_repo: Llama-3.2-1B
-        url: https://huggingface.co/meta-llama/Llama-3.2-1B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 3.2 3B
-        mad_tag: pyt_train_llama-3.2-3b
-        model_repo: Llama-3.2-3B
-        url: https://huggingface.co/meta-llama/Llama-3.2-3B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 3.2 Vision 11B
-        mad_tag: pyt_train_llama-3.2-vision-11b
-        model_repo: Llama-3.2-Vision-11B
-        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
-        precision: BF16
-        training_modes: [finetune_fw]
-      - model: Llama 3.2 Vision 90B
-        mad_tag: pyt_train_llama-3.2-vision-90b
-        model_repo: Llama-3.2-Vision-90B
-        url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
-        precision: BF16
-        training_modes: [finetune_fw]
-      - model: Llama 3.1 8B
-        mad_tag: pyt_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: BF16
-        training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
-      - model: Llama 3.1 70B
-        mad_tag: pyt_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-        precision: BF16
-        training_modes: [pretrain, finetune_fw, finetune_lora]
-      - model: Llama 3.1 405B
-        mad_tag: pyt_train_llama-3.1-405b
-        model_repo: Llama-3.1-405B
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B
-        precision: BF16
-        training_modes: [finetune_qlora]
-      - model: Llama 3 8B
-        mad_tag: pyt_train_llama-3-8b
-        model_repo: Llama-3-8B
-        url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 3 70B
-        mad_tag: pyt_train_llama-3-70b
-        model_repo: Llama-3-70B
-        url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 2 7B
-        mad_tag: pyt_train_llama-2-7b
-        model_repo: Llama-2-7B
-        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-      - model: Llama 2 13B
-        mad_tag: pyt_train_llama-2-13b
-        model_repo: Llama-2-13B
-        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 2 70B
-        mad_tag: pyt_train_llama-2-70b
-        model_repo: Llama-2-70B
-        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-        precision: BF16
-        training_modes: [finetune_lora, finetune_qlora]
-  - group: OpenAI
-    tag: openai
-    models:
-      - model: GPT OSS 20B
-        mad_tag: pyt_train_gpt_oss_20b
-        model_repo: GPT-OSS-20B
-        url: https://huggingface.co/openai/gpt-oss-20b
-        precision: BF16
-        training_modes: [HF_finetune_lora]
-      - model: GPT OSS 120B
-        mad_tag: pyt_train_gpt_oss_120b
-        model_repo: GPT-OSS-120B
-        url: https://huggingface.co/openai/gpt-oss-120b
-        precision: BF16
-        training_modes: [HF_finetune_lora]
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek V2 16B
-        mad_tag: primus_pyt_train_deepseek-v2
-        model_repo: DeepSeek-V2
-        url: https://huggingface.co/deepseek-ai/DeepSeek-V2
-        precision: BF16
-        training_modes: [pretrain]
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 3 8B
-        mad_tag: pyt_train_qwen3-8b
-        model_repo: Qwen3-8B
-        url: https://huggingface.co/Qwen/Qwen3-8B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Qwen 3 32B
-        mad_tag: pyt_train_qwen3-32b
-        model_repo: Qwen3-32
-        url: https://huggingface.co/Qwen/Qwen3-32B
-        precision: BF16
-        training_modes: [finetune_lora]
-      - model: Qwen 2.5 32B
-        mad_tag: pyt_train_qwen2.5-32b
-        model_repo: Qwen2.5-32B
-        url: https://huggingface.co/Qwen/Qwen2.5-32B
-        precision: BF16
-        training_modes: [finetune_lora]
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_train_qwen2.5-72b
-        model_repo: Qwen2.5-72B
-        url: https://huggingface.co/Qwen/Qwen2.5-72B
-        precision: BF16
-        training_modes: [finetune_lora]
-      - model: Qwen 2 1.5B
-        mad_tag: pyt_train_qwen2-1.5b
-        model_repo: Qwen2-1.5B
-        url: https://huggingface.co/Qwen/Qwen2-1.5B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Qwen 2 7B
-        mad_tag: pyt_train_qwen2-7b
-        model_repo: Qwen2-7B
-        url: https://huggingface.co/Qwen/Qwen2-7B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-  - group: Stable Diffusion
-    tag: sd
-    models:
-      - model: Stable Diffusion XL
-        mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
-        model_repo: SDXL
-        url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
-        precision: BF16
-        training_modes: [posttrain]
-  - group: Flux
-    tag: flux
-    models:
-      - model: FLUX.1-dev
-        mad_tag: pyt_train_flux
-        model_repo: Flux
-        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-        precision: BF16
-        training_modes: [posttrain]
-  - group: NCF
-    tag: ncf
-    models:
-      - model: NCF
-        mad_tag: pyt_ncf_training
-        model_repo:
-        url: https://github.com/ROCm/FluxBenchmark
-        precision: FP32
-  - group: DLRM
-    tag: dlrm
-    models:
-      - model: DLRM v2
-        mad_tag: pyt_train_dlrm
-        model_repo: DLRM
-        url: https://github.com/AMD-AGI/DLRMBenchmark
-        training_modes: [pretrain]

@@ -1,13 +1,13 @@
 docker:
-  pull_tag: rocm/primus:v26.1
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
+  pull_tag: rocm/primus:v25.11
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
 components:
   ROCm: 7.1.0
   PyTorch: 2.10.0.dev20251112+rocm7.1
   Python: "3.10"
-  Transformer Engine: 2.6.0.dev0+f141f34b
+  Transformer Engine: 2.4.0.dev0+32e2d1d4
   Flash Attention: 2.8.3
-  hipBLASLt: 34459f66ea
+  hipBLASLt: 1.2.0-09ab7153e2
   Triton: 3.4.0
   RCCL: 2.27.7
 model_groups:

@@ -1,13 +1,13 @@
 docker:
-  pull_tag: rocm/primus:v26.1
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
+  pull_tag: rocm/primus:v25.11
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
 components:
   ROCm: 7.1.0
   PyTorch: 2.10.0.dev20251112+rocm7.1
   Python: "3.10"
-  Transformer Engine: 2.6.0.dev0+f141f34b
+  Transformer Engine: 2.4.0.dev0+32e2d1d4
   Flash Attention: 2.8.3
-  hipBLASLt: 34459f66ea
+  hipBLASLt: 1.2.0-09ab7153e2
 model_groups:
   - group: Meta Llama
     tag: llama

@@ -1,13 +1,15 @@
 docker:
-  pull_tag: rocm/primus:v26.1
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
 components:
   ROCm: 7.1.0
+  Primus: 0.3.0
+  Primus Turbo: 0.1.1
   PyTorch: 2.10.0.dev20251112+rocm7.1
   Python: "3.10"
-  Transformer Engine: 2.6.0.dev0+f141f34b
+  Transformer Engine: 2.4.0.dev0+32e2d1d4
   Flash Attention: 2.8.3
-  hipBLASLt: 34459f66ea
+  hipBLASLt: 1.2.0-09ab7153e2
 model_groups:
   - group: Meta Llama
     tag: llama

@@ -52,7 +52,7 @@ accelerate training workloads:
              - {{ component_version }}
            {% endfor %}

-.. _amd-megatron-lm-model-support-v26.01:
+.. _amd-megatron-lm-model-support-v25.11:

 Supported models
 ================

@@ -97,7 +97,7 @@ accelerate training workloads:
    Some models, such as Llama, require an external license agreement through
    a third party (for example, Meta).

-.. _amd-megatron-lm-performance-measurements-v26.01:
+.. _amd-megatron-lm-performance-measurements-v25.11:

 Performance measurements
 ========================

@@ -129,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.

-.. _mi300x-amd-megatron-lm-training-v26.01:
+.. _mi300x-amd-megatron-lm-training-v25.11:

 Environment setup
 =================

@@ -138,7 +138,7 @@ Use the following instructions to set up the environment, configure the script t
 reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
 image.

-.. _amd-megatron-lm-requirements-v26.01:
+.. _amd-megatron-lm-requirements-v25.11:

 Download the Docker image
 -------------------------

@@ -190,7 +190,7 @@ Download the Docker image
    The Docker container hosts a verified commit of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.

-.. _amd-megatron-lm-environment-setup-v26.01:
+.. _amd-megatron-lm-environment-setup-v25.11:

 Configuration
 =============

@@ -200,39 +200,39 @@ Configuration
    Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b

    Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy

    Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b

    Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy

    Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. note::

-   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v26.01>` for more information on configuration options.
+   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v25.11>` for more information on configuration options.

 Multi-node configuration
 ------------------------

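As the context lines above note, options for these `train_*.sh` scripts can be passed on the command line. A minimal sketch of that invocation style, using the two variables that appear verbatim later in this diff (`RECOMPUTE_ACTIVATIONS`, `CKPT_FORMAT`); any other knobs are listed under the guide's Key options section:

```shell
# Override selected options per run instead of editing the script.
RECOMPUTE_ACTIVATIONS=full \
CKPT_FORMAT=torch_dist \
bash examples/llama/train_llama3.sh
```
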
@@ -240,7 +240,7 @@ Multi-node configuration
 Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
 training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.

-.. _amd-megatron-lm-tokenizer-v26.01:
+.. _amd-megatron-lm-tokenizer-v25.11:

 Tokenizer
 ---------

@@ -377,7 +377,7 @@ Download the dataset

    ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
    Remember to either pre-download the tokenizer or setup Hugging Face access
-   otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v26.01>` section.
+   otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v25.11>` section.

 .. note::

@@ -479,13 +479,13 @@ Download the dataset

    Ensure that the files are accessible inside the Docker container.

-.. _amd-megatron-lm-run-training-v26.01:
+.. _amd-megatron-lm-run-training-v25.11:

 Run training
 ============

 Use the following example commands to set up the environment, configure
-:ref:`key options <amd-megatron-lm-benchmark-test-vars-v26.01>`, and run training on
+:ref:`key options <amd-megatron-lm-benchmark-test-vars-v25.11>`, and run training on
 MI300X Series GPUs with the AMD Megatron-LM environment.

 Before starting training, export the following environment variables.

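The exact export list referenced by "export the following environment variables" was not captured in this diff view. As one hedged example only, gated Hugging Face models (see the Tokenizer hunk above) typically need an access token exported before launch:

```shell
# Assumption: HF_TOKEN is one such variable; consult the guide for the full list.
export HF_TOKEN="<your Hugging Face access token>"
```
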
@@ -920,7 +920,7 @@ Single node training
          RECOMPUTE_ACTIVATIONS=full \
          CKPT_FORMAT=torch_dist

-.. _amd-megatron-lm-multi-node-examples-v26.01:
+.. _amd-megatron-lm-multi-node-examples-v25.11:

 Multi-node training examples
 ----------------------------

@@ -971,7 +971,7 @@ training on 16 nodes, try the following command:

    sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh

-.. _amd-megatron-lm-benchmark-test-vars-v26.01:
+.. _amd-megatron-lm-benchmark-test-vars-v25.11:

 Key options
 -----------

@@ -16,23 +16,14 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
      - Components
      - Resources

-   * - v26.1 (latest)
+   * - v25.11
      -
        * ROCm 7.1.0
        * PyTorch 2.10.0.dev20251112+rocm7.1
      -
        * :doc:`Primus Megatron documentation <../primus-megatron>`
        * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d>`__
+       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__

-   * - v25.11
-     -
-       * ROCm 7.1.0
-       * PyTorch 2.10.0.dev20251112+rocm7.1
-     -
-       * :doc:`Primus Megatron documentation <primus-megatron-v25.11>`
-       * :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.10>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3>`__
-
    * - v25.10
      -

@@ -37,7 +37,7 @@ GPUs containing essential components, including PyTorch, ROCm libraries, and
 Megatron-LM utilities. It contains the following software components to
 accelerate training workloads:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml

    .. tab-set::

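Both sides of this hunk point `datatemplate:yaml` at a data file whose structure matches the YAML hunks earlier in this commit (`docker` and `model_groups` keys). A hedged sketch for sanity-checking the new-side data file from a checkout; the `docs/` prefix is an assumption about where the `/data/...` path is rooted, and PyYAML must be installed:

```shell
# Print what the datatemplate directive will see (path prefix is an assumption).
python3 -c '
import yaml
d = yaml.safe_load(open("docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml"))
print("image:", d["docker"]["pull_tag"])
for g in d["model_groups"]:
    print(g["group"], "->", [m["model"] for m in g["models"]])
'
```
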
@@ -146,7 +146,7 @@ image.
 Download the Docker image
 -------------------------

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml

    {% set docker = data.docker %}
    1. Use the following command to pull the Docker image from Docker Hub.

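The rendered step 1 resolves `{{ docker.pull_tag }}` from the data file named in the hunk. Assuming this data file is the one whose new side sets `pull_tag: rocm/primus:v25.11` earlier in this commit, the pull command would be:

```shell
# Tag taken from the new side of a YAML hunk above; verify against the data file.
docker pull rocm/primus:v25.11
```
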
@@ -811,7 +811,7 @@ Single node training
       Note that DeepSeek-V2-Lite is experiencing instability due to GPU memory access fault
       for large iterations.
       For stability, it's recommended to use Primus for this workload.
-      See :doc:`../primus-megatron`.
+      See :doc:`primus-megatron`.

 .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b

(File diff suppressed because it is too large.)

@@ -25,10 +25,10 @@ model training. Performance acceleration is powered by `Primus Turbo
    <https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
    deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
    The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
-   including Megatron-LM and :doc:`torchtitan <../primus-pytorch>`.
+   including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.

 Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM
-training <../megatron-lm>` workflow. To learn how to migrate workloads from
+training <megatron-lm>` workflow. To learn how to migrate workloads from
 Megatron-LM to Primus with Megatron, see
 :doc:`megatron-lm-primus-migration-guide`.

@@ -36,7 +36,7 @@ AMD provides a ready-to-use Docker images for MI355X, MI350X,
 MI325X, and MI300X GPUs containing essential components for Primus, ROCm, and
 Megatron-LM.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    .. tab-set::

@@ -63,7 +63,7 @@ The following models are pre-optimized for performance on AMD Instinct GPUs.
 Some instructions, commands, and training examples in this documentation
 might vary by model -- select one to get started.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    {% set model_groups = data.model_groups %}
    .. raw:: html

@@ -120,7 +120,7 @@ system's configuration.
 Environment setup
 =================

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    Use the following instructions to set up the environment, configure the script to train models, and
    reproduce the benchmark results on AMD Instinct GPUs.

@@ -129,7 +129,7 @@ Environment setup

 Pull the Docker image

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    {% set docker = data.docker %}

@@ -175,7 +175,7 @@ Configuration
 Primus defines a training configuration in YAML for each model in
 `examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    {% set model_groups = data.model_groups %}
    {% for model_group in model_groups %}

@@ -805,7 +805,7 @@ To run training on multiple nodes, you can use the
 `run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/main/examples/run_slurm_pretrain.sh>`__
 to launch the multi-node workload. Use the following steps to setup your environment:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    {% set docker = data.docker %}
    .. code-block:: shell

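A hedged sketch of the multi-node launch this hunk describes. Using `EXP` to select a config from `examples/megatron/configs` follows Primus convention and the config names appear elsewhere in this commit, but check `run_slurm_pretrain.sh` for the variables it actually reads:

```shell
# Hypothetical invocation; EXP and the config path are assumptions.
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
sbatch examples/run_slurm_pretrain.sh
```
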
(File diff suppressed because it is too large.)

@@ -24,17 +24,17 @@ Primus now supports the PyTorch torchtitan backend.
    <https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
    deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
    The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
-   including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
+   including torchtitan and :doc:`Megatron-LM <primus-megatron>`.

 Primus with the PyTorch torchtitan backend is designed to replace the
-:doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
-:doc:`../pytorch-training` to see steps to run workloads without Primus.
+:doc:`ROCm PyTorch training <pytorch-training>` workflow. See
+:doc:`pytorch-training` to see steps to run workloads without Primus.

 AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
 MI300X GPUs containing essential components for Primus and PyTorch training
 with Primus Turbo optimizations.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml

    .. tab-set::

@@ -61,7 +61,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
 Some instructions, commands, and training recommendations in this documentation might
 vary by model -- select one to get started.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml

    {% set model_groups = data.model_groups %}
    .. raw:: html

@@ -96,7 +96,7 @@ vary by model -- select one to get started.
 .. seealso::

    For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
-   see the documentation :doc:`../pytorch-training` (without Primus)
+   see the documentation :doc:`pytorch-training` (without Primus)

 .. _amd-primus-pytorch-performance-measurements-v2510:

@@ -122,7 +122,7 @@ doesn’t test configurations and run conditions outside those described.
 Pull the Docker image
 =====================

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml

    Use the following command to pull the Docker image from Docker Hub.

@@ -134,11 +134,11 @@ Run training
 ============

 Once the setup is complete, choose between the following two workflows to start benchmarking training.
-For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
+For fine-tuning workloads and multi-node training examples, see :doc:`pytorch-training` (without Primus).
 For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
 tweak some configurations (such as batch sizes).

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml

    {% set docker = data.docker %}
    {% set model_groups = data.model_groups %}

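The deleted page below spells out both workflows in full. Condensed from its MAD-integrated tab, the end-to-end benchmark flow looks like this; the `primus_pyt_train_llama-3.1-8b` tag comes from the deleted v25.11 data file earlier in this commit:

```shell
# Condensed from the MAD-integrated workflow in the deleted page below.
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
export MAD_SECRETS_HFTOKEN="<your Hugging Face token for gated models>"
madengine run \
  --tags primus_pyt_train_llama-3.1-8b \
  --keep-model-dir \
  --live-output \
  --timeout 28800
```
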
@@ -1,422 +0,0 @@
|
|||||||
:orphan:
|
|
||||||
|
|
||||||
.. meta::
|
|
||||||
:description: How to train a model using PyTorch for ROCm.
|
|
||||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
|
||||||
|
|
||||||
****************************************
|
|
||||||
Training a model with Primus and PyTorch
|
|
||||||
****************************************
|
|
||||||
|
|
||||||
.. caution::
|
|
||||||
|
|
||||||
This documentation does not reflect the latest version of ROCm Primus PyTorch training
|
|
||||||
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
|
|
||||||
|
|
||||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
|
||||||
LLM training framework designed to streamline training. It streamlines LLM
|
|
||||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
|
||||||
Primus now supports the PyTorch torchtitan backend.
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
|
||||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
|
||||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
|
||||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
|
||||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
|
||||||
|
|
||||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
|
||||||
:doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
|
|
||||||
:doc:`../pytorch-training` to see steps to run workloads without Primus.
|
|
||||||
|
|
||||||
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
|
|
||||||
MI300X GPUs containing essential components for Primus and PyTorch training
|
|
||||||
with Primus Turbo optimizations.
|
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.11-benchmark-models.yaml
|
|
||||||
|
|
||||||
.. tab-set::
|
|
||||||
|
|
||||||
.. tab-item:: {{ data.docker.pull_tag }}
|
|
||||||
:sync: {{ data.docker.pull_tag }}
|
|
||||||
|
|
||||||
.. list-table::
|
|
||||||
:header-rows: 1
|
|
||||||
|
|
||||||
* - Software component
|
|
||||||
- Version
|
|
||||||
|
|
||||||
{% for component_name, component_version in data.docker.components.items() %}
|
|
||||||
* - {{ component_name }}
|
|
||||||
- {{ component_version }}
|
|
||||||
{% endfor %}
|
|
||||||
|
|
||||||
.. _amd-primus-pytorch-model-support-v25.11:
|
|
||||||
|
|
||||||
Supported models
|
|
||||||
================
|
|
||||||
|
|
||||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
|
||||||
Some instructions, commands, and training recommendations in this documentation might
|
|
||||||
vary by model -- select one to get started.
|
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.11-benchmark-models.yaml
|
|
||||||
|
|
||||||
{% set model_groups = data.model_groups %}
|
|
||||||
.. raw:: html
|
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
|
||||||
<div class="row gx-0">
|
|
||||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
|
||||||
<div class="row col-10 pe-0">
|
|
||||||
{% for model_group in model_groups %}
|
|
||||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="row gx-0 pt-1">
|
|
||||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
|
||||||
<div class="row col-10 pe-0">
|
|
||||||
{% for model_group in model_groups %}
|
|
||||||
{% set models = model_group.models %}
|
|
||||||
{% for model in models %}
|
|
||||||
{% if models|length % 3 == 0 %}
|
|
||||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% else %}
|
|
||||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
.. seealso::
|
|
||||||
|
|
||||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
|
||||||
see the documentation :doc:`../pytorch-training` (without Primus)
|
|
||||||
|
|
||||||
.. _amd-primus-pytorch-performance-measurements-v25.11:

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t test configurations and run conditions outside those described.

Pull the Docker image
=====================

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.11-benchmark-models.yaml

Use the following command to pull the Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ data.docker.pull_tag }}

Run training
============

Once the setup is complete, choose between the following two workflows to start benchmarking training.
For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
tweak some configurations (such as batch sizes).

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.11-benchmark-models.yaml

{% set docker = data.docker %}
{% set model_groups = data.model_groups %}

.. tab-set::

.. tab-item:: MAD-integrated benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following run command is tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt

2. Use this command to run the performance benchmark test on the {{ model.model }} model
using one node with the {{ model.precision }} data type on the host machine.

.. code-block:: shell

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800

MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
model are collected in ``~/MAD/perf.csv``.
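
As a quick sanity check, you can print the most recent rows of that CSV on the host. This is a minimal sketch; it only assumes the default MAD output path noted above.

.. code-block:: shell

# Show the last few benchmark result rows collected by MAD
tail -n 5 ~/MAD/perf.csv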

{% endfor %}
{% endfor %}

.. tab-item:: Primus benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following run commands are tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

.. rubric:: Download the Docker image and required packages

1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ docker.pull_tag }}

2. Run the Docker container.

.. code-block:: shell

docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}

Use these commands if you exit the ``training_env`` container and need to return to it.

.. code-block:: shell

docker start training_env
docker exec -it training_env bash

The Docker container hosts verified commit ``c4c083de`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.

.. rubric:: Prepare training datasets and dependencies

The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.

.. code-block:: shell

export HF_TOKEN=$your_personal_hugging_face_access_token
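
Before launching a long run, you can optionally confirm the token is valid. This assumes the Hugging Face CLI is available in the container; skip the check otherwise.

.. code-block:: shell

# Optional sanity check: prints the account associated with HF_TOKEN
huggingface-cli whoami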

.. rubric:: Pretraining

To get started, navigate to the ``Primus`` directory in your container.

.. code-block:: shell

cd /workspace/Primus

Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
included with Primus with the appropriate options.
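
The general pattern is sketched below. The config path and the override flag are illustrative placeholders; the validated, copy-paste commands follow in the benchmarking examples.

.. code-block:: shell

# Illustrative pattern only: pick a config for your GPU and optionally
# append overrides (such as a batch-size change) after the script name
EXP=examples/torchtitan/configs/<GPU>/<model>-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 6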

.. rubric:: Benchmarking examples

.. container:: model-doc primus_pyt_train_llama-3.1-8b

Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 6

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

To train Llama 3.1 8B with FP8 precision, use the following command.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 7

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. container:: model-doc primus_pyt_train_llama-3.1-70b

Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 6

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

To train Llama 3.1 70B with FP8 precision, use the following command.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 5

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. container:: model-doc primus_pyt_train_deepseek-v3-16b

Use the following command to train DeepSeek V3 16B with BF16 precision using Primus torchtitan.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 10

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh
{% endfor %}
{% endfor %}

Further reading
===============

- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
@@ -16,30 +16,21 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
- Components
- Resources

* - v26.1 (latest)
-
* ROCm 7.1.0
* PyTorch 2.10.0.dev20251112+rocm7.1
-
* :doc:`Primus PyTorch training documentation <../primus-megatron>`
* :doc:`PyTorch training (legacy) documentation <../megatron-lm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d>`__

* - v25.11
-
* ROCm 7.1.0
* PyTorch 2.10.0.dev20251112+rocm7.1
-
* :doc:`Primus PyTorch training documentation <primus-pytorch-v25.11>`
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.11>`
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__

* - v25.10
-
* ROCm 7.1.0
* PyTorch 2.10.0.dev20251112+rocm7.1
-
* :doc:`Primus PyTorch training documentation <primus-pytorch-v25.10>`
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.10>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.10>`
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__

@@ -49,7 +40,7 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
* Primus 0.3.0
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-
* :doc:`Primus PyTorch training documentation <primus-pytorch-v25.9>`
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.9>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.9>`
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
@@ -59,7 +50,7 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
* ROCm 6.4.3
* PyTorch 2.8.0a0+gitd06a406
-
* :doc:`Primus PyTorch training documentation <primus-pytorch-v25.8>`
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.8>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.8>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5>`__

@@ -30,7 +30,7 @@ environment for fine-tuning and pretraining a model on AMD Instinct MI325X
and MI300X GPUs. It includes the following software components to accelerate
training workloads:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

.. tab-set::

@@ -58,7 +58,7 @@ MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
training recommendations in this documentation might vary by model -- select
one to get started.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

{% set model_groups = data.model_groups %}
.. raw:: html
@@ -94,7 +94,7 @@ one to get started.

The following table lists supported training modes per model.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

{% set model_groups = data.model_groups %}
.. dropdown:: Supported training modes
@@ -164,7 +164,7 @@ doesn’t test configurations and run conditions outside those described.
Run training
============

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

{% set docker = data.docker %}
{% set model_groups = data.model_groups %}
@@ -1,669 +0,0 @@
:orphan:

.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

**************************************
Training a model with PyTorch on ROCm
**************************************

.. caution::

This documentation is not the latest version of the ROCm PyTorch training
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.

.. note::

For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.

See :doc:`../primus-pytorch` for details.

PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The PyTorch for ROCm training Docker image provides a prebuilt optimized
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
and MI300X GPUs. It includes the following software components to accelerate
training workloads:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.11-benchmark-models.yaml

.. tab-set::

.. tab-item:: {{ data.docker.pull_tag }}
:sync: {{ data.docker.pull_tag }}

.. list-table::
:header-rows: 1

* - Software component
- Version

{% for component_name, component_version in data.docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}

.. _amd-pytorch-training-model-support-v25.11:

Supported models
================

The following models are pre-optimized for performance on the AMD Instinct
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
training recommendations in this documentation might vary by model -- select
one to get started.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.11-benchmark-models.yaml

{% set model_groups = data.model_groups %}
.. raw:: html

<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>

<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>

.. _amd-pytorch-training-supported-training-modes-v25.11:

The following table lists supported training modes per model.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.11-benchmark-models.yaml

{% set model_groups = data.model_groups %}
.. dropdown:: Supported training modes

.. list-table::
:header-rows: 1

* - Model
- Supported training modes

{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if model.training_modes %}
* - {{ model.model }}
- ``{{ model.training_modes | join('``, ``') }}``

{% endif %}
{% endfor %}
{% endfor %}

.. note::

Some model and fine-tuning combinations are not listed. This is
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
doesn't provide default YAML configurations for them.
For advanced usage, you can create a custom configuration to enable
unlisted fine-tuning methods by using an existing file in the
``/workspace/torchtune/recipes/configs`` directory as a template.
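
For example, a custom recipe can start from a copy of a shipped config. The file names here are illustrative; pick whichever existing recipe is closest to your use case.

.. code-block:: shell

# Copy an existing torchtune recipe as the starting point for a custom config
cp /workspace/torchtune/recipes/configs/llama3_1/8B_lora.yaml my_custom_recipe.yaml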

.. _amd-pytorch-training-performance-measurements-v25.11:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X GPUs or ROCm software.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t test configurations and run conditions outside those described.

Run training
============

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.11-benchmark-models.yaml

{% set docker = data.docker %}
{% set model_groups = data.model_groups %}

Once the setup is complete, choose between two options to start benchmarking training:

.. tab-set::

.. tab-item:: MAD-integrated benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following run command is tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt

2. Use this command to run the performance benchmark test on the {{ model.model }} model
using one node with the {{ model.precision }} data type on the host machine.

.. code-block:: shell

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800

MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
model are collected in ``~/MAD/perf.csv``.

{% endfor %}
{% endfor %}

.. tab-item:: Standalone benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following commands are tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

{% endfor %}
{% endfor %}

.. rubric:: Download the Docker image and required packages

1. Use the following command to pull the Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ docker.pull_tag }}

2. Launch the Docker container.

.. code-block:: shell

docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}

Use these commands if you exit the ``training_env`` container and need to return to it.

.. code-block:: shell

docker start training_env
docker exec -it training_env bash

3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train

.. rubric:: Prepare training datasets and dependencies

1. The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.

.. code-block:: shell

export HF_TOKEN=$your_personal_hugging_face_access_token
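
Alternatively, if you prefer the CLI-managed credential store, an equivalent (illustrative) approach is to log in once with the same token:

.. code-block:: shell

# Stores the token so later downloads don't rely on the environment variable
huggingface-cli login --token $HF_TOKEN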

2. Run the setup script to install libraries and datasets needed for benchmarking.

.. code-block:: shell

./pytorch_benchmark_setup.sh

.. container:: model-doc pyt_train_llama-3.1-8b

``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

.. list-table::
:header-rows: 1

* - Library
- Reference

* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

.. container:: model-doc pyt_train_llama-3.1-70b

``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

.. list-table::
:header-rows: 1

* - Library
- Reference

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

* - ``torchdata``
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__

* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`__

* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`__

* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`__

* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`__

* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`__

* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0

* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0

.. container:: model-doc pyt_train_flux

``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

.. list-table::
:header-rows: 1

* - Library
- Reference

* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0

* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0

* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0

* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1

* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2

* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0

* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44

* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84

* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0

* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2

* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4

* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1

* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2

* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0

``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__

{% for model_group in model_groups %}
{% for model in model_group.models %}
{% set training_modes = model.training_modes %}
{% set training_mode_descs = {
"pretrain": "Benchmark pre-training.",
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
} %}
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
{% if available_modes %}

.. container:: model-doc {{ model.mad_tag }}

.. rubric:: Pretraining

To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.

{% if model.mad_tag == "pyt_train_dlrm" %}

1. Go to the DLRM directory.

.. code-block:: shell

cd /workspace/DLRMBenchmark

2. To run the single node training benchmark for DLRM-v2 with TF32 precision,
run the following script.

.. code-block:: shell

./launch_training_single_node.sh

To run with MAD within the Docker container, use the following command.

.. code-block:: shell

./pytorch_benchmark_report.sh -t pretrain -m DLRM

{% else %}

.. code-block:: shell

./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length

{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}

.. note::

Currently, FLUX models are not supported out of the box on this Docker image.
To use FLUX, refer to the ``rocm/pytorch-training`` Docker image documented in :doc:`pytorch-training-v25.6`.

Occasionally, downloading the Flux dataset might fail. If this happens,
manually download the dataset from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
{% endif %}

.. list-table::
:header-rows: 1

* - Name
- Options
- Description

{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}

* - ``$datatype``
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
- Only Llama 3.1 8B supports FP8 precision.

* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
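
As an illustration, a concrete single-node pre-training invocation with these options filled in might look like the following. The model name is a placeholder; substitute the repository name of your selected model.

.. code-block:: shell

# Illustrative values: BF16 pre-training at the default sequence length
./pytorch_benchmark_report.sh -t pretrain -m Llama-3.1-8B -p BF16 -s 8192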
{% endif %}
{% endif %}

{% set training_modes = model.training_modes %}
{% set training_mode_descs = {
"posttrain": "Benchmark post-training.",
} %}
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
{% if available_modes %}

.. container:: model-doc {{ model.mad_tag }}

.. rubric:: Post-training

To start the post-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.

.. code-block:: shell

./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length

.. list-table::
:header-rows: 1

* - Name
- Options
- Description

{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}

* - ``$datatype``
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
- Only Llama 3.1 8B supports FP8 precision.

* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
{% endif %}

{% set training_mode_descs = {
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
} %}
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
{% if available_modes %}
.. container:: model-doc {{ model.mad_tag }}

.. rubric:: Fine-tuning

To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v25.11>`.

.. code-block:: shell

./pytorch_benchmark_report.sh -t $training_mode \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length

.. list-table::
:header-rows: 1

* - Name
- Options
- Description

{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}

* - ``$datatype``
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}

* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
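
As an illustration, a LoRA fine-tuning run with these options filled in might look like this. The model name is a placeholder; substitute the repository name of your selected model.

.. code-block:: shell

# Illustrative values: LoRA fine-tuning in BF16 with a 4096-token sequence length
./pytorch_benchmark_report.sh -t finetune_lora -m Llama-3.1-8B -p BF16 -s 4096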

{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
.. note::

For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
use the following torchtune commit for compatibility:

.. code-block:: shell

git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e

{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
.. note::

You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
input tensor should be smaller than max_seq_len (4096)``.
This error indicates that an input sequence is longer than the model's maximum context window.

Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
tokens in this case). You can resolve this by truncating the input or splitting
it into smaller chunks before passing it to the model.
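
With this benchmark script, the simplest guard is to pass a sequence length at or below the context window, for example (the model name is a placeholder for your selected Llama 2 variant):

.. code-block:: shell

# Keep Llama 2 inputs within the 4096-token context window
./pytorch_benchmark_report.sh -t finetune_lora -m Llama-2-7b -p BF16 -s 4096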

Note on reproducibility: The results in this guide are based on
commit ``b4c98ac`` from the upstream
`<https://github.com/pytorch/torchtune>`__ repository. For the
latest updates, you can use the main branch.

{% endif %}
{% endif %}
{% endfor %}
{% endfor %}

.. rubric:: Benchmarking examples

For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

.. _amd-pytorch-training-multinode-examples-v25.11:

Multi-node training
-------------------

Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.

Pre-training
~~~~~~~~~~~~

Multi-node training with torchtitan is supported. The provided Slurm script is pre-configured for Llama 3 70B.

To launch the training job on a Slurm cluster for Llama 3 70B, run the following commands from the MAD repository.

.. code-block:: shell

# In the MAD repository
cd scripts/pytorch_train
sbatch run_slurm_train.sh
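
Standard Slurm tooling applies once the job is submitted. For example, you can confirm the job is queued or running with:

.. code-block:: shell

# Check the status of your submitted training job
squeue -u $USER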

Fine-tuning
~~~~~~~~~~~

Multi-node training with torchtune is supported. The provided Slurm script is pre-configured for Llama 3.3 70B.

To launch the training job on a Slurm cluster for Llama 3.3 70B, run the following commands from the MAD repository.

.. code-block:: shell

huggingface-cli login # Get access to HF Llama model space
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
# In the MAD repository
cd scripts/pytorch_train
sbatch Torchtune_Multinode.sh

.. note::

Information regarding benchmark setup:

* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
* You can adjust the torchtune `YAML configuration file
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
if you're using a different model.
* The number of nodes and other parameters can be tuned in the Slurm script ``Torchtune_Multinode.sh``.
* Set the ``mounting_paths`` inside the Slurm script.

Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
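
For example, to see the newest logs first after a run completes (a minimal sketch, assuming the default output layout):

.. code-block:: shell

# List the fine-tuning log files, newest first
ls -lt result_torchtune/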

Further reading
===============

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
@@ -47,7 +47,7 @@ Megatron-LM.
- {{ component_version }}
{% endfor %}

.. _amd-primus-megatron-lm-model-support-v26.01:
.. _amd-primus-megatron-lm-model-support-v25.11:

Supported models
================
@@ -108,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

.. _mi300x-amd-primus-megatron-lm-training-v26.01:
.. _mi300x-amd-primus-megatron-lm-training-v25.11:

Environment setup
=================
@@ -118,7 +118,7 @@ Environment setup
Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on AMD Instinct GPUs.

.. _amd-primus-megatron-lm-requirements-v26.01:
.. _amd-primus-megatron-lm-requirements-v25.11:

Pull the Docker image

@@ -157,16 +157,16 @@ Pull the Docker image
docker start primus_training_env
docker exec -it primus_training_env bash

The Docker container hosts verified commit ``9c529cd4`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/9c529cd4a934a68a880ede036c3e97b792e38167>`__ repository.
The Docker container hosts verified commit ``c4c083de`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.

.. _amd-primus-megatron-lm-environment-setup-v26.01:
.. _amd-primus-megatron-lm-environment-setup-v25.11:

Configuration
=============

Primus defines a training configuration in YAML for each model in
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/9c529cd4a934a68a880ede036c3e97b792e38167/examples/megatron/configs>`__.
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/examples/megatron/configs>`__.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

@@ -207,7 +207,7 @@ You can use either mock data or real data for training.

Ensure that the files are accessible inside the Docker container.

.. _amd-primus-megatron-lm-tokenizer-v26.01:
.. _amd-primus-megatron-lm-tokenizer-v25.11:

Tokenizer
---------
@@ -220,7 +220,15 @@ right permissions to access the tokenizer for each model.
# Export your HF_TOKEN in the workspace
export HF_TOKEN=<your_hftoken>

.. _amd-primus-megatron-lm-run-training-v26.01:
.. note::

In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
``tokenizer_type: Llama3Tokenizer``, as defined in the `llama3.1-8B model
<https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
definition.
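
If you want to confirm that your token can reach a gated tokenizer before launching training, one optional, illustrative check is to fetch only the tokenizer files:

.. code-block:: shell

# Optional: download just the tokenizer files for the Llama 3.1 8B repository
huggingface-cli download meta-llama/Llama-3.1-8B --include "tokenizer*" --local-dir /tmp/llama3.1-tokenizer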
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-run-training-v25.11:
|
||||||
|
|
||||||
Run training
|
Run training
|
||||||
============
|
============
|
||||||
@@ -244,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
|
|
||||||
Once setup is complete, run the appropriate training command.
|
Once setup is complete, run the appropriate training command.
|
||||||
The following run commands are tailored to Llama 3.3 70B.
|
The following run commands are tailored to Llama 3.3 70B.
|
||||||
See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
|
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||||
|
|
||||||
To run pre-training for Llama 3.3 70B BF16, run:
|
To run pre-training for Llama 3.3 70B BF16, run:
|
||||||
|
|
||||||
@@ -255,10 +263,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
bash runner/primus-cli direct \
|
EXP=examples/megatron/configs/MI355X/llama3.3_70B-BF16-pretrain.yaml \
|
||||||
--log_file /tmp/primus_llama3.3_70B.log \
|
bash ./examples/run_pretrain.sh
|
||||||
-- train pretrain \
|
|
||||||
--config examples/megatron/configs/MI355X/llama3.3_70B-BF16-pretrain.yaml
|
|
||||||
|
|
||||||
.. tab-item:: MI300X
|
.. tab-item:: MI300X
|
||||||
:sync: MI325X and MI300X
|
:sync: MI325X and MI300X
|
||||||
@@ -270,16 +276,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||||
|
|
||||||
bash runner/primus-cli direct \
|
EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \
|
||||||
--log_file /tmp/primus_llama3.3_70B.log \
|
bash ./examples/run_pretrain.sh
|
||||||
-- train pretrain \
|
|
||||||
--config examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml
|
|
||||||
|
|
||||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
Once setup is complete, run the appropriate training command.
|
Once setup is complete, run the appropriate training command.
|
||||||
The following run commands are tailored to Llama 3.1 8B.
|
The following run commands are tailored to Llama 3.1 8B.
|
||||||
See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
|
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||||
|
|
||||||
To run pre-training for Llama 3.1 8B FP8, run:
|
To run pre-training for Llama 3.1 8B FP8, run:
|
||||||
|
|
||||||
@@ -290,10 +294,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
bash runner/primus-cli direct \
|
EXP=examples/megatron/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
|
||||||
--log_file /tmp/primus_llama3.1_8B_fp8.log \
|
bash ./examples/run_pretrain.sh
|
||||||
-- train pretrain \
|
|
||||||
--config examples/megatron/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml
|
|
||||||
|
|
||||||
.. tab-item:: MI300X
|
.. tab-item:: MI300X
|
||||||
:sync: MI325X and MI300X
|
:sync: MI325X and MI300X
|
||||||
@@ -305,10 +307,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||||
|
|
||||||
bash runner/primus-cli direct \
|
EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||||
--log_file /tmp/primus_llama3.1_8B_fp8.log \
|
bash ./examples/run_pretrain.sh
|
||||||
-- train pretrain \
|
|
||||||
--config examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml
|
|
||||||
|
|
||||||
For Llama 3.1 8B BF16, use the following command:
|
For Llama 3.1 8B BF16, use the following command:
|
||||||
|
|
||||||
@@ -319,10 +319,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama3.1_BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -334,16 +332,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 3.1 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run pre-training for Llama 3.1 70B BF16, run:

@@ -354,10 +350,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -369,10 +363,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 To run the training on a single node for Llama 3.1 70B FP8, use the following command.

@@ -389,10 +381,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -404,10 +394,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+bash ./examples/run_pretrain.sh \
 --train_iters 50 \
 --num_layers 40 \
 --fp8 hybrid \
@@ -417,7 +405,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 2 7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run pre-training for Llama 2 7B FP8, run:

@@ -428,10 +416,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_7B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama2_7B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama2_7B-FP8-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -443,10 +429,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_7B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 To run pre-training for Llama 2 7B BF16, run:

@@ -457,10 +441,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_7B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama2_7B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama2_7B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -472,16 +454,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_7B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama2_7B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama2_7B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 2 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run pre-training for Llama 2 70B BF16, run:

@@ -492,10 +472,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_70B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama2_70B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama2_70B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -507,16 +485,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_70B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to DeepSeek-V3.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
 use the following command:

@@ -528,10 +504,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml \
+EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh \
 --num_layers 3 \
 --moe_layer_freq 1 \
 --train_iters 50 \
@@ -548,21 +522,17 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/deepseek_v3-BF16-pretrain.yaml \
+EXP=examples/megatron/configs/MI300X/deepseek_v3-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh \
 --num_layers 3 \
 --moe_layer_freq 1 \
---micro_batch_size 3 \
---global_batch_size 192 \
 --train_iters 50

 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to DeepSeek-V2-Lite.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
 use the following command:

@@ -574,10 +544,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v2_lite.log \
--- train pretrain \
---config examples/megatron/configs//MI355X/deepseek_v2_lite-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -589,16 +557,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v2_lite.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Mixtral 8x7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
 use the following command:

@@ -610,10 +576,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_mixtral_8x7B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -625,16 +589,15 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_mixtral_8x7B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh \
+--train_iters 50

 .. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Mixtral 8x22B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
 use the following command:

@@ -646,10 +609,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_mixtral_8x22B_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X
@@ -661,21 +622,19 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_mixtral_8x22B_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
+EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh \
+--train_iters 50 \
 --num_layers 4 \
 --pipeline_model_parallel_size 1 \
 --micro_batch_size 1 \
---global_batch_size 16 \
---train_iters 50
+--global_batch_size 16

 .. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Qwen 2.5 7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for Qwen 2.5 7B BF16, use the following
 command:

@@ -687,10 +646,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_7B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/qwen2.5_7B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/qwen2.5_7B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -702,10 +659,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_7B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/qwen2.5_7B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/qwen2.5_7B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 For FP8, use the following command.

@@ -716,10 +671,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_7B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/qwen2.5_7B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/qwen2.5_7B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -731,16 +684,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_7B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/qwen2.5_7B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/qwen2.5_7B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Qwen 2.5 72B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run the training on a single node for Qwen 2.5 72B BF16, use the following command.

@@ -751,10 +702,11 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_72B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/qwen2.5_72B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \
+bash examples/run_pretrain.sh \
+--train_iters 50 \
+--micro_batch_size 16 \
+--global_batch_size 256

 .. tab-item:: MI300X
 :sync: MI325X and MI300X
@@ -766,12 +718,10 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_72B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/qwen2.5_72B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/qwen2.5_72B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

-.. _amd-primus-megatron-multi-node-examples-v26.01:
+.. _amd-primus-megatron-multi-node-examples-v25.11:

 Multi-node training examples
 ----------------------------

@@ -780,7 +730,7 @@ Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure y
 training.

 To run training on multiple nodes, you can use the
-`run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/9c529cd4a934a68a880ede036c3e97b792e38167/examples/run_slurm_pretrain.sh>`__
+`run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/main/examples/run_slurm_pretrain.sh>`__
 to launch the multi-node workload. Use the following steps to set up your environment:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
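(Editor's note: the docs don't spell out the launch invocation here. A minimal sketch, assuming ``run_slurm_pretrain.sh`` is submitted through Slurm's ``sbatch`` and that the node count is set with standard ``sbatch`` flags -- neither of which this diff confirms, so check the script header for its real parameters.)

.. code-block:: shell

   # Assumption: the script is sbatch-compatible; --nodes is a standard
   # Slurm flag, not something this diff confirms the script reads.
   sbatch --nodes=8 examples/run_slurm_pretrain.sh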
@@ -813,13 +763,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
 * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, since NICs vary across clusters, you are encouraged to explicitly export the NCCL parameters for your cluster (see the sketch after this hunk).
 * To find your network interface, you can use ``ip a``.
 * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
-* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v26.01`) as appropriate.
+* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v25.11`) as appropriate.

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 3.1 8B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 3.1 8B FP8 on 8 nodes, run:

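(Editor's note: a minimal sketch of pinning the NICs explicitly, per the bullets above. The interface and device names are placeholders -- substitute whatever ``ip a`` and ``ibv_devices`` report on your nodes.)

.. code-block:: shell

   ip a           # list socket interfaces, e.g. for NCCL_SOCKET_IFNAME
   ibv_devices    # list RDMA/IB devices, e.g. for NCCL_IB_HCA

   # Placeholder values -- replace with your cluster's actual devices.
   export NCCL_SOCKET_IFNAME=ens50f0
   export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7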
@@ -836,7 +786,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 2 7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 2 7B FP8 on 8 nodes, run:

@@ -853,7 +803,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 3.1 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 3.1 70B FP8 on 8 nodes, run:

@@ -883,7 +833,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 2 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 2 70B FP8 on 8 nodes, run:

@@ -913,7 +863,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 3.3 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 3.3 70B FP8 on 8 nodes, run:

@@ -943,7 +893,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Mixtral 8x7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Mixtral 8x7B BF16 on 8 nodes, run:

@@ -961,7 +911,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Qwen 2.5 72B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Qwen2.5 72B FP8 on 8 nodes, run:

@@ -976,7 +926,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
 --global_batch_size 512 \
 --recompute_num_layers 80 \

-.. _amd-primus-megatron-lm-benchmark-test-vars-v26.01:
+.. _amd-primus-megatron-lm-benchmark-test-vars-v25.11:

 Key options
 -----------
@@ -45,7 +45,7 @@ with Primus Turbo optimizations.
 - {{ component_version }}
 {% endfor %}

-.. _amd-primus-pytorch-model-support-v26.01:
+.. _amd-primus-pytorch-model-support-v25.11:

 Supported models
 ================

@@ -91,7 +91,7 @@ vary by model -- select one to get started.
 For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
 see the documentation :doc:`pytorch-training` (without Primus).

-.. _amd-primus-pytorch-performance-measurements-v26.01:
+.. _amd-primus-pytorch-performance-measurements-v25.11:

 System validation
 =================

@@ -146,7 +146,7 @@ tweak some configurations (such as batch sizes).
 .. container:: model-doc {{ model.mad_tag }}

 The following run command is tailored to {{ model.model }}.
-See :ref:`amd-primus-pytorch-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

 1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
 directory and install the required packages on the host machine (see the sketch below).
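(Editor's note: step 1 above elides the exact commands. A minimal sketch, assuming the usual layout of the MAD repository -- the repository URL comes from the docs, but the requirements file name is an assumption this diff doesn't confirm.)

.. code-block:: shell

   # Repository URL comes from the docs; requirements.txt is assumed.
   git clone https://github.com/ROCm/MAD.git
   cd MAD
   pip install -r requirements.txt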
@@ -184,7 +184,7 @@ tweak some configurations (such as batch sizes).
 .. container:: model-doc {{ model.mad_tag }}

 The following run commands are tailored to {{ model.model }}.
-See :ref:`amd-primus-pytorch-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

 .. rubric:: Download the Docker image and required packages

@@ -220,8 +220,8 @@ tweak some configurations (such as batch sizes).
 docker start training_env
 docker exec -it training_env bash

-The Docker container hosts verified commit ``9c529cd4`` of the `Primus
-<https://github.com/AMD-AGI/Primus/tree/9c529cd4a934a68a880ede036c3e97b792e38167/>`__ repository.
+The Docker container hosts verified commit ``c4c083de`` of the `Primus
+<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.

 .. rubric:: Prepare training datasets and dependencies

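(Editor's note: since this hunk bumps the pinned Primus commit from ``9c529cd4`` to ``c4c083de``, a quick way to confirm which commit your container actually carries -- assuming the checkout lives at ``/workspace/Primus``, as the Megatron guide's hunk headers suggest; adjust the path if your image differs.)

.. code-block:: shell

   # Path is an assumption; adjust if your image checks Primus out elsewhere.
   docker exec training_env bash -c 'git -C /workspace/Primus rev-parse --short HEAD'
   # Expect c4c083de for images matching the new docs.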
@@ -257,31 +257,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
---training.local_batch_size 6
+EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 6

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 To train Llama 3.1 8B with FP8 precision, use the following command.

@@ -292,31 +285,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
---training.local_batch_size 7
+EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 7

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_train_llama-3.1-70b

@@ -329,31 +315,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
---training.local_batch_size 6
+EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 6

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 To train Llama 3.1 70B with FP8 precision, use the following command.

@@ -364,31 +343,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
---training.local_batch_size 5
+EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 5

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_train_deepseek-v3-16b

@@ -401,31 +373,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_16b.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_16b.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
---training.local_batch_size 10
+EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 10

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_16b.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+bash examples/run_pretrain.sh

 {% endfor %}
 {% endfor %}

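(Editor's note: unlike the Megatron hunks earlier, the torchtitan replacements above pass overrides in torchtitan's dotted form after the script, e.g. ``--training.local_batch_size``. A minimal sketch combining two values taken verbatim from ``+`` lines above.)

.. code-block:: shell

   # Both the config path and the dotted override appear on "+" lines above.
   EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
   bash examples/run_pretrain.sh --training.local_batch_size 6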
@@ -43,7 +43,7 @@ training workloads:
 - {{ component_version }}
 {% endfor %}

-.. _amd-pytorch-training-model-support-v26.01:
+.. _amd-pytorch-training-model-support-v25.11:

 Supported models
 ================

@@ -85,7 +85,7 @@ one to get started.
 </div>
 </div>

-.. _amd-pytorch-training-supported-training-modes-v26.01:
+.. _amd-pytorch-training-supported-training-modes-v25.11:

 The following table lists supported training modes per model.

@@ -120,7 +120,7 @@ The following table lists supported training modes per model.
 unlisted fine-tuning methods by using an existing file in the
 ``/workspace/torchtune/recipes/configs`` directory as a template.

-.. _amd-pytorch-training-performance-measurements-v26.01:
+.. _amd-pytorch-training-performance-measurements-v25.11:

 Performance measurements
 ========================

@@ -176,7 +176,7 @@ Run training
 .. container:: model-doc {{ model.mad_tag }}

 The following run command is tailored to {{ model.model }}.
-See :ref:`amd-pytorch-training-model-support-v26.01` to switch to another available model.
+See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

 1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
 directory and install the required packages on the host machine.

@@ -214,7 +214,7 @@ Run training
 .. container:: model-doc {{ model.mad_tag }}

 The following commands are tailored to {{ model.model }}.
-See :ref:`amd-pytorch-training-model-support-v26.01` to switch to another available model.
+See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

 {% endfor %}
 {% endfor %}
@@ -409,10 +409,6 @@ Run training

 {% if model.mad_tag == "pyt_train_dlrm" %}

-.. note::
-
-   DLRM is supported on MI300X, MI325X, MI350X, and MI355X GPUs.
-
 1. Go to the DLRM directory.

 .. code-block:: shell
@@ -536,7 +532,7 @@ Run training

 To start the fine-tuning benchmark, use the following command with the
 appropriate options. See the following list of options and their descriptions.
-See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v26.01>`.
+See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v25.11>`.

 .. code-block:: shell

@@ -601,7 +597,7 @@ Run training

 For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

-.. _amd-pytorch-training-multinode-examples-v26.01:
+.. _amd-pytorch-training-multinode-examples-v25.11:

 Multi-node training
 -------------------
@@ -18,6 +18,7 @@
 (artificial-intelligence-apis)=

 * {doc}`Composable Kernel <composable_kernel:index>`
+* {doc}`hipDNN <hipdnn:index>`
 * {doc}`MIGraphX <amdmigraphx:index>`
 * {doc}`MIOpen <miopen:index>`
 * {doc}`MIVisionX <mivisionx:index>`

@@ -36,6 +36,7 @@ Machine Learning & Computer Vision
 :header: "Component", "Description"

 ":doc:`Composable Kernel <composable_kernel:index>`", "Provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures"
+":doc:`hipDNN <hipdnn:index>`", "A graph-based deep learning library that enables multi-operation fusion for improved performance on AMD GPUs."
 ":doc:`MIGraphX <amdmigraphx:index>`", "Graph inference engine that accelerates machine learning model inference"
 ":doc:`MIOpen <miopen:index>`", "An open source deep-learning library"
 ":doc:`MIVisionX <mivisionx:index>`", "Set of comprehensive computer vision and machine learning libraries, utilities, and applications"
Block a user