Update training Docker docs for Primus 25.10 (#5737)

This commit is contained in:
peterjunpark
2025-12-04 09:08:00 -05:00
committed by GitHub
parent e8fdc34b71
commit fb644412d5
18 changed files with 4158 additions and 560 deletions

View File

@@ -1,21 +1,17 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/megatron-lm:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: aab4234
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/megatron-lm:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
docker:
pull_tag: rocm/primus:v25.10
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components:
ROCm: 7.1.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.10.0.dev20251112+rocm7.1
Python: "3.10"
Transformer Engine: 2.4.0.dev0+32e2d1d4
Flash Attention: 2.8.3
hipBLASLt: 1.2.0-09ab7153e2
Triton: 3.4.0
RCCL: 2.27.7
model_groups:
- group: Meta Llama
tag: llama

View File

@@ -0,0 +1,53 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/megatron-lm:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: aab4234
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/megatron-lm:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
- model: Llama 3.1 8B
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
- model: Llama 3.1 70B
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
- model: Llama 2 7B
mad_tag: pyt_megatron_lm_train_llama-2-7b
- model: Llama 2 70B
mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3 (proxy)
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
- model: DeepSeek-V2-Lite
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
- model: Mixtral 8x22B (proxy)
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
- group: Qwen
tag: qwen
models:
- model: Qwen 2.5 7B
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
- model: Qwen 2.5 72B
mad_tag: pyt_megatron_lm_train_qwen2.5-72b

View File

@@ -0,0 +1,65 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/primus:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/primus:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
config_name: llama3.3_70B-pretrain.yaml
- model: Llama 3.1 70B
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
config_name: llama3.1_70B-pretrain.yaml
- model: Llama 3.1 8B
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
config_name: llama3.1_8B-pretrain.yaml
- model: Llama 2 7B
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
config_name: llama2_7B-pretrain.yaml
- model: Llama 2 70B
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
config_name: llama2_70B-pretrain.yaml
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3 (proxy)
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
config_name: deepseek_v3-pretrain.yaml
- model: DeepSeek-V2-Lite
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
config_name: deepseek_v2_lite-pretrain.yaml
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
config_name: mixtral_8x7B_v0.1-pretrain.yaml
- model: Mixtral 8x22B (proxy)
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
config_name: mixtral_8x22B_v0.1-pretrain.yaml
- group: Qwen
tag: qwen
models:
- model: Qwen 2.5 7B
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
config_name: primus_qwen2.5_7B-pretrain.yaml
- model: Qwen 2.5 72B
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
config_name: qwen2.5_72B-pretrain.yaml

View File

@@ -0,0 +1,39 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/primus:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/primus:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: primus_pyt_train_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
config_file:
bf16: "./llama3_8b_fsdp_bf16.toml"
fp8: "./llama3_8b_fsdp_fp8.toml"
- model: Llama 3.1 70B
mad_tag: primus_pyt_train_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
config_file:
bf16: "./llama3_70b_fsdp_bf16.toml"
fp8: "./llama3_70b_fsdp_fp8.toml"

View File

@@ -0,0 +1,186 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/pytorch-training:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: aab4234
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/pytorch-training:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 4 Scout 17B-16E
mad_tag: pyt_train_llama-4-scout-17b-16e
model_repo: Llama-4-17B_16E
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.3 70B
mad_tag: pyt_train_llama-3.3-70b
model_repo: Llama-3.3-70B
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.2 1B
mad_tag: pyt_train_llama-3.2-1b
model_repo: Llama-3.2-1B
url: https://huggingface.co/meta-llama/Llama-3.2-1B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 3B
mad_tag: pyt_train_llama-3.2-3b
model_repo: Llama-3.2-3B
url: https://huggingface.co/meta-llama/Llama-3.2-3B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 Vision 11B
mad_tag: pyt_train_llama-3.2-vision-11b
model_repo: Llama-3.2-Vision-11B
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.2 Vision 90B
mad_tag: pyt_train_llama-3.2-vision-90b
model_repo: Llama-3.2-Vision-90B
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [pretrain, finetune_fw, finetune_lora]
- model: Llama 3.1 405B
mad_tag: pyt_train_llama-3.1-405b
model_repo: Llama-3.1-405B
url: https://huggingface.co/meta-llama/Llama-3.1-405B
precision: BF16
training_modes: [finetune_qlora]
- model: Llama 3 8B
mad_tag: pyt_train_llama-3-8b
model_repo: Llama-3-8B
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3 70B
mad_tag: pyt_train_llama-3-70b
model_repo: Llama-3-70B
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 7B
mad_tag: pyt_train_llama-2-7b
model_repo: Llama-2-7B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 2 13B
mad_tag: pyt_train_llama-2-13b
model_repo: Llama-2-13B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 70B
mad_tag: pyt_train_llama-2-70b
model_repo: Llama-2-70B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_lora, finetune_qlora]
- group: OpenAI
tag: openai
models:
- model: GPT OSS 20B
mad_tag: pyt_train_gpt_oss_20b
model_repo: GPT-OSS-20B
url: https://huggingface.co/openai/gpt-oss-20b
precision: BF16
training_modes: [HF_finetune_lora]
- model: GPT OSS 120B
mad_tag: pyt_train_gpt_oss_120b
model_repo: GPT-OSS-120B
url: https://huggingface.co/openai/gpt-oss-120b
precision: BF16
training_modes: [HF_finetune_lora]
- group: Qwen
tag: qwen
models:
- model: Qwen 3 8B
mad_tag: pyt_train_qwen3-8b
model_repo: Qwen3-8B
url: https://huggingface.co/Qwen/Qwen3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Qwen 3 32B
mad_tag: pyt_train_qwen3-32b
model_repo: Qwen3-32
url: https://huggingface.co/Qwen/Qwen3-32B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2.5 32B
mad_tag: pyt_train_qwen2.5-32b
model_repo: Qwen2.5-32B
url: https://huggingface.co/Qwen/Qwen2.5-32B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2.5 72B
mad_tag: pyt_train_qwen2.5-72b
model_repo: Qwen2.5-72B
url: https://huggingface.co/Qwen/Qwen2.5-72B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2 1.5B
mad_tag: pyt_train_qwen2-1.5b
model_repo: Qwen2-1.5B
url: https://huggingface.co/Qwen/Qwen2-1.5B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Qwen 2 7B
mad_tag: pyt_train_qwen2-7b
model_repo: Qwen2-7B
url: https://huggingface.co/Qwen/Qwen2-7B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- group: Stable Diffusion
tag: sd
models:
- model: Stable Diffusion XL
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
model_repo: SDXL
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
precision: BF16
training_modes: [posttrain-p]
- group: Flux
tag: flux
models:
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [posttrain-p]
- group: NCF
tag: ncf
models:
- model: NCF
mad_tag: pyt_ncf_training
model_repo:
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
precision: FP32

View File

@@ -1,22 +1,15 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/primus:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/primus:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
docker:
pull_tag: rocm/primus:v25.10
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components:
ROCm: 7.1.0
PyTorch: 2.10.0.dev20251112+rocm7.1
Python: "3.10"
Transformer Engine: 2.4.0.dev0+32e2d1d4
Flash Attention: 2.8.3
hipBLASLt: 1.2.0-09ab7153e2
Triton: 3.4.0
RCCL: 2.27.7
model_groups:
- group: Meta Llama
tag: llama

View File

@@ -1,39 +1,32 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/primus:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/primus:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
docker:
pull_tag: rocm/primus:v25.10
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components:
ROCm: 7.1.0
PyTorch: 2.10.0.dev20251112+rocm7.1
Python: "3.10"
Transformer Engine: 2.4.0.dev0+32e2d1d4
Flash Attention: 2.8.3
hipBLASLt: 1.2.0-09ab7153e2
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: primus_pyt_train_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
config_file:
bf16: "./llama3_8b_fsdp_bf16.toml"
fp8: "./llama3_8b_fsdp_fp8.toml"
- model: Llama 3.1 70B
mad_tag: primus_pyt_train_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
config_file:
bf16: "./llama3_70b_fsdp_bf16.toml"
fp8: "./llama3_70b_fsdp_fp8.toml"
- model: Llama 3.1 8B
mad_tag: primus_pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
- model: Llama 3.1 70B
mad_tag: primus_pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek V2 16B
mad_tag: primus_pyt_train_deepseek-v2
model_repo: DeepSeek-V2
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
precision: BF16

View File

@@ -1,21 +1,15 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/pytorch-training:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: aab4234
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/pytorch-training:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
docker:
pull_tag: rocm/primus:v25.10
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components:
ROCm: 7.1.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.10.0.dev20251112+rocm7.1
Python: "3.10"
Transformer Engine: 2.4.0.dev0+32e2d1d4
Flash Attention: 2.8.3
hipBLASLt: 1.2.0-09ab7153e2
model_groups:
- group: Meta Llama
tag: llama
@@ -119,6 +113,15 @@ model_groups:
url: https://huggingface.co/openai/gpt-oss-120b
precision: BF16
training_modes: [HF_finetune_lora]
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek V2 16B
mad_tag: primus_pyt_train_deepseek-v2
model_repo: DeepSeek-V2
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
precision: BF16
training_modes: [pretrain]
- group: Qwen
tag: qwen
models:
@@ -166,7 +169,7 @@ model_groups:
model_repo: SDXL
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
precision: BF16
training_modes: [posttrain-p]
training_modes: [posttrain]
- group: Flux
tag: flux
models:
@@ -175,12 +178,20 @@ model_groups:
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [posttrain-p]
training_modes: [posttrain]
- group: NCF
tag: ncf
models:
- model: NCF
mad_tag: pyt_ncf_training
model_repo:
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
url: https://github.com/ROCm/FluxBenchmark
precision: FP32
- group: DLRM
tag: dlrm
models:
- model: DLRM v2
mad_tag: pyt_train_dlrm
model_repo: DLRM
url: https://github.com/AMD-AGI/DLRMBenchmark
training_modes: [pretrain]

View File

@@ -36,12 +36,10 @@ accelerate training workloads:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
{% set dockers = data.dockers %}
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. tab-item:: {{ data.docker.pull_tag }}
:sync: {{ data.docker.pull_tag }}
.. list-table::
:header-rows: 1
@@ -49,12 +47,12 @@ accelerate training workloads:
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
{% for component_name, component_version in data.docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
{% endfor %}
.. _amd-megatron-lm-model-support:
.. _amd-megatron-lm-model-support-v2510:
Supported models
================
@@ -99,7 +97,7 @@ accelerate training workloads:
Some models, such as Llama, require an external license agreement through
a third party (for example, Meta).
.. _amd-megatron-lm-performance-measurements:
.. _amd-megatron-lm-performance-measurements-v2510:
Performance measurements
========================
@@ -131,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
.. _mi300x-amd-megatron-lm-training:
.. _mi300x-amd-megatron-lm-training-v2510:
Environment setup
=================
@@ -140,52 +138,38 @@ Use the following instructions to set up the environment, configure the script t
reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
image.
.. _amd-megatron-lm-requirements:
.. _amd-megatron-lm-requirements-v2510:
Download the Docker image
-------------------------
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
{% set dockers = data.dockers %}
{% set docker = data.docker %}
1. Use the following command to pull the Docker image from Docker Hub.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
docker pull {{ docker.pull_tag }}
2. Launch the Docker container.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--device /dev/infiniband \
--network host --ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 128G \
--name megatron_training_env \
{{ docker.pull_tag }}
{% endfor %}
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--device /dev/infiniband \
--network host --ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 128G \
--name megatron_training_env \
{{ docker.pull_tag }}
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
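For reference, these are typically the same start/exec commands used for the other containers in this guide:
.. code-block:: shell
docker start megatron_training_env
docker exec -it megatron_training_env bash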
@@ -206,7 +190,7 @@ Download the Docker image
The Docker container hosts a verified commit of
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.
.. _amd-megatron-lm-environment-setup:
.. _amd-megatron-lm-environment-setup-v2510:
Configuration
=============
@@ -216,39 +200,39 @@ Configuration
Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
directory of
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
directory of
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
directory of
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
directory of
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
directory of
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
.. note::
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars>` for more information on configuration options.
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v2510>` for more information on configuration options.
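As an illustration, options can be prefixed to the script invocation for a one-off run instead of editing the script. This sketch uses two options that appear in the single-node examples later in this guide:
.. code-block:: shell
# Override selected options for a single run without editing train_llama3.sh
RECOMPUTE_ACTIVATIONS=full \
CKPT_FORMAT=torch_dist \
bash examples/llama/train_llama3.sh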
Multi-node configuration
------------------------
@@ -256,7 +240,7 @@ Multi-node configuration
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.
.. _amd-megatron-lm-tokenizer:
.. _amd-megatron-lm-tokenizer-v2510:
Tokenizer
---------
@@ -393,7 +377,7 @@ Download the dataset
``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
Remember to either pre-download the tokenizer or set up Hugging Face access
so it can be fetched when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer>` section.
so it can be fetched when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v2510>` section.
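As a sketch, one way to pre-download a tokenizer inside the container is with the ``huggingface-cli`` utility from ``huggingface_hub``, assuming it is available in the image. The repo ID below is an example -- use whatever you set as ``TOKENIZER_MODEL``, and export your ``HF_TOKEN`` first if the repo is gated.
.. code-block:: shell
# Fetch only the tokenizer files for the model
huggingface-cli download meta-llama/Llama-3.1-8B --include "tokenizer*"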
.. note::
@@ -495,15 +479,38 @@ Download the dataset
Ensure that the files are accessible inside the Docker container.
.. _amd-megatron-lm-run-training:
.. _amd-megatron-lm-run-training-v2510:
Run training
============
Use the following example commands to set up the environment, configure
:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
:ref:`key options <amd-megatron-lm-benchmark-test-vars-v2510>`, and run training on
MI300X Series GPUs with the AMD Megatron-LM environment.
Before starting training, export the following environment variables.
.. tab-set::
.. tab-item:: MI355X and MI350X
.. code-block:: shell
export HSA_NO_SCRATCH_RECLAIM=1
export NVTE_CK_USES_BWD_V3=1
.. tab-item:: MI325X and MI300X
.. code-block:: shell
export HSA_NO_SCRATCH_RECLAIM=1
export NVTE_CK_USES_BWD_V3=1
# Set this on MI325X/MI300X only
export NVTE_CK_IS_V3_ATOMIC_FP32=1
Single node training
--------------------
@@ -913,7 +920,7 @@ Single node training
RECOMPUTE_ACTIVATIONS=full \
CKPT_FORMAT=torch_dist
.. _amd-megatron-lm-multi-node-examples:
.. _amd-megatron-lm-multi-node-examples-v2510:
Multi-node training examples
----------------------------
@@ -964,7 +971,7 @@ training on 16 nodes, try the following command:
sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh
.. _amd-megatron-lm-benchmark-test-vars:
.. _amd-megatron-lm-benchmark-test-vars-v2510:
Key options
-----------
@@ -1029,11 +1036,6 @@ The benchmark tests support the following sets of variables.
``RECOMPUTE_NUM_LAYERS``
Number of layers used for checkpointing recompute.
Known issues
============
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
Previous versions
=================

View File

@@ -16,14 +16,23 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
- Components
- Resources
* - v25.9 (latest)
* - v25.10 (latest)
-
* ROCm 7.1.0
* PyTorch 2.10.0.dev20251112+rocm7.1
-
* :doc:`Primus Megatron documentation <../primus-megatron>`
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
* - v25.9
-
* ROCm 7.0.0
* Primus 0.3.0
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-
* :doc:`Primus Megatron documentation <../primus-megatron>`
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
* :doc:`Primus Megatron documentation <primus-megatron-v25.9>`
* :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.9>`
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__

View File

@@ -0,0 +1,574 @@
:orphan:
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
****************************************
Training a model with Primus and PyTorch
****************************************
.. caution::
This documentation does not reflect the latest version of ROCm Primus PyTorch training
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
LLM training framework designed to streamline LLM training on AMD Instinct
GPUs using a modular, reproducible configuration paradigm.
Primus now supports the PyTorch torchtitan backend.
.. note::
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
Primus with the PyTorch torchtitan backend is designed to replace the
:doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
:doc:`../pytorch-training` for steps to run workloads without Primus.
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
MI300X GPUs containing essential components for Primus and PyTorch training
with Primus Turbo optimizations.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
{% set dockers = data.dockers %}
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. list-table::
:header-rows: 1
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
{% endfor %}
.. _amd-primus-pytorch-model-support-v259:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs.
Some instructions, commands, and training recommendations in this documentation might
vary by model -- select one to get started.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
{% set model_groups = data.model_groups %}
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. seealso::
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
see :doc:`../pytorch-training` (training without Primus).
.. _amd-primus-pytorch-performance-measurements-v259:
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't test configurations and run conditions outside those described.
Pull the Docker image
=====================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
{% set dockers = data.dockers %}
Use the following command to pull the Docker image from Docker Hub.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
Run training
============
Once the setup is complete, choose from the following workflows to start benchmarking training.
For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
tweak some configurations (such as batch sizes).
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
{% set dockers = data.dockers %}
{% set model_groups = data.model_groups %}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
The following run command is tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one node with the {{ model.precision }} data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
model are collected in ``~/MAD/perf.csv``.
.. note::
Currently, Primus torchtitan models are run with Primus Turbo
enabled for enhanced performance. To disable Primus Turbo,
modify the respective configuration file
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
{% endfor %}
{% endfor %}
.. tab-item:: Primus benchmarking
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
The following run commands are tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
.. rubric:: Download the Docker image and required packages
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
2. Run the Docker container.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
.. rubric:: Prepare training datasets and dependencies
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
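Optionally, you can confirm the token is valid before launching a run, assuming the ``huggingface-cli`` utility from ``huggingface_hub`` is available in the container:
.. code-block:: shell
# Prints the account associated with the active Hugging Face token
huggingface-cli whoami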
.. rubric:: Pretraining
To get started, navigate to the ``Primus`` directory in your container.
.. code-block:: shell
cd /workspace/Primus
Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
included with Primus with the appropriate options.
.. rubric:: Benchmarking examples
.. container:: model-doc primus_pyt_train_llama-3.1-8b
Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.
.. tab-set::
.. tab-item:: MI355X and MI350X
:sync: MI355X and MI350X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 5
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 6
.. tab-item:: MI300X
:sync: MI325X and MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 4
To train Llama 3.1 8B with FP8 precision, use the following command.
.. tab-set::
.. tab-item:: MI355X and MI350X
:sync: MI355X and MI350X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 8
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 7
.. tab-item:: MI300X
:sync: MI325X and MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 5
.. container:: model-doc primus_pyt_train_llama-3.1-70b
Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.
.. tab-set::
.. tab-item:: MI355X and MI350X
:sync: MI355X and MI350X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 8
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 6
.. tab-item:: MI300X
:sync: MI325X and MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 4
To train Llama 3.1 70B with FP8 precision, use the following command.
.. tab-set::
.. tab-item:: MI355X and MI350X
:sync: MI355X and MI350X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 6
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 5
.. tab-item:: MI300X
:sync: MI325X and MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 3
{% endfor %}
{% endfor %}
.. tab-item:: Standalone torchtitan benchmarking
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
The following run commands are tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
.. rubric:: Download the Docker image and required packages
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
2. Run the Docker container.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
3. Navigate to the ``torchtitan`` workspace directory.
.. code-block:: shell
cd /workspace/torchtitan
.. rubric:: Download the tokenizer
1. The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
2. Download the tokenizer for your model.
.. container:: model-doc {{ model.mad_tag }}
.. code-block:: shell
python3 scripts/download_tokenizer.py \
--repo_id {{ model.model_repo }} \
--tokenizer_path "original" \
--hf_token=${HF_TOKEN}
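For example, with Llama 3.1 8B (``model_repo: meta-llama/Llama-3.1-8B``), the command expands to:
.. code-block:: shell
python3 scripts/download_tokenizer.py \
--repo_id meta-llama/Llama-3.1-8B \
--tokenizer_path "original" \
--hf_token=${HF_TOKEN}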
.. rubric:: Pretraining examples
Run the training script with the appropriate configuration file.
To train with BF16 precision, use the following command:
.. container:: model-doc {{ model.mad_tag }}
.. code-block:: shell
CONFIG_FILE={{ model.config_file.bf16 }} \
./run_train.sh
To train with FP8 precision, use the following command:
.. container:: model-doc {{ model.mad_tag }}
.. code-block:: shell
CONFIG_FILE={{ model.config_file.fp8 }} \
./run_train.sh
{% endfor %}
{% endfor %}
Known issues
============
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
Further reading
===============
- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.

View File

@@ -16,14 +16,23 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
- Components
- Resources
* - v25.9 (latest)
* - v25.10 (latest)
-
* ROCm 7.1.0
* PyTorch 2.10.0.dev20251112+rocm7.1
-
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
* - v25.9
-
* ROCm 7.0.0
* Primus 0.3.0
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.9>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.9>`
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__

View File

@@ -0,0 +1,667 @@
:orphan:
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
**************************************
Training a model with PyTorch on ROCm
**************************************
.. caution::
This documentation does not reflect the latest version of ROCm PyTorch training
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
.. note::
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
See :doc:`../primus-pytorch` for details.
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The PyTorch for ROCm training Docker image provides a prebuilt optimized
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
and MI300X GPUs. It includes the following software components to accelerate
training workloads:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
{% set dockers = data.dockers %}
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. list-table::
:header-rows: 1
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
{% endfor %}
.. _amd-pytorch-training-model-support-v259:
Supported models
================
The following models are pre-optimized for performance on the AMD Instinct
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
training recommendations in this documentation might vary by model -- select
one to get started.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
{% set model_groups = data.model_groups %}
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _amd-pytorch-training-supported-training-modes-v259:
The following table lists supported training modes per model.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
{% set model_groups = data.model_groups %}
.. dropdown:: Supported training modes
.. list-table::
:header-rows: 1
* - Model
- Supported training modes
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if model.training_modes %}
* - {{ model.model }}
- ``{{ model.training_modes | join('``, ``') }}``
{% endif %}
{% endfor %}
{% endfor %}
.. note::
Some model and fine-tuning combinations are not listed. This is
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
doesn't provide default YAML configurations for them.
For advanced usage, you can create a custom configuration to enable
unlisted fine-tuning methods by using an existing file in the
``/workspace/torchtune/recipes/configs`` directory as a template.
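A minimal sketch of that workflow with torchtune's ``tune`` CLI follows; the recipe and config names are illustrative, so run ``tune ls`` to list the ones shipped in your container.
.. code-block:: shell
# Copy a built-in config out as an editable template
tune cp llama3_1/8B_lora_single_device my_custom_config.yaml
# Launch the recipe with the customized config
tune run lora_finetune_single_device --config my_custom_config.yaml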
.. _amd-pytorch-training-performance-measurements-v259:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X GPUs or ROCm software.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't test configurations and run conditions outside those described.
Run training
============
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
{% set dockers = data.dockers %}
{% set model_groups = data.model_groups %}
Once the setup is complete, choose between two options to start benchmarking training:
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
The following run command is tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one node with the {{ model.precision }} data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800
MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
model are collected in ``~/MAD/perf.csv``.
{% endfor %}
{% endfor %}
.. tab-item:: Standalone benchmarking
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
The following commands are tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
{% endfor %}
{% endfor %}
.. rubric:: Download the Docker image and required packages
1. Use the following command to pull the Docker image from Docker Hub.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
2. Launch the Docker container.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}
Use these commands if you exit the ``training_env`` container and need to return to it.
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
.. rubric:: Prepare training datasets and dependencies
1. The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
2. Run the setup script to install libraries and datasets needed for benchmarking.
.. code-block:: shell
./pytorch_benchmark_setup.sh
.. container:: model-doc pyt_train_llama-3.1-8b
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
.. container:: model-doc pyt_train_llama-3.1-70b
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`__
* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`__
* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`__
* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`__
* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`__
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
.. container:: model-doc pyt_train_flux
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
.. list-table::
:header-rows: 1
* - Library
- Reference
* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
{% for model_group in model_groups %}
{% for model in model_group.models %}
{% set training_modes = model.training_modes %}
{% set training_mode_descs = {
"pretrain": "Benchmark pre-training.",
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
} %}
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
{% if available_modes %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Pre-training
To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length
{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}
.. note::
Currently, FLUX models are not supported out of the box in this Docker image.
To use FLUX, refer to the ``rocm/pytorch-training`` Docker image: :doc:`previous-versions/pytorch-training-v25.6`.
Occasionally, downloading the FLUX dataset might fail. If this happens,
manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark`` so that the test script can access
the required dataset.
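As a minimal sketch, one way to perform that manual download is with the Hugging Face CLI, assuming it is installed and your ``HF_TOKEN`` grants access to the gated repo:
.. code-block:: shell
# Download the gated FLUX.1-dev repository into the expected benchmark path
huggingface-cli download black-forest-labs/FLUX.1-dev --local-dir /workspace/FluxBenchmark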
{% endif %}
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}
* - ``$datatype``
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
- Only Llama 3.1 8B supports FP8 precision.
* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
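For reference, a fully resolved pre-training invocation (``pretrain`` mode, BF16, default sequence length) would look like the following sketch, where ``<model_repo>`` is a placeholder for the model name substituted in the command above:
.. code-block:: shell
# Example: pre-training benchmark with BF16 at the default 8192-token sequence length
./pytorch_benchmark_report.sh -t pretrain -m <model_repo> -p BF16 -s 8192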
{% endif %}
{% set training_modes = model.training_modes %}
{% set training_mode_descs = {
"posttrain": "Benchmark post-training.",
} %}
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
{% if available_modes %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Post-training
To start the post-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}
* - ``$datatype``
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
- Only Llama 3.1 8B supports FP8 precision.
* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
{% endif %}
{% set training_mode_descs = {
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
} %}
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
{% if available_modes %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Fine-tuning
To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
.. code-block:: shell
./pytorch_benchmark_report.sh -t $training_mode \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}
* - ``$datatype``
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
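For reference, a resolved LoRA fine-tuning invocation might look like the following sketch; ``<model_repo>`` is a placeholder, and ``finetune_lora`` must be one of the modes listed above for your model:
.. code-block:: shell
# Example: LoRA fine-tuning benchmark with BF16 at a 4096-token sequence length
./pytorch_benchmark_report.sh -t finetune_lora -m <model_repo> -p BF16 -s 4096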
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
.. note::
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
use the following torchtune commit for compatibility:
.. code-block:: shell
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
.. note::
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
input tensor should be smaller than max_seq_len (4096)``.
This error indicates that an input sequence is longer than the model's maximum context window.
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
tokens in this case). You can resolve this by truncating the input or splitting
it into smaller chunks before passing it to the model.
Note on reproducibility: The results in this guide are based on
commit ``b4c98ac`` from the upstream
`<https://github.com/pytorch/torchtune>`__ repository. For the
latest updates, you can use the main branch.
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
.. rubric:: Benchmarking examples
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
.. _amd-pytorch-training-multinode-examples-v259:
Multi-node training
-------------------
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
Pre-training
~~~~~~~~~~~~
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
.. code-block:: shell
# In the MAD repository
cd scripts/pytorch_train
sbatch run_slurm_train.sh
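After submitting the job, you can monitor it with standard Slurm commands:
.. code-block:: shell
squeue -u $USER            # check the job's queue state
scontrol show job <job_id> # inspect the allocated nodes and parameters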
Fine-tuning
~~~~~~~~~~~
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
.. code-block:: shell
huggingface-cli login # Get access to HF Llama model space
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
# In the MAD repository
cd scripts/pytorch_train
sbatch Torchtune_Multinode.sh
.. note::
Information regarding benchmark setup:
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
* You can adjust the torchtune `YAML configuration file
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
if you're using a different model.
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
* Set the ``mounting_paths`` inside the SLURM script.
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
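For example, to locate and follow the most recent log (file names vary by run, so treat this as a sketch):
.. code-block:: shell
ls -lt result_torchtune/              # newest files first
tail -f result_torchtune/<log_file>   # follow a log as the job runs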
Known issues
============
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
Further reading
===============
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
View File
@@ -31,12 +31,10 @@ Megatron-LM.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
{% set dockers = data.dockers %}
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. tab-item:: {{ data.docker.pull_tag }}
:sync: {{ data.docker.pull_tag }}
.. list-table::
:header-rows: 1
@@ -44,13 +42,12 @@ Megatron-LM.
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
{% for component_name, component_version in data.docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
{% endfor %}
.. _amd-primus-megatron-lm-model-support-v259:
.. _amd-primus-megatron-lm-model-support-v2510:
Supported models
================
@@ -111,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
.. _mi300x-amd-primus-megatron-lm-training-v259:
.. _mi300x-amd-primus-megatron-lm-training-v2510:
Environment setup
=================
@@ -121,63 +118,49 @@ Environment setup
Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on AMD Instinct GPUs.
.. _amd-primus-megatron-lm-requirements-v259:
.. _amd-primus-megatron-lm-requirements-v2510:
Pull the Docker image
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
{% set dockers = data.dockers %}
{% set docker = data.docker %}
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
docker pull {{ docker.pull_tag }}
2. Launch the Docker container.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--device /dev/infiniband \
--network host --ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
--shm-size 128G \
--name primus_training_env \
{{ docker.pull_tag }}
.. code-block:: shell
Use these commands if you exit the ``primus_training_env`` container and need to return to it.
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--device /dev/infiniband \
--network host --ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
--shm-size 128G \
--name primus_training_env \
{{ docker.pull_tag }}
{% endfor %}
.. code-block:: shell
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
docker start primus_training_env
docker exec -it primus_training_env bash
.. code-block:: shell
The Docker container hosts verified branch ``release/v25.10`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/release/v25.10>`__ repository.
docker start primus_training_env
docker exec -it primus_training_env bash
The Docker container hosts verified commit ``e16b27b`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/e16b27b>`__ repository.
.. _amd-primus-megatron-lm-environment-setup-v259:
.. _amd-primus-megatron-lm-environment-setup-v2510:
Configuration
=============
@@ -224,7 +207,7 @@ You can use either mock data or real data for training.
Ensure that the files are accessible inside the Docker container.
.. _amd-primus-megatron-lm-tokenizer-v259:
.. _amd-primus-megatron-lm-tokenizer-v2510:
Tokenizer
---------
@@ -245,7 +228,7 @@ right permissions to access the tokenizer for each model.
<https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
definition.
.. _amd-primus-megatron-lm-run-training-v259:
.. _amd-primus-megatron-lm-run-training-v2510:
Run training
============
@@ -269,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 3.3 70B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run pre-training for Llama 3.3 70B BF16, run:
@@ -280,7 +263,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/llama3.3_70B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 6 \
@@ -291,7 +274,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/llama3.3_70B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 2 \
@@ -301,7 +289,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 3.1 8B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run pre-training for Llama 3.1 8B FP8, run:
@@ -312,7 +300,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--fp8 hybrid \
@@ -324,7 +312,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--fp8 hybrid
@@ -338,7 +331,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 4 \
@@ -349,7 +342,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50
@@ -357,7 +355,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 3.1 70B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run pre-training for Llama 3.1 70B BF16, run:
@@ -368,7 +366,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 4 \
@@ -379,7 +377,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50
@@ -398,7 +401,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--fp8 hybrid \
@@ -411,7 +414,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--num_layers 40 \
@@ -422,7 +430,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 2 7B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run pre-training for Llama 2 7B FP8, run:
@@ -433,7 +441,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--fp8 hybrid \
@@ -445,7 +453,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--fp8 hybrid
@@ -459,7 +472,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 10 \
@@ -470,7 +483,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50
@@ -478,7 +496,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 2 70B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run pre-training for Llama 2 70B BF16, run:
@@ -489,7 +507,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/llama2_70B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 17 \
@@ -500,7 +518,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/llama2_70B-pretrain.yaml \
bash ./examples/run_pretrain.sh \
--train_iters 50
@@ -508,7 +531,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to DeepSeek-V3.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
use the following command:
@@ -520,7 +543,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/deepseek_v3-pretrain.yaml \
bash examples/run_pretrain.sh \
--num_layers 3 \
--moe_layer_freq 1 \
@@ -533,17 +556,24 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/deepseek_v3-pretrain.yaml \
bash examples/run_pretrain.sh \
--num_layers 3 \
--moe_layer_freq 1 \
--micro_batch_size 3 \
--global_batch_size 192 \
--train_iters 50
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
Once setup is complete, run the appropriate training command.
The following run commands are tailored to DeepSeek-V2-Lite.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
use the following command:
@@ -555,7 +585,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 12 \
@@ -566,7 +596,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--global_batch_size 256
@@ -575,7 +610,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Mixtral 8x7B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
use the following command:
@@ -587,7 +622,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 4 \
@@ -598,7 +633,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50
@@ -606,7 +646,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Mixtral 8x22B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
use the following command:
@@ -618,7 +658,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--num_layers 4 \
@@ -631,7 +671,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--num_layers 4 \
@@ -643,7 +688,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Qwen 2.5 7B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run training on a single node for Qwen 2.5 7B BF16, use the following
command:
@@ -655,7 +700,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 16 \
@@ -666,7 +711,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50
@@ -679,7 +729,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--fp8 hybrid
@@ -691,7 +741,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--fp8 hybrid
@@ -700,7 +755,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Qwen 2.5 72B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
@@ -711,7 +766,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50 \
--micro_batch_size 16 \
@@ -722,11 +777,16 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
.. code-block:: shell
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
EXP=examples/megatron/configs/MI300X/qwen2.5_72B-pretrain.yaml \
bash examples/run_pretrain.sh \
--train_iters 50
.. _amd-primus-megatron-multi-node-examples-v259:
.. _amd-primus-megatron-multi-node-examples-v2510:
Multi-node training examples
----------------------------
@@ -740,28 +800,27 @@ to launch the multi-node workload. Use the following steps to setup your environ
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
{% set dockers = data.dockers %}
.. tab-set::
{% set docker = data.docker %}
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
cd Primus
git checkout release/v25.10
git submodule update --init --recursive
.. code-block:: shell
export DOCKER_IMAGE={{ docker.pull_tag }}
export HF_TOKEN=<your_HF_token>
export HSA_NO_SCRATCH_RECLAIM=1
export NVTE_CK_USES_BWD_V3=1
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
cd Primus
git checkout e16b27b
export DOCKER_IMAGE={{ docker.pull_tag }}
export HF_TOKEN=<your_HF_token>
export HSA_NO_SCRATCH_RECLAIM=1
export NVTE_CK_USES_BWD_V3=1
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
{% endfor %}
# Set the variables for better performance
# only on MI325X and MI300X
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
export NVTE_CK_IS_V3_ATOMIC_FP32=1
.. note::
@@ -769,13 +828,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
* If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, since NICs can vary across clusters, you should explicitly export the NCCL parameters for your cluster.
* To find your network interface, you can use ``ip a``.
* To find RDMA interfaces, you can use ``ibv_devices`` to list all the RDMA/IB devices (see the example after this list).
* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v259`) as appropriate.
* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v2510`) as appropriate.
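Putting this together, a typical discovery-and-export sequence looks like the following sketch; the interface and device names are hypothetical and must be replaced with the values your cluster reports:
.. code-block:: shell
ip a          # list network interfaces
ibv_devices   # list RDMA/IB devices
export NCCL_IB_HCA=mlx5_0,mlx5_1   # hypothetical RDMA devices reported by ibv_devices
export NCCL_SOCKET_IFNAME=ens51f0  # hypothetical interface reported by ip a
export GLOO_SOCKET_IFNAME=ens51f0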
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 3.1 8B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To train Llama 3.1 8B FP8 on 8 nodes, run:
@@ -793,7 +852,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 2 7B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To train Llama 2 7B FP8 on 8 nodes, run:
@@ -811,7 +870,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 3.1 70B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To train Llama 3.1 70B FP8 on 8 nodes, run:
@@ -843,7 +902,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 2 70B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To train Llama 2 70B FP8 on 8 nodes, run:
@@ -875,7 +934,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Llama 3.3 70B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To train Llama 3.3 70B FP8 on 8 nodes, run:
@@ -907,7 +966,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Mixtral 8x7B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To train Mixtral 8x7B BF16 on 8 nodes, run:
@@ -925,7 +984,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
Once setup is complete, run the appropriate training command.
The following run commands are tailored to Qwen 2.5 72B.
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
To train Qwen2.5 72B FP8 on 8 nodes, run:
@@ -942,7 +1001,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
--no_fp8_weight_transpose_cache true \
--fp8 hybrid
.. _amd-primus-megatron-lm-benchmark-test-vars-v259:
.. _amd-primus-megatron-lm-benchmark-test-vars-v2510:
Key options
-----------
@@ -987,7 +1046,10 @@ num_layers
Known issues
============
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
The DeepSeek-V3 proxy model and the Mixtral 8x22B proxy model might exit with an
error due to a memory free issue. However, this does not impact training runs. All
iterations, in this case 50, should have completed before the exit, and
the results are still reported at the end.
Further reading
===============
View File
@@ -29,12 +29,10 @@ with Primus Turbo optimizations.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
{% set dockers = data.dockers %}
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. tab-item:: {{ data.docker.pull_tag }}
:sync: {{ data.docker.pull_tag }}
.. list-table::
:header-rows: 1
@@ -42,13 +40,12 @@ with Primus Turbo optimizations.
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
{% for component_name, component_version in data.docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
{% endfor %}
.. _amd-primus-pytorch-model-support-v259:
.. _amd-primus-pytorch-model-support-v2510:
Supported models
================
@@ -67,7 +64,7 @@ vary by model -- select one to get started.
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
@@ -94,7 +91,7 @@ vary by model -- select one to get started.
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
see the documentation :doc:`pytorch-training` (without Primus)
.. _amd-primus-pytorch-performance-measurements-v259:
.. _amd-primus-pytorch-performance-measurements-v2510:
System validation
=================
@@ -120,20 +117,11 @@ Pull the Docker image
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
{% set dockers = data.dockers %}
Use the following command to pull the Docker image from Docker Hub.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
docker pull {{ data.docker.pull_tag }}
Run training
============
@@ -145,7 +133,7 @@ tweak some configurations (such as batch sizes).
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
{% set dockers = data.dockers %}
{% set docker = data.docker %}
{% set model_groups = data.model_groups %}
.. tab-set::
@@ -158,7 +146,7 @@ tweak some configurations (such as batch sizes).
.. container:: model-doc {{ model.mad_tag }}
The following run command is tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
@@ -185,13 +173,6 @@ tweak some configurations (such as batch sizes).
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
model are collected in ``~/MAD/perf.csv``.
.. note::
Currently, Primus torchtitan models are run with Primus Turbo
enabled for enhanced performance. To disable Primus Turbo,
modify the respective configuration file
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
{% endfor %}
{% endfor %}
@@ -203,48 +184,34 @@ tweak some configurations (such as batch sizes).
.. container:: model-doc {{ model.mad_tag }}
The following run commands are tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
.. rubric:: Download the Docker image and required packages
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
docker pull {{ docker.pull_tag }}
2. Run the Docker container.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
Use these commands if you exit the ``training_env`` container and need to return to it.
@@ -283,37 +250,28 @@ tweak some configurations (such as batch sizes).
.. tab-set::
.. tab-item:: MI355X and MI350X
:sync: MI355X and MI300X
:sync: MI355X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 5
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 6
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 6
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 6
.. tab-item:: MI300X
:sync: MI325X and MI300X
:sync: MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 4
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 4
To train Llama 3.1 8B with FP8 precision, use the following command.
@@ -321,37 +279,28 @@ tweak some configurations (such as batch sizes).
.. tab-set::
.. tab-item:: MI355X and MI350X
:sync: MI355X and MI300X
:sync: MI355X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 8
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 8
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 7
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 7
.. tab-item:: MI300X
:sync: MI325X and MI300X
:sync: MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 5
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 5
.. container:: model-doc primus_pyt_train_llama-3.1-70b
@@ -364,36 +313,57 @@ tweak some configurations (such as batch sizes).
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 8
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 8
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 6
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 6
.. tab-item:: MI300X
:sync: MI325X and MI300X
:sync: MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 4
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 4
To train Llama 3.1 70B with FP8 precision, use the following command.
.. tab-set::
.. tab-item:: MI355X and MI350X
:sync: MI355X
.. code-block:: shell
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 6
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 5
.. tab-item:: MI300X
:sync: MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 3
.. container:: model-doc primus_pyt_train_deepseek-v2
Use the following command to train DeepSeek V2 16B with BF16 precision using Primus torchtitan.
.. tab-set::
.. tab-item:: MI355X and MI350X
@@ -401,151 +371,55 @@ tweak some configurations (such as batch sizes).
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 6
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 16
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 5
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 10
.. tab-item:: MI300X
:sync: MI325X and MI300X
:sync: MI300X
.. code-block:: shell
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 3
{% endfor %}
{% endfor %}
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 8
.. tab-item:: Standalone torchtitan benchmarking
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{ model.mad_tag }}
The following run commands are tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
.. rubric:: Download the Docker image and required packages
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
To train DeepSeek V2 16B with FP8 precision, use the following command.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. tab-item:: MI355X and MI350X
:sync: MI355X
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 16
2. Run the Docker container.
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. tab-item:: MI325X
:sync: MI325X
.. code-block:: shell
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 8
Use these commands if you exit the ``training_env`` container and need to return to it.
.. tab-item:: MI300X
:sync: MI300X
.. code-block:: shell
.. code-block:: shell
docker start training_env
docker exec -it training_env bash
3. Navigate to the ``torchtitan`` workspace directory.
.. code-block:: shell
cd /workspace/torchtitan
.. rubric:: Download the tokenizer
1. The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
.. code-block:: shell
export HF_TOKEN=$your_personal_hugging_face_access_token
2. Download the tokenizer for your model.
.. container:: model-doc {{ model.mad_tag }}
.. code-block:: shell
python3 scripts/download_tokenizer.py \
--repo_id {{ model.model_repo }} \
--tokenizer_path "original" \
--hf_token=${HF_TOKEN}
.. rubric:: Pretraining examples
Run the training script with the appropriate configuration file.
To train with BF16 precision, use the following command:
.. container:: model-doc {{ model.mad_tag }}
.. code-block:: shell
CONFIG_FILE={{ model.config_file.bf16 }} \
./run_train.sh
To train with FP8 precision, use the following command:
.. container:: model-doc {{ model.mad_tag }}
.. code-block:: shell
CONFIG_FILE={{ model.config_file.fp8 }} \
./run_train.sh
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.batch_size 8
{% endfor %}
{% endfor %}
Known issues
============
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
Further reading
===============
View File
@@ -27,12 +27,10 @@ training workloads:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
{% set dockers = data.dockers %}
.. tab-set::
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. tab-item:: {{ data.docker.pull_tag }}
:sync: {{ data.docker.pull_tag }}
.. list-table::
:header-rows: 1
@@ -40,13 +38,12 @@ training workloads:
* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
{% for component_name, component_version in data.docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
{% endfor %}
.. _amd-pytorch-training-model-support-v259:
.. _amd-pytorch-training-model-support-v2510:
Supported models
================
@@ -88,7 +85,7 @@ one to get started.
</div>
</div>
.. _amd-pytorch-training-supported-training-modes-v259:
.. _amd-pytorch-training-supported-training-modes-v2510:
The following table lists supported training modes per model.
@@ -123,7 +120,7 @@ The following table lists supported training modes per model.
unlisted fine-tuning methods by using an existing file in the
``/workspace/torchtune/recipes/configs`` directory as a template.
.. _amd-pytorch-training-performance-measurements-v259:
.. _amd-pytorch-training-performance-measurements-v2510:
Performance measurements
========================
@@ -164,7 +161,7 @@ Run training
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
{% set dockers = data.dockers %}
{% set docker = data.docker %}
{% set model_groups = data.model_groups %}
Once the setup is complete, choose between two options to start benchmarking training:
@@ -179,7 +176,7 @@ Run training
.. container:: model-doc {{ model.mad_tag }}
The following run command is tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
@@ -217,7 +214,7 @@ Run training
.. container:: model-doc {{ model.mad_tag }}
The following commands are tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
{% endfor %}
{% endfor %}
@@ -226,42 +223,28 @@ Run training
1. Use the following command to pull the Docker image from Docker Hub.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker pull {{ docker.pull_tag }}
{% endfor %}
docker pull {{ docker.pull_tag }}
2. Launch the Docker container.
.. tab-set::
.. code-block:: shell
{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. code-block:: shell
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
Use these commands if you exit the ``training_env`` container and need to return to it.
@@ -419,11 +402,34 @@ Run training
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Pre-training
.. rubric:: Pretraining
To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
{% if model.mad_tag == "pyt_train_dlrm" %}
1. Go to the DLRM directory.
.. code-block:: shell
cd /workspace/DLRMBenchmark
2. To run the single node training benchmark for DLRM-v2 with TF32 precision,
run the following script.
.. code-block:: shell
./launch_training_single_node.sh
To run with MAD within the Docker container, use the following command.
.. code-block:: shell
./pytorch_benchmark_report.sh -t pretrain -m DLRM
{% else %}
.. code-block:: shell
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
@@ -466,6 +472,7 @@ Run training
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
{% endif %}
{% endif %}
{% set training_modes = model.training_modes %}
@@ -525,7 +532,7 @@ Run training
To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v2510>`.
.. code-block:: shell
@@ -590,7 +597,7 @@ Run training
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
.. _amd-pytorch-training-multinode-examples-v259:
.. _amd-pytorch-training-multinode-examples-v2510:
Multi-node training
-------------------
@@ -639,11 +646,6 @@ To launch the training job on a SLURM cluster for Llama 3.3 70B, run the followi
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
Known issues
============
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
Further reading
===============