From 8647ebcf769dce5dc670a2362ef2e8f12dadf166 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Thu, 4 Dec 2025 09:08:00 -0500 Subject: [PATCH] Update training Docker docs for Primus 25.10 (#5737) (cherry picked from commit fb644412d598fc1ee26977541205c471515b30a4) --- .../megatron-lm-benchmark-models.yaml | 32 +- .../megatron-lm-v25.9-benchmark-models.yaml | 53 + ...rimus-megatron-v25.9-benchmark-models.yaml | 65 + ...primus-pytorch-v25.9-benchmark-models.yaml | 39 + ...torch-training-v25.9-benchmark-models.yaml | 186 +++ .../primus-megatron-benchmark-models.yaml | 31 +- .../primus-pytorch-benchmark-models.yaml | 63 +- .../pytorch-training-benchmark-models.yaml | 53 +- .../training/benchmark-docker/megatron-lm.rst | 122 +- .../previous-versions/megatron-lm-history.rst | 15 +- .../previous-versions/megatron-lm-v25.9.rst | 1044 +++++++++++++++++ .../primus-megatron-v25.9.rst | 1019 ++++++++++++++++ .../primus-pytorch-v25.9.rst | 574 +++++++++ .../pytorch-training-history.rst | 15 +- .../pytorch-training-v25.9.rst | 667 +++++++++++ .../benchmark-docker/primus-megatron.rst | 304 +++-- .../benchmark-docker/primus-pytorch.rst | 332 ++---- .../benchmark-docker/pytorch-training.rst | 104 +- 18 files changed, 4158 insertions(+), 560 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9.rst create mode 100644 
docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9.rst diff --git a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml index 1bf411207..8cb0fd12e 100644 --- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml @@ -1,21 +1,17 @@ -dockers: - MI355X and MI350X: - pull_tag: rocm/megatron-lm:v25.9_gfx950 - docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 - components: &docker_components - ROCm: 7.0.0 - Primus: aab4234 - PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 - Python: "3.10" - Transformer Engine: 2.2.0.dev0+54dd2bdc - Flash Attention: 2.8.3 - hipBLASLt: 911283acd1 - Triton: 3.4.0+rocm7.0.0.git56765e8c - RCCL: 2.26.6 - MI325X and MI300X: - pull_tag: rocm/megatron-lm:v25.9_gfx942 - docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357 - components: *docker_components +docker: + pull_tag: rocm/primus:v25.10 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 + components: + ROCm: 7.1.0 + Primus: 0.3.0 + Primus Turbo: 0.1.1 + PyTorch: 2.10.0.dev20251112+rocm7.1 + Python: "3.10" + Transformer Engine: 2.4.0.dev0+32e2d1d4 + Flash Attention: 2.8.3 + hipBLASLt: 1.2.0-09ab7153e2 + Triton: 3.4.0 + RCCL: 2.27.7 model_groups: - group: Meta Llama tag: llama diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml 
b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml new file mode 100644 index 000000000..1bf411207 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml @@ -0,0 +1,53 @@ +dockers: + MI355X and MI350X: + pull_tag: rocm/megatron-lm:v25.9_gfx950 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 + components: &docker_components + ROCm: 7.0.0 + Primus: aab4234 + PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 + Python: "3.10" + Transformer Engine: 2.2.0.dev0+54dd2bdc + Flash Attention: 2.8.3 + hipBLASLt: 911283acd1 + Triton: 3.4.0+rocm7.0.0.git56765e8c + RCCL: 2.26.6 + MI325X and MI300X: + pull_tag: rocm/megatron-lm:v25.9_gfx942 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357 + components: *docker_components +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: pyt_megatron_lm_train_llama-3.3-70b + - model: Llama 3.1 8B + mad_tag: pyt_megatron_lm_train_llama-3.1-8b + - model: Llama 3.1 70B + mad_tag: pyt_megatron_lm_train_llama-3.1-70b + - model: Llama 2 7B + mad_tag: pyt_megatron_lm_train_llama-2-7b + - model: Llama 2 70B + mad_tag: pyt_megatron_lm_train_llama-2-70b + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) + mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy + - model: DeepSeek-V2-Lite + mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: pyt_megatron_lm_train_mixtral-8x7b + - model: Mixtral 8x22B (proxy) + mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: pyt_megatron_lm_train_qwen2.5-7b + - model: Qwen 2.5 72B + 
mad_tag: pyt_megatron_lm_train_qwen2.5-72b diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml new file mode 100644 index 000000000..386538cf1 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml @@ -0,0 +1,65 @@ +dockers: + MI355X and MI350X: + pull_tag: rocm/primus:v25.9_gfx950 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 + components: &docker_components + ROCm: 7.0.0 + Primus: 0.3.0 + Primus Turbo: 0.1.1 + PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 + Python: "3.10" + Transformer Engine: 2.2.0.dev0+54dd2bdc + Flash Attention: 2.8.3 + hipBLASLt: 911283acd1 + Triton: 3.4.0+rocm7.0.0.git56765e8c + RCCL: 2.26.6 + MI325X and MI300X: + pull_tag: rocm/primus:v25.9_gfx942 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357 + components: *docker_components +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b + config_name: llama3.3_70B-pretrain.yaml + - model: Llama 3.1 70B + mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b + config_name: llama3.1_70B-pretrain.yaml + - model: Llama 3.1 8B + mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b + config_name: llama3.1_8B-pretrain.yaml + - model: Llama 2 7B + mad_tag: primus_pyt_megatron_lm_train_llama-2-7b + config_name: llama2_7B-pretrain.yaml + - model: Llama 2 70B + mad_tag: primus_pyt_megatron_lm_train_llama-2-70b + config_name: llama2_70B-pretrain.yaml + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) + mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy + 
config_name: deepseek_v3-pretrain.yaml + - model: DeepSeek-V2-Lite + mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b + config_name: deepseek_v2_lite-pretrain.yaml + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b + config_name: mixtral_8x7B_v0.1-pretrain.yaml + - model: Mixtral 8x22B (proxy) + mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy + config_name: mixtral_8x22B_v0.1-pretrain.yaml + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b + config_name: primus_qwen2.5_7B-pretrain.yaml + - model: Qwen 2.5 72B + mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b + config_name: qwen2.5_72B-pretrain.yaml diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml new file mode 100644 index 000000000..4a4c57a12 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml @@ -0,0 +1,39 @@ +dockers: + MI355X and MI350X: + pull_tag: rocm/primus:v25.9_gfx950 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 + components: &docker_components + ROCm: 7.0.0 + Primus: 0.3.0 + Primus Turbo: 0.1.1 + PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 + Python: "3.10" + Transformer Engine: 2.2.0.dev0+54dd2bdc + Flash Attention: 2.8.3 + hipBLASLt: 911283acd1 + Triton: 3.4.0+rocm7.0.0.git56765e8c + RCCL: 2.26.6 + MI325X and MI300X: + pull_tag: rocm/primus:v25.9_gfx942 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357 + components: *docker_components +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 
3.1 8B + mad_tag: primus_pyt_train_llama-3.1-8b + model_repo: meta-llama/Llama-3.1-8B + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: BF16 + config_file: + bf16: "./llama3_8b_fsdp_bf16.toml" + fp8: "./llama3_8b_fsdp_fp8.toml" + - model: Llama 3.1 70B + mad_tag: primus_pyt_train_llama-3.1-70b + model_repo: meta-llama/Llama-3.1-70B + url: https://huggingface.co/meta-llama/Llama-3.1-70B + precision: BF16 + config_file: + bf16: "./llama3_70b_fsdp_bf16.toml" + fp8: "./llama3_70b_fsdp_fp8.toml" diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml new file mode 100644 index 000000000..05c77d799 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml @@ -0,0 +1,186 @@ +dockers: + MI355X and MI350X: + pull_tag: rocm/pytorch-training:v25.9_gfx950 + docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 + components: &docker_components + ROCm: 7.0.0 + Primus: aab4234 + PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 + Python: "3.10" + Transformer Engine: 2.2.0.dev0+54dd2bdc + Flash Attention: 2.8.3 + hipBLASLt: 911283acd1 + Triton: 3.4.0+rocm7.0.0.git56765e8c + RCCL: 2.26.6 + MI325X and MI300X: + pull_tag: rocm/pytorch-training:v25.9_gfx942 + docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357 + components: *docker_components +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 4 Scout 17B-16E + mad_tag: pyt_train_llama-4-scout-17b-16e + model_repo: Llama-4-17B_16E + url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - 
model: Llama 3.3 70B + mad_tag: pyt_train_llama-3.3-70b + model_repo: Llama-3.3-70B + url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct + precision: BF16 + training_modes: [finetune_fw, finetune_lora, finetune_qlora] + - model: Llama 3.2 1B + mad_tag: pyt_train_llama-3.2-1b + model_repo: Llama-3.2-1B + url: https://huggingface.co/meta-llama/Llama-3.2-1B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.2 3B + mad_tag: pyt_train_llama-3.2-3b + model_repo: Llama-3.2-3B + url: https://huggingface.co/meta-llama/Llama-3.2-3B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.2 Vision 11B + mad_tag: pyt_train_llama-3.2-vision-11b + model_repo: Llama-3.2-Vision-11B + url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision + precision: BF16 + training_modes: [finetune_fw] + - model: Llama 3.2 Vision 90B + mad_tag: pyt_train_llama-3.2-vision-90b + model_repo: Llama-3.2-Vision-90B + url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision + precision: BF16 + training_modes: [finetune_fw] + - model: Llama 3.1 8B + mad_tag: pyt_train_llama-3.1-8b + model_repo: Llama-3.1-8B + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: BF16 + training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain] + - model: Llama 3.1 70B + mad_tag: pyt_train_llama-3.1-70b + model_repo: Llama-3.1-70B + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct + precision: BF16 + training_modes: [pretrain, finetune_fw, finetune_lora] + - model: Llama 3.1 405B + mad_tag: pyt_train_llama-3.1-405b + model_repo: Llama-3.1-405B + url: https://huggingface.co/meta-llama/Llama-3.1-405B + precision: BF16 + training_modes: [finetune_qlora] + - model: Llama 3 8B + mad_tag: pyt_train_llama-3-8b + model_repo: Llama-3-8B + url: https://huggingface.co/meta-llama/Meta-Llama-3-8B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3 70B + mad_tag: pyt_train_llama-3-70b + 
model_repo: Llama-3-70B + url: https://huggingface.co/meta-llama/Meta-Llama-3-70B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 2 7B + mad_tag: pyt_train_llama-2-7b + model_repo: Llama-2-7B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_fw, finetune_lora, finetune_qlora] + - model: Llama 2 13B + mad_tag: pyt_train_llama-2-13b + model_repo: Llama-2-13B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 2 70B + mad_tag: pyt_train_llama-2-70b + model_repo: Llama-2-70B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_lora, finetune_qlora] + - group: OpenAI + tag: openai + models: + - model: GPT OSS 20B + mad_tag: pyt_train_gpt_oss_20b + model_repo: GPT-OSS-20B + url: https://huggingface.co/openai/gpt-oss-20b + precision: BF16 + training_modes: [HF_finetune_lora] + - model: GPT OSS 120B + mad_tag: pyt_train_gpt_oss_120b + model_repo: GPT-OSS-120B + url: https://huggingface.co/openai/gpt-oss-120b + precision: BF16 + training_modes: [HF_finetune_lora] + - group: Qwen + tag: qwen + models: + - model: Qwen 3 8B + mad_tag: pyt_train_qwen3-8b + model_repo: Qwen3-8B + url: https://huggingface.co/Qwen/Qwen3-8B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Qwen 3 32B + mad_tag: pyt_train_qwen3-32b + model_repo: Qwen3-32 + url: https://huggingface.co/Qwen/Qwen3-32B + precision: BF16 + training_modes: [finetune_lora] + - model: Qwen 2.5 32B + mad_tag: pyt_train_qwen2.5-32b + model_repo: Qwen2.5-32B + url: https://huggingface.co/Qwen/Qwen2.5-32B + precision: BF16 + training_modes: [finetune_lora] + - model: Qwen 2.5 72B + mad_tag: pyt_train_qwen2.5-72b + model_repo: Qwen2.5-72B + url: https://huggingface.co/Qwen/Qwen2.5-72B + precision: BF16 + training_modes: 
[finetune_lora] + - model: Qwen 2 1.5B + mad_tag: pyt_train_qwen2-1.5b + model_repo: Qwen2-1.5B + url: https://huggingface.co/Qwen/Qwen2-1.5B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Qwen 2 7B + mad_tag: pyt_train_qwen2-7b + model_repo: Qwen2-7B + url: https://huggingface.co/Qwen/Qwen2-7B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - group: Stable Diffusion + tag: sd + models: + - model: Stable Diffusion XL + mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning + model_repo: SDXL + url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 + precision: BF16 + training_modes: [posttrain-p] + - group: Flux + tag: flux + models: + - model: FLUX.1-dev + mad_tag: pyt_train_flux + model_repo: Flux + url: https://huggingface.co/black-forest-labs/FLUX.1-dev + precision: BF16 + training_modes: [posttrain-p] + - group: NCF + tag: ncf + models: + - model: NCF + mad_tag: pyt_ncf_training + model_repo: + url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF + precision: FP32 diff --git a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml index 386538cf1..bd8dc5356 100644 --- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml @@ -1,22 +1,15 @@ -dockers: - MI355X and MI350X: - pull_tag: rocm/primus:v25.9_gfx950 - docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 - components: &docker_components - ROCm: 7.0.0 - Primus: 0.3.0 - Primus Turbo: 0.1.1 - PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 - Python: "3.10" - Transformer Engine: 2.2.0.dev0+54dd2bdc - Flash Attention: 2.8.3 - hipBLASLt: 911283acd1 - Triton: 3.4.0+rocm7.0.0.git56765e8c - RCCL: 2.26.6 - 
MI325X and MI300X: - pull_tag: rocm/primus:v25.9_gfx942 - docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357 - components: *docker_components +docker: + pull_tag: rocm/primus:v25.10 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 + components: + ROCm: 7.1.0 + PyTorch: 2.10.0.dev20251112+rocm7.1 + Python: "3.10" + Transformer Engine: 2.4.0.dev0+32e2d1d4 + Flash Attention: 2.8.3 + hipBLASLt: 1.2.0-09ab7153e2 + Triton: 3.4.0 + RCCL: 2.27.7 model_groups: - group: Meta Llama tag: llama diff --git a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml index 4a4c57a12..3db8a411b 100644 --- a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml @@ -1,39 +1,32 @@ -dockers: - MI355X and MI350X: - pull_tag: rocm/primus:v25.9_gfx950 - docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 - components: &docker_components - ROCm: 7.0.0 - Primus: 0.3.0 - Primus Turbo: 0.1.1 - PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 - Python: "3.10" - Transformer Engine: 2.2.0.dev0+54dd2bdc - Flash Attention: 2.8.3 - hipBLASLt: 911283acd1 - Triton: 3.4.0+rocm7.0.0.git56765e8c - RCCL: 2.26.6 - MI325X and MI300X: - pull_tag: rocm/primus:v25.9_gfx942 - docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357 - components: *docker_components +docker: + pull_tag: rocm/primus:v25.10 + docker_hub_url: 
https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 + components: + ROCm: 7.1.0 + PyTorch: 2.10.0.dev20251112+rocm7.1 + Python: "3.10" + Transformer Engine: 2.4.0.dev0+32e2d1d4 + Flash Attention: 2.8.3 + hipBLASLt: 1.2.0-09ab7153e2 model_groups: - group: Meta Llama tag: llama models: - - model: Llama 3.1 8B - mad_tag: primus_pyt_train_llama-3.1-8b - model_repo: meta-llama/Llama-3.1-8B - url: https://huggingface.co/meta-llama/Llama-3.1-8B - precision: BF16 - config_file: - bf16: "./llama3_8b_fsdp_bf16.toml" - fp8: "./llama3_8b_fsdp_fp8.toml" - - model: Llama 3.1 70B - mad_tag: primus_pyt_train_llama-3.1-70b - model_repo: meta-llama/Llama-3.1-70B - url: https://huggingface.co/meta-llama/Llama-3.1-70B - precision: BF16 - config_file: - bf16: "./llama3_70b_fsdp_bf16.toml" - fp8: "./llama3_70b_fsdp_fp8.toml" + - model: Llama 3.1 8B + mad_tag: primus_pyt_train_llama-3.1-8b + model_repo: Llama-3.1-8B + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: BF16 + - model: Llama 3.1 70B + mad_tag: primus_pyt_train_llama-3.1-70b + model_repo: Llama-3.1-70B + url: https://huggingface.co/meta-llama/Llama-3.1-70B + precision: BF16 + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek V2 16B + mad_tag: primus_pyt_train_deepseek-v2 + model_repo: DeepSeek-V2 + url: https://huggingface.co/deepseek-ai/DeepSeek-V2 + precision: BF16 diff --git a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml index 05c77d799..b037f5087 100644 --- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml @@ -1,21 +1,15 @@ -dockers: - MI355X and MI350X: - pull_tag: rocm/pytorch-training:v25.9_gfx950 - docker_hub_url: 
https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 - components: &docker_components - ROCm: 7.0.0 - Primus: aab4234 - PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 - Python: "3.10" - Transformer Engine: 2.2.0.dev0+54dd2bdc - Flash Attention: 2.8.3 - hipBLASLt: 911283acd1 - Triton: 3.4.0+rocm7.0.0.git56765e8c - RCCL: 2.26.6 - MI325X and MI300X: - pull_tag: rocm/pytorch-training:v25.9_gfx942 - docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357 - components: *docker_components +docker: + pull_tag: rocm/primus:v25.10 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6 + components: + ROCm: 7.1.0 + Primus: 0.3.0 + Primus Turbo: 0.1.1 + PyTorch: 2.10.0.dev20251112+rocm7.1 + Python: "3.10" + Transformer Engine: 2.4.0.dev0+32e2d1d4 + Flash Attention: 2.8.3 + hipBLASLt: 1.2.0-09ab7153e2 model_groups: - group: Meta Llama tag: llama @@ -119,6 +113,15 @@ model_groups: url: https://huggingface.co/openai/gpt-oss-120b precision: BF16 training_modes: [HF_finetune_lora] + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek V2 16B + mad_tag: primus_pyt_train_deepseek-v2 + model_repo: DeepSeek-V2 + url: https://huggingface.co/deepseek-ai/DeepSeek-V2 + precision: BF16 + training_modes: [pretrain] - group: Qwen tag: qwen models: @@ -166,7 +169,7 @@ model_groups: model_repo: SDXL url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 precision: BF16 - training_modes: [posttrain-p] + training_modes: [posttrain] - group: Flux tag: flux models: @@ -175,12 +178,20 @@ model_groups: model_repo: Flux url: https://huggingface.co/black-forest-labs/FLUX.1-dev precision: BF16 - training_modes: [posttrain-p] + training_modes: [posttrain] - group: NCF tag: ncf models: - 
model: NCF mad_tag: pyt_ncf_training model_repo: - url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF + url: https://github.com/ROCm/FluxBenchmark precision: FP32 + - group: DLRM + tag: dlrm + models: + - model: DLRM v2 + mad_tag: pyt_train_dlrm + model_repo: DLRM + url: https://github.com/AMD-AGI/DLRMBenchmark + training_modes: [pretrain] diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst index 6c8cf154f..bfd9ad3cc 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst @@ -36,12 +36,10 @@ accelerate training workloads: .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml - {% set dockers = data.dockers %} .. tab-set:: - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} + .. tab-item:: {{ data.docker.pull_tag }} + :sync: {{ data.docker.pull_tag }} .. list-table:: :header-rows: 1 @@ -49,12 +47,12 @@ accelerate training workloads: * - Software component - Version - {% for component_name, component_version in docker.components.items() %} + {% for component_name, component_version in data.docker.components.items() %} * - {{ component_name }} - {{ component_version }} {% endfor %} - {% endfor %} - .. _amd-megatron-lm-model-support: + + .. _amd-megatron-lm-model-support-v2510: Supported models ================ @@ -99,7 +97,7 @@ accelerate training workloads: Some models, such as Llama, require an external license agreement through a third party (for example, Meta). -.. _amd-megatron-lm-performance-measurements: +.. _amd-megatron-lm-performance-measurements-v2510: Performance measurements ======================== @@ -131,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben `. 
This suite of tests will help you verify and fine-tune your system's configuration. -.. _mi300x-amd-megatron-lm-training: +.. _mi300x-amd-megatron-lm-training-v2510: Environment setup ================= @@ -140,52 +138,38 @@ Use the following instructions to set up the environment, configure the script t reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker image. -.. _amd-megatron-lm-requirements: +.. _amd-megatron-lm-requirements-v2510: Download the Docker image ------------------------- .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml - {% set dockers = data.dockers %} + {% set docker = data.docker %} 1. Use the following command to pull the Docker image from Docker Hub. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} - - .. code-block:: shell - - docker pull {{ docker.pull_tag }} - {% endfor %} + docker pull {{ docker.pull_tag }} 2. Launch the Docker container. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} - - .. code-block:: shell - - docker run -it \ - --device /dev/dri \ - --device /dev/kfd \ - --device /dev/infiniband \ - --network host --ipc host \ - --group-add video \ - --cap-add SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --privileged \ - -v $HOME:$HOME \ - -v $HOME/.ssh:/root/.ssh \ - --shm-size 128G \ - --name megatron_training_env \ - {{ docker.pull_tag }} - {% endfor %} + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} 3. 
Use these commands if you exit the ``megatron_training_env`` container and need to return to it. @@ -206,7 +190,7 @@ Download the Docker image The Docker container hosts a verified commit of ``__. -.. _amd-megatron-lm-environment-setup: +.. _amd-megatron-lm-environment-setup-v2510: Configuration ============= @@ -216,39 +200,39 @@ Configuration Update the ``train_llama3.sh`` configuration script in the ``examples/llama`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b Update the ``train_llama2.sh`` configuration script in the ``examples/llama`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. 
container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. note:: - See :ref:`Key options ` for more information on configuration options. + See :ref:`Key options ` for more information on configuration options. Multi-node configuration ------------------------ @@ -256,7 +240,7 @@ Multi-node configuration Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands. -.. _amd-megatron-lm-tokenizer: +.. _amd-megatron-lm-tokenizer-v2510: Tokenizer --------- @@ -393,7 +377,7 @@ Download the dataset ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer. Remember to either pre-download the tokenizer or setup Hugging Face access - otherwise when needed -- see the :ref:`Tokenizer ` section. + otherwise when needed -- see the :ref:`Tokenizer ` section. .. note:: @@ -495,15 +479,38 @@ Download the dataset Ensure that the files are accessible inside the Docker container. -.. _amd-megatron-lm-run-training: +.. _amd-megatron-lm-run-training-v2510: Run training ============ Use the following example commands to set up the environment, configure -:ref:`key options `, and run training on +:ref:`key options `, and run training on MI300X Series GPUs with the AMD Megatron-LM environment. +Before starting training, export the following environment variables. + +.. tab-set:: + + .. tab-item:: MI355X and MI350X + + .. code-block:: shell + + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + export NVTE_CK_USES_BWD_V3=1 + + .. tab-item:: MI325X and MI300X + + .. 
code-block:: shell + + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + export NVTE_CK_USES_BWD_V3=1 + + # Set this on MI325X/MI300X only + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + Single node training -------------------- @@ -913,7 +920,7 @@ Single node training RECOMPUTE_ACTIVATIONS=full \ CKPT_FORMAT=torch_dist -.. _amd-megatron-lm-multi-node-examples: +.. _amd-megatron-lm-multi-node-examples-v2510: Multi-node training examples ---------------------------- @@ -964,7 +971,7 @@ training on 16 nodes, try the following command: sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh -.. _amd-megatron-lm-benchmark-test-vars: +.. _amd-megatron-lm-benchmark-test-vars-v2510: Key options ----------- @@ -1029,11 +1036,6 @@ The benchmark tests support the following sets of variables. ``RECOMPUTE_NUM_LAYERS`` Number of layers used for checkpointing recompute. -Known issues -============ - -PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled. - Previous versions ================= diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst index 1d3c4905b..1b70f9386 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst @@ -16,14 +16,23 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub ` + * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>` + * `Docker Hub `__ + + * - v25.9 - * ROCm 7.0.0 * Primus 0.3.0 * PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 - - * :doc:`Primus Megatron documentation <../primus-megatron>` - * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>` + * :doc:`Primus Megatron documentation ` + * :doc:`Megatron-LM (legacy) documentation ` * `Docker Hub (gfx950) `__ * `Docker Hub 
(gfx942) `__ diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9.rst new file mode 100644 index 000000000..7668c33b1 --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9.rst @@ -0,0 +1,1044 @@ +:orphan: + +.. meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +****************************************** +Training a model with Megatron-LM on ROCm +****************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm Megatron-LM + training performance documentation. See :doc:`../megatron-lm` for the latest version. + + For a unified training solution on AMD GPUs with ROCm, the `rocm/megatron-lm + `__ Docker Hub registry will be + deprecated soon in favor of `rocm/primus `__. + The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, + including Megatron-LM and :doc:`torchtitan <../primus-pytorch>`. + + Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow. + To learn how to migrate workloads from Megatron-LM to Primus with Megatron, + see :doc:`megatron-lm-primus-migration-guide`. + +The `Megatron-LM framework for ROCm `_ is +a specialized fork of the robust Megatron-LM, designed to enable efficient +training of large-scale language models on AMD GPUs. By leveraging AMD +Instinct™ GPUs, Megatron-LM delivers enhanced scalability, performance, and +resource utilization for AI workloads. It is +purpose-built to support models like Llama, DeepSeek, and Mixtral, +enabling developers to train next-generation AI models more +efficiently.
+ +AMD provides ready-to-use Docker images for MI355X, MI350X, MI325X, and MI300X +GPUs containing essential components, including PyTorch, ROCm libraries, and +Megatron-LM utilities. It contains the following software components to +accelerate training workloads: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + {% endfor %} + .. _amd-megatron-lm-model-support: + + Supported models + ================ + + The following models are supported for training performance benchmarking with Megatron-LM and ROCm + on AMD Instinct MI300X Series GPUs. + Some instructions, commands, and training recommendations in this documentation might + vary by model -- select one to get started. + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +.. _amd-megatron-lm-performance-measurements: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `__ +page provides reference throughput and latency measurements for training +popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `__ + only reflects the latest version of this training benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-megatron-lm-training: + +Environment setup +================= + +Use the following instructions to set up the environment, configure the script to train models, and +reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker +image. + +.. _amd-megatron-lm-requirements: + +Download the Docker image +------------------------- + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + 1. Use the following command to pull the Docker image from Docker Hub. + + .. 
tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + {% endfor %} + + 2. Launch the Docker container. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} + {% endfor %} + +3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it. + + .. code-block:: shell + + docker start megatron_training_env + docker exec -it megatron_training_env bash + +4. **Megatron-LM backward compatibility setup** -- this Docker is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support. + To roll back to using Megatron-LM, follow these steps: + + .. code-block:: shell + + cd /workspace/Megatron-LM/ + pip uninstall megatron-core + pip install -e . + +The Docker container hosts a verified commit of +``__. + +.. _amd-megatron-lm-environment-setup: + +Configuration +============= + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b + + Update the ``train_llama3.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. 
container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + Update the ``train_llama2.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Multi-node configuration +------------------------ + +Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node +training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands. + +.. _amd-megatron-lm-tokenizer: + +Tokenizer +--------- + +You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples. +If the tokenizer is not found, it'll be downloaded if publicly available. + +.. 
container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + If you do not have Llama 3.3 tokenizer locally, you need to use your + personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer. + See `Llama-3.3-70B-Instruct + `_. After you are + authorized, use your ``HF_TOKEN`` to download the tokenizer and set the + variable ``TOKENIZER_MODEL`` to the tokenizer path. + + .. code-block:: shell + + export HF_TOKEN= + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct" + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-8B" + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-70B" + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3" + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite" + +.. 
container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Download the Mixtral tokenizer. + + .. code-block:: shell + + mkdir tokenizer + cd tokenizer + export HF_TOKEN= + wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model + + Use the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL=tokenizer/tokenizer.model + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-7B" + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-72B" + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default + value is ``1`` for enabled. + + .. code-block:: bash + + MOCK_DATA=1 + +* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 + + DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +Download the dataset +^^^^^^^^^^^^^^^^^^^^ + +.. 
container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b pyt_megatron_lm_train_llama-3.1-70b-proxy + + For Llama models, use the `prepare_dataset.sh + `_ script + to prepare your dataset. + To download the dataset, set the ``DATASET`` variable to the dataset you'd + like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and + ``DATASET=bookcorpus``. + + .. code-block:: shell + + DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset + DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset + + ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer. + Remember to either pre-download the tokenizer or setup Hugging Face access + otherwise when needed -- see the :ref:`Tokenizer ` section. + + .. note:: + + When training set ``DATA_PATH`` to the specific file name prefix pointing to the ``.bin`` or ``.idx`` + as in the following example: + + .. code-block:: shell + + DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + cd .. 
+ bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. 
code-block:: shell + + mkdir mixtral-datasets + cd mixtral-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/mixtral-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b pyt_megatron_lm_train_qwen2.5-72b + + If you don't already have the dataset, download the Qwen dataset using the following + commands: + + .. code-block:: shell + + mkdir -p temp/qwen-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/qwen-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. _amd-megatron-lm-run-training: + +Run training +============ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +MI300X Series GPUs with the AMD Megatron-LM environment. + +Single node training +-------------------- + +..
container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + To run the training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + TOKENIZER_MODEL=meta-llama/Llama-3.3-70B-Instruct \ + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MBS=2 \ + BS=16 \ + TE_FP8=0 \ + TP=1 \ + PP=1 \ + FSDP=1 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=512 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=10 \ + GEMM_TUNING=0 \ + bash examples/llama/train_llama3.sh + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + For Llama 3.1 8B BF16, use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=512 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=10 \ + GEMM_TUNING=1 \ + bash examples/llama/train_llama3.sh + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. 
code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + To run the training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + + To run the training on a single node for Llama 3.1 70B FP8, use the + following command. + + .. note:: + + The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes + to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs + can support the full 70B model with FP8 precision on a single node. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + FSDP=1 \ + TOTAL_ITERS=10 \ + bash examples/llama/train_llama3.sh + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + FP8_WEIGHT_TRANSPOSE_CACHE=0 \ + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + FSDP=1 \ + TOTAL_ITERS=10 \ + NUM_LAYERS=40 \ + bash examples/llama/train_llama3.sh + + .. note:: + + The MI300X configuration uses a proxy model. 
On MI300X GPUs, use two or more nodes + to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs + can support the full 70B model with FP8 precision on a single node. + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b + + To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + + For Llama 2 7B BF16, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-2-70b + + To run the training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=7 \ + BS=56 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. 
code-block:: shell + + export NVTE_FUSED_ATTN_CK=0 + FORCE_BALANCE=true \ + RUN_ENV=cluster \ + MODEL_SIZE=671B \ + TRAIN_ITERS=50 \ + SEQ_LEN=4096 \ + NUM_LAYERS=3 \ + MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \ + PR=bf16 \ + TP=1 PP=1 ETP=1 EP=8 \ + GEMM_TUNING=1 \ + NVTE_CK_USES_BWD_V3=1 \ + USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \ + GPT_LAYER_IN_TE=true \ + bash examples/deepseek_v3/train_deepseekv3.sh + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + export NVTE_FUSED_ATTN_CK=0 + GEMM_TUNING=1 \ + PR=bf16 \ + MBS=4 \ + AC=none \ + SEQ_LEN=4096 \ + PAD_LEN=4096 \ + TRAIN_ITERS=20 \ + bash examples/deepseek_v2/train_deepseekv2.sh + + .. note:: + + Note that DeepSeek-V2-Lite is experiencing instability due to GPU memory access fault + for large iterations. + For stability, it's recommended to use Primus for this workload. + See :doc:`../primus-megatron`. + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=0 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=none \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=4096 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x7B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy + + To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + ..
code-block:: shell + + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=4 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=full \ + NUM_LAYERS=4 \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=8192 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x22B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + To run training on a single node for Qwen 2.5 7B BF16, use the following + command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=0 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + + For FP8, use the following command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=1 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + To run the training on a single node for Qwen 2.5 72B BF16, use the following command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + FSDP=1 \ + CP=1 \ + PP=1 \ + MBS=3 \ + BS=24 \ + TE_FP8=0 \ + MODEL_SIZE=72 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-72B \ + RECOMPUTE_ACTIVATIONS=full \ + CKPT_FORMAT=torch_dist + +.. _amd-megatron-lm-multi-node-examples: + +Multi-node training examples +---------------------------- + +To run training on multiple nodes, launch the Docker container on each node. +For example, for Llama 3 using a two node setup (``NODE0`` as the master node), +use these commands. + +* On the master node ``NODE0``: + + .. 
code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=0 \ + bash examples/llama/train_llama3.sh + +* On the worker node ``NODE1``: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=1 \ + bash examples/llama/train_llama3.sh + +Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is +provided in +``__ to +enable training at scale under a SLURM environment. For example, to run +training on 16 nodes, try the following command: + +.. code-block:: shell + + sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh + +.. _amd-megatron-lm-benchmark-test-vars: + +Key options +----------- + +The benchmark tests support the following sets of variables. + +``TEE_OUTPUT`` + ``1`` to enable training logs or ``0`` to disable. + +``TE_FP8`` + ``0`` for BF16 or ``1`` for FP8 -- ``0`` by default. + +``GEMM_TUNING`` + ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels. + +``USE_FLASH_ATTN`` + ``1`` to enable Flash Attention. + +``FSDP`` + ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``, + ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled. + +``ENABLE_PROFILING`` + ``1`` to enable PyTorch profiling for performance analysis. + +``transformer-impl`` + ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE. + +``MODEL_SIZE`` + ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2, for example. + +``TOTAL_ITERS`` + The total number of iterations -- ``10`` by default. + +``MOCK_DATA`` + ``1`` to use mock data or ``0`` to use real data you provide. + +``MBS`` + Micro batch size. + +``BS`` + Global batch size. + +``TP`` / ``TP_SIZE`` + Tensor parallel (``1``, ``2``, ``4``, ``8``).
``TP`` is disabled when ``FSDP`` is turned on. + +``EP`` / ``EP_SIZE`` + Expert parallel for MoE models. + +``SEQ_LENGTH`` + Input sequence length. + +``PR`` + Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs. + +``AC`` + Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default. + +``NUM_LAYERS`` + Use reduced number of layers as a proxy model. + +``RECOMPUTE_NUM_LAYERS`` + Number of layers used for checkpointing recompute. + +Known issues +============ + +PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled. + +Previous versions +================= + +See :doc:`megatron-lm-history` to find documentation for previous releases +of the ``ROCm/megatron-lm`` Docker image. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9.rst new file mode 100644 index 000000000..bc544ca4a --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9.rst @@ -0,0 +1,1019 @@ +:orphan: + +.. meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +******************************************** +Training a model with Primus and Megatron-LM +******************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm Megatron-LM + training performance documentation. See :doc:`../primus-megatron` for the latest version. + +`Primus `__ is a unified and flexible +training framework for AMD Instinct GPUs designed to support multiple training +engine backends -- including Megatron -- to deliver scalable, high-performance +model training. Performance acceleration is powered by `Primus Turbo +`__ and ROCm libraries. + +.. 
note:: + + For a unified training solution on AMD GPUs with ROCm, the `rocm/megatron-lm + `__ Docker Hub registry will be + deprecated soon in favor of `rocm/primus `__. + The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, + including Megatron-LM and :doc:`torchtitan <../primus-pytorch>`. + + Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM + training <../megatron-lm>` workflow. To learn how to migrate workloads from + Megatron-LM to Primus with Megatron, see + :doc:`megatron-lm-primus-migration-guide`. + +AMD provides ready-to-use Docker images for MI355X, MI350X, +MI325X, and MI300X GPUs containing essential components for Primus, ROCm, and +Megatron-LM. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + {% endfor %} + +.. _amd-primus-megatron-lm-model-support-v259: + +Supported models +================ + +The following models are pre-optimized for performance on AMD Instinct GPUs. +Some instructions, commands, and training examples in this documentation +might vary by model -- select one to get started. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-primus-megatron-lm-training-v259: + +Environment setup +================= + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml + + Use the following instructions to set up the environment, configure the script to train models, and + reproduce the benchmark results on AMD Instinct GPUs. + +.. _amd-primus-megatron-lm-requirements-v259: + +Pull the Docker image + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + + 1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + {% endfor %} + + 2. Launch the Docker container. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. 
code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + --shm-size 128G \ + --name primus_training_env \ + {{ docker.pull_tag }} + {% endfor %} + +3. Use these commands if you exit the ``primus_training_env`` container and need to return to it. + + .. code-block:: shell + + docker start primus_training_env + docker exec -it primus_training_env bash + +The Docker container hosts verified commit ``e16b27b`` of the `Primus +`__ repository. + +.. _amd-primus-megatron-lm-environment-setup-v259: + +Configuration +============= + +Primus defines a training configuration in YAML for each model in +`examples/megatron/configs `__. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + {% for model_group in model_groups %} + {% for model in model_group.models %} + .. container:: model-doc {{ model.mad_tag }} + + For example, to update training parameters for {{ model.model }}, you can + update ``examples/megatron/configs/{{ model.config_name }}``. Training + configuration YAML files for other models follow this naming convention. + + {% endfor %} + {% endfor %} + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default + value is ``true`` for enabled. + + .. code-block:: yaml + + mock_data: true + +* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset. + + .. 
code-block:: yaml + + mock_data: false + train_data_path: /path/to/your/dataset + + Ensure that the files are accessible inside the Docker container. + +.. _amd-primus-megatron-lm-tokenizer-v259: + +Tokenizer +--------- + +Set the ``HF_TOKEN`` environment variable with the +right permissions to access the tokenizer for each model. + +.. code-block:: bash + + # Export your HF_TOKEN in the workspace + export HF_TOKEN= + +.. note:: + + In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama + 3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and + ``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model + `__ + definition. + +.. _amd-primus-megatron-lm-run-training-v259: + +Run training +============ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +AMD Instinct GPUs using Primus with the Megatron backend. + +Single node training +-------------------- + +To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command: + +.. code-block:: shell + + pip install -r requirements.txt + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.3 70B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run pre-training for Llama 3.3 70B BF16, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 6 \ + --global_batch_size 48 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. 
code-block:: shell + + EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 2 \ + --global_batch_size 16 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.1 8B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run pre-training for Llama 3.1 8B FP8, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid \ + --micro_batch_size 4 \ + --global_batch_size 512 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + + For Llama 3.1 8B BF16, use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 4 \ + --global_batch_size 512 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.1 70B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run pre-training for Llama 3.1 70B BF16, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. 
code-block:: shell + + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 4 \ + --global_batch_size 32 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + + To run the training on a single node for Llama 3.1 70B FP8, use the following command. + + .. note:: + + The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes + to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs + can support the full 70B model with FP8 precision on a single node. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid \ + --no_fp8_weight_transpose_cache true \ + --micro_batch_size 3 \ + --global_batch_size 24 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --num_layers 40 \ + --fp8 hybrid \ + --no_fp8_weight_transpose_cache true + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 2 7B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run pre-training for Llama 2 7B FP8, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid \ + --micro_batch_size 13 \ + --global_batch_size 416 + + .. 
tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + + To run pre-training for Llama 2 7B BF16, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 10 \ + --global_batch_size 640 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 2 70B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run pre-training for Llama 2 70B BF16, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 17 \ + --global_batch_size 272 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to DeepSeek-V3. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy, + use the following command: + + .. tab-set:: + + .. 
tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --num_layers 3 \ + --moe_layer_freq 1 \ + --train_iters 50 \ + --micro_batch_size 8 \ + --global_batch_size 64 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --num_layers 3 \ + --moe_layer_freq 1 \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to DeepSeek-V2-Lite. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16, + use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 12 \ + --global_batch_size 768 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --global_batch_size 256 + +.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Mixtral 8x7B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel), + use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. 
code-block:: shell + + EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 4 \ + --global_batch_size 256 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Mixtral 8x22B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy, + use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --num_layers 4 \ + --pipeline_model_parallel_size 1 \ + --micro_batch_size 2 \ + --global_batch_size 16 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --num_layers 4 \ + --pipeline_model_parallel_size 1 \ + --micro_batch_size 1 \ + --global_batch_size 16 + +.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Qwen 2.5 7B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run training on a single node for Qwen 2.5 7B BF16, use the following + command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. 
code-block:: shell + + EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 16 \ + --global_batch_size 768 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 + + For FP8, use the following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid \ + --micro_batch_size 20 \ + --global_batch_size 800 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + +.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Qwen 2.5 72B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To run the training on a single node for Qwen 2.5 72B BF16, use the following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 16 \ + --global_batch_size 256 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 + +.. 
_amd-primus-megatron-multi-node-examples-v259: + +Multi-node training examples +---------------------------- + +Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node +training. + +To run training on multiple nodes, you can use the +`run_slurm_pretrain.sh `__ +to launch the multi-node workload. Use the following steps to set up your environment: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git + cd Primus + git checkout e16b27b + + export DOCKER_IMAGE={{ docker.pull_tag }} + export HF_TOKEN= + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + export NCCL_IB_HCA= # specify which RDMA interfaces to use for communication + export NCCL_SOCKET_IFNAME= # your Network Interface + export GLOO_SOCKET_IFNAME= # your Network Interface + export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for RoCE + {% endfor %} + +.. note:: + + * Make sure the correct network drivers are installed on the nodes. If running inside a Docker container, either install the drivers inside the container or pass the network drivers from the host while creating the container. + * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across clusters, it is encouraged to explicitly export your NCCL parameters for the cluster. + * To find your network interface, you can use ``ip a``. + * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices. 
+ * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v259`) as appropriate. + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.1 8B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To train Llama 3.1 8B FP8 on 8 nodes, run: + + .. code-block:: shell + + # Adjust the training parameters. + # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case. + NNODES=8 \ + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_slurm_pretrain.sh \ + --global_batch_size 1024 \ + --fp8 hybrid + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 2 7B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To train Llama 2 7B FP8 on 8 nodes, run: + + .. code-block:: shell + + # Adjust the training parameters. + # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case. + NNODES=8 \ + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_slurm_pretrain.sh \ + --global_batch_size 2048 \ + --fp8 hybrid + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.1 70B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To train Llama 3.1 70B FP8 on 8 nodes, run: + + .. code-block:: shell + + # Adjust the training parameters. + # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case. 
+ NNODES=8 \ + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash examples/run_slurm_pretrain.sh \ + --micro_batch_size 4 \ + --global_batch_size 256 \ + --recompute_num_layers 80 \ + --no_fp8_weight_transpose_cache true \ + --fp8 hybrid + + To train Llama 3.1 70B BF16 on 8 nodes, run: + + .. code-block:: shell + + NNODES=8 \ + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash examples/run_slurm_pretrain.sh \ + --micro_batch_size 1 \ + --global_batch_size 256 \ + --recompute_num_layers 12 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 2 70B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To train Llama 2 70B FP8 on 8 nodes, run: + + .. code-block:: shell + + # Adjust the training parameters. + # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case. + NNODES=8 \ + EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + bash examples/run_slurm_pretrain.sh \ + --micro_batch_size 10 \ + --global_batch_size 640 \ + --recompute_num_layers 80 \ + --no_fp8_weight_transpose_cache true \ + --fp8 hybrid + + To train Llama 2 70B BF16 on 8 nodes, run: + + .. code-block:: shell + + NNODES=8 \ + EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + bash ./examples/run_slurm_pretrain.sh \ + --micro_batch_size 2 \ + --global_batch_size 1536 \ + --recompute_num_layers 12 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.3 70B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To train Llama 3.3 70B FP8 on 8 nodes, run: + + .. code-block:: shell + + # Adjust the training parameters. 
+ # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case + NNODES=8 \ + EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + bash examples/run_slurm_pretrain.sh \ + --micro_batch_size 4 \ + --global_batch_size 256 \ + --recompute_num_layers 80 \ + --no_fp8_weight_transpose_cache true \ + --fp8 hybrid + + To train Llama 3.3 70B BF16 on 8 nodes, run: + + .. code-block:: shell + + NNODES=8 \ + EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + bash examples/run_slurm_pretrain.sh \ + --micro_batch_size 1 \ + --global_batch_size 256 \ + --recompute_num_layers 12 + +.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Mixtral 8x7B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To train Mixtral 8x7B BF16 on 8 nodes, run: + + .. code-block:: shell + + # Adjust the training parameters. + # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case + NNODES=8 \ + EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \ + bash examples/run_slurm_pretrain.sh \ + --micro_batch_size 2 \ + --global_batch_size 256 + +.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Qwen 2.5 72B. + See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + + To train Qwen 2.5 72B FP8 on 8 nodes, run: + + .. code-block:: shell + + # Adjust the training parameters. + # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case + NNODES=8 \ + EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \ + bash examples/run_slurm_pretrain.sh \ + --micro_batch_size 8 \ + --global_batch_size 512 \ + --recompute_num_layers 80 \ + --no_fp8_weight_transpose_cache true \ + --fp8 hybrid + +.. 
_amd-primus-megatron-lm-benchmark-test-vars-v259: + +Key options +----------- + +The following are key options to take note of: + +fp8 + ``hybrid`` enables FP8 GEMMs. + +use_torch_fsdp2 + ``use_torch_fsdp2: 1`` enables PyTorch FSDP2. If FSDP is enabled, + set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``. + +profile + To enable PyTorch profiling, set these parameters: + + .. code-block:: yaml + + profile: true + use_pytorch_profiler: true + profile_step_end: 7 + profile_step_start: 6 + +train_iters + The total number of iterations (default: 50). + +mock_data + Use mock data instead of a real dataset. ``true`` by default. + +micro_batch_size + Micro batch size. + +global_batch_size + Global batch size. + +recompute_granularity + For activation checkpointing. + +num_layers + For using a reduced number of layers, as with proxy models. + +Known issues +============ + +PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled. + +Further reading +=============== + +- For an introduction to Primus, see `Primus: A Lightweight, Unified Training + Framework for Large Models on AMD GPUs `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization `_. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`megatron-lm-history` to find documentation for previous releases +of the ``ROCm/megatron-lm`` Docker image. + +This training environment now uses Primus with Megatron as the primary +configuration. Limited support for the legacy ROCm Megatron-LM is still +available; see the :doc:`../megatron-lm` documentation. 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9.rst new file mode 100644 index 000000000..964c3db27 --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9.rst @@ -0,0 +1,574 @@ +:orphan: + +.. meta:: + :description: How to train a model using PyTorch for ROCm. + :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker + +**************************************** +Training a model with Primus and PyTorch +**************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm Primus PyTorch training + performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version. + +`Primus `__ is a unified and flexible +LLM training framework designed to streamline training. It streamlines LLM +training on AMD Instinct GPUs using a modular, reproducible configuration paradigm. +Primus now supports the PyTorch torchtitan backend. + +.. note:: + + For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training + `__ Docker Hub registry will be + deprecated soon in favor of `rocm/primus `__. + The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, + including torchtitan and :doc:`Megatron-LM <../primus-megatron>`. + + Primus with the PyTorch torchtitan backend is designed to replace the + :doc:`ROCm PyTorch training <../pytorch-training>` workflow. See + :doc:`../pytorch-training` to see steps to run workloads without Primus. + +AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and +MI300X GPUs containing essential components for Primus and PyTorch training +with Primus Turbo optimizations. + +.. 
datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + {% endfor %} + +.. _amd-primus-pytorch-model-support-v259: + +Supported models +================ + +The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs. +Some instructions, commands, and training recommendations in this documentation might +vary by model -- select one to get started. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. seealso:: + + For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models, + see the documentation :doc:`../pytorch-training` (without Primus) + +.. _amd-primus-pytorch-performance-measurements-v259: + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +This Docker image is optimized for specific model configurations outlined +below. Performance can vary for other training workloads, as AMD +doesn’t test configurations and run conditions outside those described. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + + Use the following command to pull the Docker image from Docker Hub. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + {% endfor %} + +Run training +============ + +Once the setup is complete, choose between the following two workflows to start benchmarking training. +For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus). +For best performance on MI325X, MI350X, and MI355X GPUs, you might need to +tweak some configurations (such as batch sizes). + +.. 
datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + {% set model_groups = data.model_groups %} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + The following run command is tailored to {{ model.model }}. + See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model. + + 1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. For example, use this command to run the performance benchmark test on the {{ model.model }} model + using one node with the {{ model.precision }} data type on the host machine. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{ model.mad_tag }} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the + model are collected in ``~/MAD/perf.csv``. + + .. note:: + + Currently, Primus torchtitan models are run with Primus Turbo + enabled for enhanced performance. To disable Primus Turbo, + modify the respective configuration file + ``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``. + + {% endfor %} + {% endfor %} + + .. tab-item:: Primus benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + The following run commands are tailored to {{ model.model }}. 
+ See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model. + + .. rubric:: Download the Docker image and required packages + + 1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + {% endfor %} + + 2. Run the Docker container. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 64G \ + --name training_env \ + {{ docker.pull_tag }} + {% endfor %} + + Use these commands if you exit the ``training_env`` container and need to return to it. + + .. code-block:: shell + + docker start training_env + docker exec -it training_env bash + + .. rubric:: Prepare training datasets and dependencies + + The following benchmarking examples require downloading models and datasets + from Hugging Face. To ensure successful access to gated repos, set your + ``HF_TOKEN``. + + .. code-block:: shell + + export HF_TOKEN=$your_personal_hugging_face_access_token + + .. rubric:: Pretraining + + To get started, navigate to the ``Primus`` directory in your container. + + .. code-block:: shell + + cd /workspace/Primus + + Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script + included with Primus with the appropriate options. + + .. rubric:: Benchmarking examples + + .. container:: model-doc primus_pyt_train_llama-3.1-8b + + Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan. + + .. tab-set:: + + .. 
tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 5 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 6 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 4 + + + To train Llama 3.1 8B with FP8 precision, use the following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 8 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 7 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 5 + + .. container:: model-doc primus_pyt_train_llama-3.1-70b + + Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan. + + .. tab-set:: + + .. 
tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 8 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 6 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 4 + + To train Llama 3.1 70B with FP8 precision, use the following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 6 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 5 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --metrics.enable_tensorboard false \ + --profiling.enable_profiling false \ + --training.batch_size 3 + {% endfor %} + {% endfor %} + + .. tab-item:: Standalone torchtitan benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. 
container:: model-doc {{ model.mad_tag }} + + The following run commands are tailored to {{ model.model }}. + See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model. + + .. rubric:: Download the Docker image and required packages + + 1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + {% endfor %} + + 2. Run the Docker container. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 64G \ + --name training_env \ + {{ docker.pull_tag }} + {% endfor %} + + Use these commands if you exit the ``training_env`` container and need to return to it. + + .. code-block:: shell + + docker start training_env + docker exec -it training_env bash + + 3. Navigate to the ``torchtitan`` workspace directory. + + .. code-block:: shell + + cd /workspace/torchtitan + + .. rubric:: Download the tokenizer + + 1. The following benchmarking examples require downloading models and datasets + from Hugging Face. To ensure successful access to gated repos, set your + ``HF_TOKEN``. + + .. code-block:: shell + + export HF_TOKEN=$your_personal_hugging_face_access_token + + 2. Download the tokenizer for your model. + + .. container:: model-doc {{ model.mad_tag }} + + .. code-block:: shell + + python3 scripts/download_tokenizer.py \ + --repo_id {{ model.model_repo }} \ + --tokenizer_path "original" \ + --hf_token=${HF_TOKEN} + + .. 
rubric:: Pretraining examples + + Run the training script with the appropriate configuration file. + + To train with BF16 precision, use the following command: + + .. container:: model-doc {{ model.mad_tag }} + + .. code-block:: shell + + CONFIG_FILE={{ model.config_file.bf16 }} \ + ./run_train.sh + + To train with FP8 precision, use the following command: + + .. container:: model-doc {{ model.mad_tag }} + + .. code-block:: shell + + CONFIG_FILE={{ model.config_file.fp8 }} \ + ./run_train.sh + {% endfor %} + {% endfor %} + +Known issues +============ + +PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled. + + +Further reading +=============== + +- For an introduction to Primus, see `Primus: A Lightweight, Unified Training + Framework for Large Models on AMD GPUs `__. + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization `_. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`pytorch-training-history` to find documentation for previous releases +of the ``ROCm/pytorch-training`` Docker image. 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst index 5d0250179..d6487eb6f 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst @@ -16,14 +16,23 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub < - Components - Resources - * - v25.9 (latest) + * - v25.10 (latest) + - + * ROCm 7.1.0 + * PyTorch 2.10.0.dev20251112+rocm7.1 + - + * :doc:`Primus PyTorch Training documentation <../primus-pytorch>` + * :doc:`PyTorch training (legacy) documentation <../pytorch-training>` + * `Docker Hub `__ + + * - v25.9 - * ROCm 7.0.0 * Primus 0.3.0 * PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7 - - * :doc:`Primus PyTorch Training documentation <../primus-pytorch>` - * :doc:`PyTorch training (legacy) documentation <../pytorch-training>` + * :doc:`Primus PyTorch Training documentation ` + * :doc:`PyTorch training (legacy) documentation ` * `Docker Hub (gfx950) `__ * `Docker Hub (gfx942) `__ diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9.rst new file mode 100644 index 000000000..6bafba855 --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9.rst @@ -0,0 +1,667 @@ +:orphan: + +.. meta:: + :description: How to train a model using PyTorch for ROCm. + :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker + +************************************** +Training a model with PyTorch on ROCm +************************************** + +.. 
caution:: + + This documentation does not reflect the latest version of ROCm PyTorch training + performance benchmark documentation. See :doc:`../pytorch-training` for the latest version. + +.. note:: + + For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training + `__ Docker Hub registry will be + deprecated soon in favor of `rocm/primus `__. + The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, + including torchtitan and :doc:`Megatron-LM <../primus-megatron>`. + + See :doc:`../primus-pytorch` for details. + +PyTorch is an open-source machine learning framework that is widely used for +model training with GPU-optimized components for transformer-based models. +The PyTorch for ROCm training Docker image provides a prebuilt optimized +environment for fine-tuning and pretraining a model on AMD Instinct MI325X +and MI300X GPUs. It includes the following software components to accelerate +training workloads: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + {% endfor %} + +.. _amd-pytorch-training-model-support-v259: + +Supported models +================ + +The following models are pre-optimized for performance on the AMD Instinct +MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and +training recommendations in this documentation might vary by model -- select +one to get started. + +.. 
datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. _amd-pytorch-training-supported-training-modes-v259: + +The following table lists supported training modes per model. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. dropdown:: Supported training modes + + .. list-table:: + :header-rows: 1 + + * - Model + - Supported training modes + + {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if model.training_modes %} + * - {{ model.model }} + - ``{{ model.training_modes | join('``, ``') }}`` + + {% endif %} + {% endfor %} + {% endfor %} + + .. note:: + + Some model and fine-tuning combinations are not listed. This is + because the `upstream torchtune repository `__ + doesn't provide default YAML configurations for them. + For advanced usage, you can create a custom configuration to enable + unlisted fine-tuning methods by using an existing file in the + ``/workspace/torchtune/recipes/configs`` directory as a template. + +.. _amd-pytorch-training-performance-measurements-v259: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `_ +page provides reference throughput and latency measurements for training +popular AI models. + +.. note:: + + The performance data presented in + `Performance results with AMD ROCm software `_ + should not be interpreted as the peak performance achievable by AMD + Instinct MI325X and MI300X GPUs or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. 
Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +This Docker image is optimized for specific model configurations outlined +below. Performance can vary for other training workloads, as AMD +doesn’t test configurations and run conditions outside those described. + +Run training +============ + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml + + {% set dockers = data.dockers %} + {% set model_groups = data.model_groups %} + + Once the setup is complete, choose between two options to start benchmarking training: + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + The following run command is tailored to {{ model.model }}. + See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model. + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. For example, use this command to run the performance benchmark test on the {{ model.model }} model + using one node with the {{ model.precision }} data type on the host machine. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{ model.mad_tag }} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{ model.mad_tag }}``. 
The latency and throughput reports of the + model are collected in ``~/MAD/perf.csv``. + + {% endfor %} + {% endfor %} + + .. tab-item:: Standalone benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + The following commands are tailored to {{ model.model }}. + See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model. + + {% endfor %} + {% endfor %} + + .. rubric:: Download the Docker image and required packages + + 1. Use the following command to pull the Docker image from Docker Hub. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + {% endfor %} + + 2. Launch the Docker container. + + .. tab-set:: + + {% for supported_gpus, docker in dockers.items() %} + .. tab-item:: {{ supported_gpus }} + :sync: {{ supported_gpus }} + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 64G \ + --name training_env \ + {{ docker.pull_tag }} + {% endfor %} + + Use these commands if you exit the ``training_env`` container and need to return to it. + + .. code-block:: shell + + docker start training_env + docker exec -it training_env bash + + 3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__ + repository and navigate to the benchmark scripts directory + ``/workspace/MAD/scripts/pytorch_train``. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD/scripts/pytorch_train + + .. rubric:: Prepare training datasets and dependencies + + 1. The following benchmarking examples require downloading models and datasets + from Hugging Face. 
To ensure successful access to gated repos, set your + ``HF_TOKEN``. + + .. code-block:: shell + + export HF_TOKEN=$your_personal_hugging_face_access_token + + 2. Run the setup script to install libraries and datasets needed for benchmarking. + + .. code-block:: shell + + ./pytorch_benchmark_setup.sh + + .. container:: model-doc pyt_train_llama-3.1-8b + + ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B: + + .. list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``accelerate`` + - `Hugging Face Accelerate `_ + + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 + + .. container:: model-doc pyt_train_llama-3.1-70b + + ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B: + + .. list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 + + * - ``torchdata`` + - `TorchData `__ + + * - ``tomli`` + - `Tomli `__ + + * - ``tiktoken`` + - `tiktoken `__ + + * - ``blobfile`` + - `blobfile `__ + + * - ``tabulate`` + - `tabulate `__ + + * - ``wandb`` + - `Weights & Biases `__ + + * - ``sentencepiece`` + - `SentencePiece `__ 0.2.0 + + * - ``tensorboard`` + - `TensorBoard `__ 2.18.0 + + .. container:: model-doc pyt_train_flux + + ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX: + + .. 
list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``accelerate`` + - `Hugging Face Accelerate `_ + + * - ``datasets`` + - `Hugging Face Datasets `__ 3.2.0 + + * - ``sentencepiece`` + - `SentencePiece `__ 0.2.0 + + * - ``tensorboard`` + - `TensorBoard `__ 2.18.0 + + * - ``csvkit`` + - `csvkit `__ 2.0.1 + + * - ``deepspeed`` + - `DeepSpeed `__ 0.16.2 + + * - ``diffusers`` + - `Hugging Face Diffusers `__ 0.31.0 + + * - ``GitPython`` + - `GitPython `__ 3.1.44 + + * - ``opencv-python-headless`` + - `opencv-python-headless `__ 4.10.0.84 + + * - ``peft`` + - `PEFT `__ 0.14.0 + + * - ``protobuf`` + - `Protocol Buffers `__ 5.29.2 + + * - ``pytest`` + - `PyTest `__ 8.3.4 + + * - ``python-dotenv`` + - `python-dotenv `__ 1.0.1 + + * - ``seaborn`` + - `Seaborn `__ 0.13.2 + + * - ``transformers`` + - `Transformers `__ 4.47.0 + + ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face: + + * `frank-chieng/chinese_architecture_siheyuan `__ + + {% for model_group in model_groups %} + {% for model in model_group.models %} + {% set training_modes = model.training_modes %} + {% set training_mode_descs = { + "pretrain": "Benchmark pre-training.", + "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision." + } %} + {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %} + {% if available_modes %} + + .. container:: model-doc {{ model.mad_tag }} + + .. rubric:: Pre-training + + To start the pre-training benchmark, use the following command with the + appropriate options. See the following list of options and their descriptions. + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \ + -m {{ model.model_repo }} \ + -p $datatype \ + -s $sequence_length + + {% if model.mad_tag == "pyt_train_flux" %} + .. container:: model-doc {{ model.mad_tag }} + + .. 
note:: + + Currently, FLUX models are not supported out-of-the-box on this Docker image. + To use FLUX, refer to the ``rocm/pytorch-training`` Docker image: :doc:`pytorch-training-v25.6` + + Occasionally, downloading the Flux dataset might fail. In the event of this + error, manually download it from Hugging Face at + `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_ + and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access + the required dataset. + {% endif %} + + .. list-table:: + :header-rows: 1 + + * - Name + - Options + - Description + + {% for mode in available_modes %} + * - {% if loop.first %}``$training_mode``{% endif %} + - ``{{ mode }}`` + - {{ training_mode_descs[mode] }} + {% endfor %} + + * - ``$datatype`` + - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %} + - Only Llama 3.1 8B supports FP8 precision. + + * - ``$sequence_length`` + - Between 2048 and 8192. 8192 by default. + - Sequence length for the language model. + {% endif %} + + {% set training_modes = model.training_modes %} + {% set training_mode_descs = { + "posttrain": "Benchmark post-training.", + } %} + {% set available_modes = training_modes | select("in", ["posttrain"]) | list %} + {% if available_modes %} + + .. container:: model-doc {{ model.mad_tag }} + + .. rubric:: Post-training + + To start the post-training benchmark, use the following command with the + appropriate options. See the following list of options and their descriptions. + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \ + -m {{ model.model_repo }} \ + -p $datatype \ + -s $sequence_length + + .. 
list-table:: + :header-rows: 1 + + * - Name + - Options + - Description + + {% for mode in available_modes %} + * - {% if loop.first %}``$training_mode``{% endif %} + - ``{{ mode }}`` + - {{ training_mode_descs[mode] }} + {% endfor %} + + * - ``$datatype`` + - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %} + - Only Llama 3.1 8B supports FP8 precision. + + * - ``$sequence_length`` + - Between 2048 and 8192. 8192 by default. + - Sequence length for the language model. + {% endif %} + + {% set training_mode_descs = { + "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).", + "finetune_lora": "LoRA fine-tuning (BF16 supported).", + "finetune_qlora": "QLoRA fine-tuning (BF16 supported).", + "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.", + } %} + {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %} + {% if available_modes %} + .. container:: model-doc {{ model.mad_tag }} + + .. rubric:: Fine-tuning + + To start the fine-tuning benchmark, use the following command with the + appropriate options. See the following list of options and their descriptions. + See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`. + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t $training_mode \ + -m {{ model.model_repo }} \ + -p $datatype \ + -s $sequence_length + + .. list-table:: + :header-rows: 1 + + * - Name + - Options + - Description + + {% for mode in available_modes %} + * - {% if loop.first %}``$training_mode``{% endif %} + - ``{{ mode }}`` + - {{ training_mode_descs[mode] }} + {% endfor %} + + * - ``$datatype`` + - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %} + - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %} + + * - ``$sequence_length`` + - Between 2048 and 16384. 
+ - Sequence length for the language model. 
+ + {% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %} + .. note:: + + For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B), + use the following torchtune commit for compatibility: + + .. code-block:: shell + + git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e + + {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %} + .. note:: + + You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of + input tensor should be smaller than max_seq_len (4096)``. + This error indicates that an input sequence is longer than the model's maximum context window. + + Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096 + tokens in this case). You can resolve this by truncating the input or splitting + it into smaller chunks before passing it to the model. + + Note on reproducibility: The results in this guide are based on + commit ``b4c98ac`` from the upstream + ``__ repository. For the + latest updates, you can use the main branch. + + {% endif %} + {% endif %} + {% endfor %} + {% endfor %} + + .. rubric:: Benchmarking examples + + For examples of benchmarking commands, see ``__. + +.. _amd-pytorch-training-multinode-examples-v259: + +Multi-node training +------------------- + +Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node +training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands. + +Pre-training +~~~~~~~~~~~~ + +Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B. + +To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository. + +.. 
code-block:: shell + + # In the MAD repository + cd scripts/pytorch_train + sbatch run_slurm_train.sh + +Fine-tuning +~~~~~~~~~~~ + +Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B. + +To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository. + +.. code-block:: shell + + huggingface-cli login # Get access to HF Llama model space + huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally + # In the MAD repository + cd scripts/pytorch_train + sbatch Torchtune_Multinode.sh + +.. note:: + + Information regarding benchmark setup: + + * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``. + * You can adjust the torchtune `YAML configuration file + `__ + if you're using a different model. + * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``. + * Set the ``mounting_paths`` inside the SLURM script. + +Once the run is finished, you can find the log files in the ``result_torchtune/`` directory. + +Known issues +============ + +PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled. + +Further reading +=============== + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization `_. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`pytorch-training-history` to find documentation for previous releases +of the ``ROCm/pytorch-training`` Docker image. 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst index 06cca9ed6..ed0c2a637 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst @@ -31,12 +31,10 @@ Megatron-LM. .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml - {% set dockers = data.dockers %} .. tab-set:: - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} + .. tab-item:: {{ data.docker.pull_tag }} + :sync: {{ data.docker.pull_tag }} .. list-table:: :header-rows: 1 @@ -44,13 +42,12 @@ Megatron-LM. * - Software component - Version - {% for component_name, component_version in docker.components.items() %} + {% for component_name, component_version in data.docker.components.items() %} * - {{ component_name }} - {{ component_version }} {% endfor %} - {% endfor %} -.. _amd-primus-megatron-lm-model-support-v259: +.. _amd-primus-megatron-lm-model-support-v2510: Supported models ================ @@ -111,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben `. This suite of tests will help you verify and fine-tune your system's configuration. -.. _mi300x-amd-primus-megatron-lm-training-v259: +.. _mi300x-amd-primus-megatron-lm-training-v2510: Environment setup ================= @@ -121,63 +118,49 @@ Environment setup Use the following instructions to set up the environment, configure the script to train models, and reproduce the benchmark results on AMD Instinct GPUs. -.. _amd-primus-megatron-lm-requirements-v259: +.. _amd-primus-megatron-lm-requirements-v2510: Pull the Docker image .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml - {% set dockers = data.dockers %} + {% set docker = data.docker %} - 1. 
Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub. + 1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} - - .. code-block:: shell - - docker pull {{ docker.pull_tag }} - {% endfor %} + docker pull {{ docker.pull_tag }} 2. Launch the Docker container. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + --shm-size 128G \ + --name primus_training_env \ + {{ docker.pull_tag }} - .. code-block:: shell + Use these commands if you exit the ``primus_training_env`` container and need to return to it. - docker run -it \ - --device /dev/dri \ - --device /dev/kfd \ - --device /dev/infiniband \ - --network host --ipc host \ - --group-add video \ - --cap-add SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --privileged \ - -v $HOME:$HOME \ - --shm-size 128G \ - --name primus_training_env \ - {{ docker.pull_tag }} - {% endfor %} + .. code-block:: shell -3. Use these commands if you exit the ``primus_training_env`` container and need to return to it. + docker start primus_training_env + docker exec -it primus_training_env bash - .. code-block:: shell +The Docker container hosts verified branch ``release/v25.10`` of the `Primus +`__ repository. - docker start primus_training_env - docker exec -it primus_training_env bash - -The Docker container hosts verified commit ``e16b27b`` of the `Primus -`__ repository. - -.. _amd-primus-megatron-lm-environment-setup-v259: +.. 
_amd-primus-megatron-lm-environment-setup-v2510: Configuration ============= @@ -224,7 +207,7 @@ You can use either mock data or real data for training. Ensure that the files are accessible inside the Docker container. -.. _amd-primus-megatron-lm-tokenizer-v259: +.. _amd-primus-megatron-lm-tokenizer-v2510: Tokenizer --------- @@ -245,7 +228,7 @@ right permissions to access the tokenizer for each model. `__ definition. -.. _amd-primus-megatron-lm-run-training-v259: +.. _amd-primus-megatron-lm-run-training-v2510: Run training ============ @@ -269,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.3 70B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run pre-training for Llama 3.3 70B BF16, run: @@ -280,7 +263,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/llama3.3_70B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 6 \ @@ -291,7 +274,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.3_70B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 2 \ @@ -301,7 +289,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. 
The following run commands are tailored to Llama 3.1 8B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run pre-training for Llama 3.1 8B FP8, run: @@ -312,7 +300,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --fp8 hybrid \ @@ -324,7 +312,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --fp8 hybrid @@ -338,7 +331,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 4 \ @@ -349,7 +342,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 @@ -357,7 +355,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.1 70B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run pre-training for Llama 3.1 70B BF16, run: @@ -368,7 +366,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 4 \ @@ -379,7 +377,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 @@ -398,7 +401,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --fp8 hybrid \ @@ -411,7 +414,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --num_layers 40 \ @@ -422,7 +430,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 7B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run pre-training for Llama 2 7B FP8, run: @@ -433,7 +441,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --fp8 hybrid \ @@ -445,7 +453,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --fp8 hybrid @@ -459,7 +472,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 10 \ @@ -470,7 +483,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 @@ -478,7 +496,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 70B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run pre-training for Llama 2 70B BF16, run: @@ -489,7 +507,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/llama2_70B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 17 \ @@ -500,7 +518,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama2_70B-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 @@ -508,7 +531,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to DeepSeek-V3. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy, use the following command: @@ -520,7 +543,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/deepseek_v3-pretrain.yaml \ bash examples/run_pretrain.sh \ --num_layers 3 \ --moe_layer_freq 1 \ @@ -533,17 +556,24 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/deepseek_v3-pretrain.yaml \ bash examples/run_pretrain.sh \ --num_layers 3 \ --moe_layer_freq 1 \ + --micro_batch_size 3 \ + --global_batch_size 192 \ --train_iters 50 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b Once setup is complete, run the appropriate training command. The following run commands are tailored to DeepSeek-V2-Lite. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16, use the following command: @@ -555,7 +585,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 12 \ @@ -566,7 +596,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --global_batch_size 256 @@ -575,7 +610,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Mixtral 8x7B. 
- See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run training on a single node for Mixtral 8x7B (MoE with expert parallel), use the following command: @@ -587,7 +622,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 4 \ @@ -598,7 +633,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 @@ -606,7 +646,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Mixtral 8x22B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy, use the following command: @@ -618,7 +658,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --num_layers 4 \ @@ -631,7 +671,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --num_layers 4 \ @@ -643,7 +688,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Qwen 2.5 7B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run training on a single node for Qwen 2.5 7B BF16, use the following command: @@ -655,7 +700,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 16 \ @@ -666,7 +711,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 @@ -679,7 +729,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --fp8 hybrid @@ -691,7 +741,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --fp8 hybrid @@ -700,7 +755,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Qwen 2.5 72B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To run the training on a single node for Qwen 2.5 72B BF16, use the following command. @@ -711,7 +766,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --micro_batch_size 16 \ @@ -722,11 +777,16 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \ + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/qwen2.5_72B-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 -.. _amd-primus-megatron-multi-node-examples-v259: +.. _amd-primus-megatron-multi-node-examples-v2510: Multi-node training examples ---------------------------- @@ -740,28 +800,27 @@ to launch the multi-node workload. Use the following steps to setup your environ .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml - {% set dockers = data.dockers %} - .. tab-set:: + {% set docker = data.docker %} + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} + git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git + cd Primus + git checkout release/v25.10 + git submodule update --init --recursive - .. code-block:: shell + export DOCKER_IMAGE={{ docker.pull_tag }} + export HF_TOKEN= + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + export NCCL_IB_HCA= # specify which RDMA interfaces to use for communication + export NCCL_SOCKET_IFNAME= # your Network Interface + export GLOO_SOCKET_IFNAME= # your Network Interface + export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. 
Default is 3 for ROCE - git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git - cd Primus - git checkout e16b27b - - export DOCKER_IMAGE={{ docker.pull_tag }} - export HF_TOKEN= - export HSA_NO_SCRATCH_RECLAIM=1 - export NVTE_CK_USES_BWD_V3=1 - export NCCL_IB_HCA= # specify which RDMA interfaces to use for communication - export NCCL_SOCKET_IFNAME= # your Network Interface - export GLOO_SOCKET_IFNAME= # your Network Interface - export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE - {% endfor %} + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 .. note:: @@ -769,13 +828,13 @@ to launch the multi-node workload. Use the following steps to setup your environ * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect. However, since NICs can vary accross different cluster, it is encouraged to explicitly export your NCCL parameters for the cluster. * To find your network interface, you can use ``ip a``. * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices. - * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v259`) as appropriate. + * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v2510`) as appropriate. .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.1 8B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To train Llama 3.1 8B FP8 on 8 nodes, run: @@ -793,7 +852,7 @@ to launch the multi-node workload. 
Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 7B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To train Llama 2 7B FP8 on 8 nodes, run: @@ -811,7 +870,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.1 70B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To train Llama 3.1 70B FP8 on 8 nodes, run: @@ -843,7 +902,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 70B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To train Llama 2 70B FP8 on 8 nodes, run: @@ -875,7 +934,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.3 70B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To train Llama 3.3 70B FP8 on 8 nodes, run: @@ -907,7 +966,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 70B. 
- See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To train Mixtral 8x7B BF16 on 8 nodes, run: @@ -925,7 +984,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 70B. - See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. To train Qwen2.5 72B FP8 on 8 nodes, run: @@ -942,7 +1001,7 @@ to launch the multi-node workload. Use the following steps to setup your environ --no_fp8_weight_transpose_cache true \ --fp8 hybrid -.. _amd-primus-megatron-lm-benchmark-test-vars-v259: +.. _amd-primus-megatron-lm-benchmark-test-vars-v2510: Key options ----------- @@ -987,7 +1046,10 @@ num_layers Known issues ============ -PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled. +DeepSeek-V3 proxy model and Mixtral 8x22B proxy model may exit with an error +due to a memory free issue. However, this does not impact training runs. All +iterations, in this case 50, should have been completed before the exit and +the results should be available in the end. Further reading =============== diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst index d243800b8..046eb5dc5 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst @@ -29,12 +29,10 @@ with Primus Turbo optimizations. .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml - {% set dockers = data.dockers %} .. 
tab-set:: - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} + .. tab-item:: {{ data.docker.pull_tag }} + :sync: {{ data.docker.pull_tag }} .. list-table:: :header-rows: 1 @@ -42,13 +40,12 @@ with Primus Turbo optimizations. * - Software component - Version - {% for component_name, component_version in docker.components.items() %} + {% for component_name, component_version in data.docker.components.items() %} * - {{ component_name }} - {{ component_version }} {% endfor %} - {% endfor %} -.. _amd-primus-pytorch-model-support-v259: +.. _amd-primus-pytorch-model-support-v2510: Supported models ================ @@ -67,7 +64,7 @@ vary by model -- select one to get started.
Model
{% for model_group in model_groups %} -
{{ model_group.group }}
+
{{ model_group.group }}
{% endfor %}
@@ -94,7 +91,7 @@ vary by model -- select one to get started. For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models, see the documentation :doc:`pytorch-training` (without Primus) -.. _amd-primus-pytorch-performance-measurements-v259: +.. _amd-primus-pytorch-performance-measurements-v2510: System validation ================= @@ -120,20 +117,11 @@ Pull the Docker image .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml - {% set dockers = data.dockers %} - Use the following command to pull the Docker image from Docker Hub. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} - - .. code-block:: shell - - docker pull {{ docker.pull_tag }} - {% endfor %} + docker pull {{ data.docker.pull_tag }} Run training ============ @@ -145,7 +133,7 @@ tweak some configurations (such as batch sizes). .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml - {% set dockers = data.dockers %} + {% set docker = data.docker %} {% set model_groups = data.model_groups %} .. tab-set:: @@ -158,7 +146,7 @@ tweak some configurations (such as batch sizes). .. container:: model-doc {{ model.mad_tag }} The following run command is tailored to {{ model.model }}. - See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model. + See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model. 1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local directory and install the required packages on the host machine. @@ -185,13 +173,6 @@ tweak some configurations (such as batch sizes). ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the model are collected in ``~/MAD/perf.csv``. - ..
note:: - - Currently, Primus torchtitan models are run with Primus Turbo - enabled for enhanced performance. To disable Primus Turbo, - modify respective configuration file - ``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``. - {% endfor %} {% endfor %} @@ -203,48 +184,34 @@ tweak some configurations (such as batch sizes). .. container:: model-doc {{ model.mad_tag }} The following run commands are tailored to {{ model.model }}. - See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model. + See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model. .. rubric:: Download the Docker image and required packages - 1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub. + 1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} - - .. code-block:: shell - - docker pull {{ docker.pull_tag }} - {% endfor %} + docker pull {{ docker.pull_tag }} 2. Run the Docker container. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} - - .. 
code-block:: shell - - docker run -it \ - --device /dev/dri \ - --device /dev/kfd \ - --network host \ - --ipc host \ - --group-add video \ - --cap-add SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --privileged \ - -v $HOME:$HOME \ - -v $HOME/.ssh:/root/.ssh \ - --shm-size 64G \ - --name training_env \ - {{ docker.pull_tag }} - {% endfor %} + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 64G \ + --name training_env \ + {{ docker.pull_tag }} Use these commands if you exit the ``training_env`` container and need to return to it. @@ -283,37 +250,28 @@ tweak some configurations (such as batch sizes). .. tab-set:: .. tab-item:: MI355X and MI350X - :sync: MI355X and MI300X + :sync: MI355X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 5 + EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 6 .. tab-item:: MI325X :sync: MI325X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 6 + EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 6 .. tab-item:: MI300X - :sync: MI325X and MI300X + :sync: MI300X .. 
code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 4 + EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 4 To train Llama 3.1 8B with FP8 precision, use the following command. @@ -321,37 +279,28 @@ tweak some configurations (such as batch sizes). .. tab-set:: .. tab-item:: MI355X and MI350X - :sync: MI355X and MI300X + :sync: MI355X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 8 + EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 8 .. tab-item:: MI325X :sync: MI325X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 7 + EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 7 .. tab-item:: MI300X - :sync: MI325X and MI300X + :sync: MI300X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 5 + EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 5 .. container:: model-doc primus_pyt_train_llama-3.1-70b @@ -364,36 +313,57 @@ tweak some configurations (such as batch sizes). ..
code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 8 + EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 8 .. tab-item:: MI325X :sync: MI325X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 6 + EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 6 .. tab-item:: MI300X - :sync: MI325X and MI300X + :sync: MI300X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 4 + EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 4 To train Llama 3.1 70B with FP8 precision, use the following command. + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 6 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 5 + + .. tab-item:: MI300X + :sync: MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 3 + + .. 
container:: model-doc primus_pyt_train_deepseek-v2 + + Use the following command to train DeepSeek V2 16B with BF16 precision using Primus torchtitan. + .. tab-set:: .. tab-item:: MI355X and MI350X @@ -401,151 +371,55 @@ tweak some configurations (such as batch sizes). .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 6 + EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 16 .. tab-item:: MI325X :sync: MI325X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 5 + EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 10 .. tab-item:: MI300X - :sync: MI325X and MI300X + :sync: MI300X .. code-block:: shell - EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --metrics.enable_tensorboard false \ - --profiling.enable_profiling false \ - --training.batch_size 3 - {% endfor %} - {% endfor %} + EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 8 - .. tab-item:: Standalone torchtitan benchmarking - - {% for model_group in model_groups %} - {% for model in model_group.models %} - - .. container:: model-doc {{ model.mad_tag }} - - The following run commands are tailored to {{ model.model }}. - See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model. - - .. rubric:: Download the Docker image and required packages - - 1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
+ To train DeepSeek V2 16B with FP8 precision, use the following command. .. tab-set:: - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} + .. tab-item:: MI355X and MI350X + :sync: MI355X .. code-block:: shell - docker pull {{ docker.pull_tag }} - {% endfor %} + EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 16 - 2. Run the Docker container. - - .. tab-set:: - - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} + .. tab-item:: MI325X + :sync: MI325X .. code-block:: shell - docker run -it \ - --device /dev/dri \ - --device /dev/kfd \ - --network host \ - --ipc host \ - --group-add video \ - --cap-add SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --privileged \ - -v $HOME:$HOME \ - -v $HOME/.ssh:/root/.ssh \ - --shm-size 64G \ - --name training_env \ - {{ docker.pull_tag }} - {% endfor %} + EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 8 - Use these commands if you exit the ``training_env`` container and need to return to it. + .. tab-item:: MI300X + :sync: MI300X - .. code-block:: shell + .. code-block:: shell - docker start training_env - docker exec -it training_env bash - - 3. Navigate to the ``torchtitan`` workspace directory. - - .. code-block:: shell - - cd /workspace/torchtitan - - .. rubric:: Download the tokenizer - - 1. The following benchmarking examples require downloading models and datasets - from Hugging Face. To ensure successful access to gated repos, set your - ``HF_TOKEN``. - - .. code-block:: shell - - export HF_TOKEN=$your_personal_hugging_face_access_token - - 2. Download the tokenizer for your model. - - .. container:: model-doc {{ model.mad_tag }} - - .. 
code-block:: shell - - python3 scripts/download_tokenizer.py \ - --repo_id {{ model.model_repo }} \ - --tokenizer_path "original" \ - --hf_token=${HF_TOKEN} - - .. rubric:: Pretraining examples - - Run the training script with the appropriate configuration file. - - For train with BF16 precicion, use the following command: - - .. container:: model-doc {{ model.mad_tag }} - - .. code-block:: shell - - CONFIG_FILE={{ model.config_file.bf16 }} \ - .run_train.sh - - For train with BF16 precicion, use the following command: - - .. container:: model-doc {{ model.mad_tag }} - - .. code-block:: shell - - CONFIG_FILE={{ model.config_file.fp8 }} \ - .run_train.sh + EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.batch_size 8 {% endfor %} {% endfor %} -Known issues -============ - -PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled. - - Further reading =============== diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst index 782cc61b3..f1e8c7f09 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst @@ -27,12 +27,10 @@ training workloads: .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml - {% set dockers = data.dockers %} .. tab-set:: - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} + .. tab-item:: {{ data.docker.pull_tag }} + :sync: {{ data.docker.pull_tag }} .. 
list-table:: :header-rows: 1 @@ -40,13 +38,12 @@ training workloads: * - Software component - Version - {% for component_name, component_version in docker.components.items() %} + {% for component_name, component_version in data.docker.components.items() %} * - {{ component_name }} - {{ component_version }} {% endfor %} - {% endfor %} -.. _amd-pytorch-training-model-support-v259: +.. _amd-pytorch-training-model-support-v2510: Supported models ================ @@ -88,7 +85,7 @@ one to get started. -.. _amd-pytorch-training-supported-training-modes-v259: +.. _amd-pytorch-training-supported-training-modes-v2510: The following table lists supported training modes per model. @@ -123,7 +120,7 @@ The following table lists supported training modes per model. unlisted fine-tuning methods by using an existing file in the ``/workspace/torchtune/recipes/configs`` directory as a template. -.. _amd-pytorch-training-performance-measurements-v259: +.. _amd-pytorch-training-performance-measurements-v2510: Performance measurements ======================== @@ -164,7 +161,7 @@ Run training .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml - {% set dockers = data.dockers %} + {% set docker = data.docker %} {% set model_groups = data.model_groups %} Once the setup is complete, choose between two options to start benchmarking training: @@ -179,7 +176,7 @@ Run training .. container:: model-doc {{ model.mad_tag }} The following run command is tailored to {{ model.model }}. - See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model. + See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model. 1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local directory and install the required packages on the host machine. @@ -217,7 +214,7 @@ Run training .. container:: model-doc {{ model.mad_tag }} The following commands are tailored to {{ model.model }}.
- See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model. + See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model. {% endfor %} {% endfor %} @@ -226,42 +223,28 @@ Run training 1. Use the following command to pull the Docker image from Docker Hub. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} - - .. code-block:: shell - - docker pull {{ docker.pull_tag }} - {% endfor %} + docker pull {{ docker.pull_tag }} 2. Launch the Docker container. - .. tab-set:: + .. code-block:: shell - {% for supported_gpus, docker in dockers.items() %} - .. tab-item:: {{ supported_gpus }} - :sync: {{ supported_gpus }} - - .. code-block:: shell - - docker run -it \ - --device /dev/dri \ - --device /dev/kfd \ - --network host \ - --ipc host \ - --group-add video \ - --cap-add SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --privileged \ - -v $HOME:$HOME \ - -v $HOME/.ssh:/root/.ssh \ - --shm-size 64G \ - --name training_env \ - {{ docker.pull_tag }} - {% endfor %} + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 64G \ + --name training_env \ + {{ docker.pull_tag }} Use these commands if you exit the ``training_env`` container and need to return to it. @@ -419,11 +402,34 @@ Run training .. container:: model-doc {{ model.mad_tag }} - .. rubric:: Pre-training + .. rubric:: Pretraining To start the pre-training benchmark, use the following command with the appropriate options. See the following list of options and their descriptions. + {% if model.mad_tag == "pyt_train_dlrm" %} + + 1. Go to the DLRM directory. + + .. code-block:: shell + + cd /workspace/DLRMBenchmark + + 2. 
To run the single-node training benchmark for DLRM-v2 with TF32 precision, + run the following script. + + .. code-block:: shell + + ./launch_training_single_node.sh + + To run with MAD within the Docker container, use the following command. + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t pretrain -m DLRM + + {% else %} + .. code-block:: shell ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \ @@ -466,6 +472,7 @@ Run training * - ``$sequence_length`` - Sequence length for the language model. - Between 2048 and 8192. 8192 by default. + {% endif %} {% endif %} {% set training_modes = model.training_modes %} @@ -525,7 +532,7 @@ Run training To start the fine-tuning benchmark, use the following command with the appropriate options. See the following list of options and their descriptions. - See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`. + See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v2510>`. .. code-block:: shell @@ -590,7 +597,7 @@ Run training For examples of benchmarking commands, see ``__. -.. _amd-pytorch-training-multinode-examples-v259: +.. _amd-pytorch-training-multinode-examples-v2510: Multi-node training ------------------- @@ -639,11 +646,6 @@ To launch the training job on a SLURM cluster for Llama 3.3 70B, run the followi Once the run is finished, you can find the log files in the ``result_torchtune/`` directory. -Known issues -============ - -PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled. - Further reading ===============