From e0b8ec4dfb6b8c059a94895051f2a8939d9ab150 Mon Sep 17 00:00:00 2001 From: peterjunpark Date: Mon, 29 Dec 2025 08:05:47 -0500 Subject: [PATCH] Update training docs for Primus/25.11 (#5819) * update conf and toc.yml.in * archive previous versions archive data files update anchors * primus pytorch: remove training batch size args * update primus megatron run cmds multi-node * update primus pytorch update * update update * update docker tag --- docs/conf.py | 1 + .../megatron-lm-v25.10-benchmark-models.yaml | 49 + ...imus-megatron-v25.10-benchmark-models.yaml | 58 + ...rimus-pytorch-v25.10-benchmark-models.yaml | 32 + ...orch-training-v25.10-benchmark-models.yaml | 197 +++ .../primus-megatron-benchmark-models.yaml | 2 +- .../primus-pytorch-benchmark-models.yaml | 10 +- .../training/benchmark-docker/megatron-lm.rst | 34 +- .../previous-versions/megatron-lm-history.rst | 11 +- .../previous-versions/megatron-lm-v25.10.rst | 1046 ++++++++++++++++ .../primus-megatron-v25.10.rst | 1081 +++++++++++++++++ .../primus-pytorch-v25.10.rst | 448 +++++++ .../pytorch-training-history.rst | 11 +- .../pytorch-training-v25.10.rst | 669 ++++++++++ .../benchmark-docker/primus-megatron.rst | 263 ++-- .../benchmark-docker/primus-pytorch.rst | 74 +- .../benchmark-docker/pytorch-training.rst | 14 +- docs/sphinx/_toc.yml.in | 6 + 18 files changed, 3755 insertions(+), 251 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10.rst diff --git a/docs/conf.py b/docs/conf.py index a80645bd2..42d494cee 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -163,6 +163,7 @@ article_pages = [ {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml new file mode 100644 index 000000000..de815e915 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml @@ -0,0 +1,49 @@ +docker: + pull_tag: rocm/primus:v25.10 + docker_hub_url: 
https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197 + components: + ROCm: 7.1.0 + Primus: 0.3.0 + Primus Turbo: 0.1.1 + PyTorch: 2.10.0.dev20251112+rocm7.1 + Python: "3.10" + Transformer Engine: 2.4.0.dev0+32e2d1d4 + Flash Attention: 2.8.3 + hipBLASLt: 1.2.0-09ab7153e2 + Triton: 3.4.0 + RCCL: 2.27.7 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: pyt_megatron_lm_train_llama-3.3-70b + - model: Llama 3.1 8B + mad_tag: pyt_megatron_lm_train_llama-3.1-8b + - model: Llama 3.1 70B + mad_tag: pyt_megatron_lm_train_llama-3.1-70b + - model: Llama 2 7B + mad_tag: pyt_megatron_lm_train_llama-2-7b + - model: Llama 2 70B + mad_tag: pyt_megatron_lm_train_llama-2-70b + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) + mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy + - model: DeepSeek-V2-Lite + mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: pyt_megatron_lm_train_mixtral-8x7b + - model: Mixtral 8x22B (proxy) + mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: pyt_megatron_lm_train_qwen2.5-7b + - model: Qwen 2.5 72B + mad_tag: pyt_megatron_lm_train_qwen2.5-72b diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml new file mode 100644 index 000000000..852e1a970 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml @@ -0,0 +1,58 @@ +docker: + pull_tag: rocm/primus:v25.10 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197 + components: + ROCm: 7.1.0 + PyTorch: 2.10.0.dev20251112+rocm7.1 + Python: "3.10" + Transformer Engine: 2.4.0.dev0+32e2d1d4 + Flash Attention: 2.8.3 + hipBLASLt: 1.2.0-09ab7153e2 + Triton: 3.4.0 + RCCL: 2.27.7 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b + config_name: llama3.3_70B-pretrain.yaml + - model: Llama 3.1 70B + mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b + config_name: llama3.1_70B-pretrain.yaml + - model: Llama 3.1 8B + mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b + config_name: llama3.1_8B-pretrain.yaml + - model: Llama 2 7B + mad_tag: primus_pyt_megatron_lm_train_llama-2-7b + config_name: llama2_7B-pretrain.yaml + - model: Llama 2 70B + mad_tag: primus_pyt_megatron_lm_train_llama-2-70b + config_name: llama2_70B-pretrain.yaml + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) + mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy + config_name: deepseek_v3-pretrain.yaml + - model: DeepSeek-V2-Lite + mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b + config_name: deepseek_v2_lite-pretrain.yaml + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b + config_name: mixtral_8x7B_v0.1-pretrain.yaml + - model: Mixtral 8x22B (proxy) + mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy + config_name: mixtral_8x22B_v0.1-pretrain.yaml + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b 
+ config_name: primus_qwen2.5_7B-pretrain.yaml + - model: Qwen 2.5 72B + mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b + config_name: qwen2.5_72B-pretrain.yaml diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml new file mode 100644 index 000000000..6c0f09bd6 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml @@ -0,0 +1,32 @@ +docker: + pull_tag: rocm/primus:v25.10 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197 + components: + ROCm: 7.1.0 + PyTorch: 2.10.0.dev20251112+rocm7.1 + Python: "3.10" + Transformer Engine: 2.4.0.dev0+32e2d1d4 + Flash Attention: 2.8.3 + hipBLASLt: 1.2.0-09ab7153e2 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.1 8B + mad_tag: primus_pyt_train_llama-3.1-8b + model_repo: Llama-3.1-8B + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: BF16 + - model: Llama 3.1 70B + mad_tag: primus_pyt_train_llama-3.1-70b + model_repo: Llama-3.1-70B + url: https://huggingface.co/meta-llama/Llama-3.1-70B + precision: BF16 + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek V2 16B + mad_tag: primus_pyt_train_deepseek-v2 + model_repo: DeepSeek-V2 + url: https://huggingface.co/deepseek-ai/DeepSeek-V2 + precision: BF16 diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml new file mode 100644 index 000000000..e9edb64a1 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml @@ -0,0 +1,197 @@ +docker: + pull_tag: rocm/primus:v25.10 + docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197 + components: + ROCm: 7.1.0 + Primus: 0.3.0 + Primus Turbo: 0.1.1 + PyTorch: 2.10.0.dev20251112+rocm7.1 + Python: "3.10" + Transformer Engine: 2.4.0.dev0+32e2d1d4 + Flash Attention: 2.8.3 + hipBLASLt: 1.2.0-09ab7153e2 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 4 Scout 17B-16E + mad_tag: pyt_train_llama-4-scout-17b-16e + model_repo: Llama-4-17B_16E + url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.3 70B + mad_tag: pyt_train_llama-3.3-70b + model_repo: Llama-3.3-70B + url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct + precision: BF16 + training_modes: [finetune_fw, finetune_lora, finetune_qlora] + - model: Llama 3.2 1B + mad_tag: pyt_train_llama-3.2-1b + model_repo: Llama-3.2-1B + url: https://huggingface.co/meta-llama/Llama-3.2-1B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.2 3B + mad_tag: pyt_train_llama-3.2-3b + model_repo: Llama-3.2-3B + url: https://huggingface.co/meta-llama/Llama-3.2-3B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.2 Vision 11B + mad_tag: pyt_train_llama-3.2-vision-11b + model_repo: Llama-3.2-Vision-11B + url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision + precision: BF16 + training_modes: [finetune_fw] + - model: Llama 3.2 Vision 90B + mad_tag: 
pyt_train_llama-3.2-vision-90b + model_repo: Llama-3.2-Vision-90B + url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision + precision: BF16 + training_modes: [finetune_fw] + - model: Llama 3.1 8B + mad_tag: pyt_train_llama-3.1-8b + model_repo: Llama-3.1-8B + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: BF16 + training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain] + - model: Llama 3.1 70B + mad_tag: pyt_train_llama-3.1-70b + model_repo: Llama-3.1-70B + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct + precision: BF16 + training_modes: [pretrain, finetune_fw, finetune_lora] + - model: Llama 3.1 405B + mad_tag: pyt_train_llama-3.1-405b + model_repo: Llama-3.1-405B + url: https://huggingface.co/meta-llama/Llama-3.1-405B + precision: BF16 + training_modes: [finetune_qlora] + - model: Llama 3 8B + mad_tag: pyt_train_llama-3-8b + model_repo: Llama-3-8B + url: https://huggingface.co/meta-llama/Meta-Llama-3-8B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3 70B + mad_tag: pyt_train_llama-3-70b + model_repo: Llama-3-70B + url: https://huggingface.co/meta-llama/Meta-Llama-3-70B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 2 7B + mad_tag: pyt_train_llama-2-7b + model_repo: Llama-2-7B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_fw, finetune_lora, finetune_qlora] + - model: Llama 2 13B + mad_tag: pyt_train_llama-2-13b + model_repo: Llama-2-13B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 2 70B + mad_tag: pyt_train_llama-2-70b + model_repo: Llama-2-70B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_lora, finetune_qlora] + - group: OpenAI + tag: openai + models: + - model: GPT OSS 20B + mad_tag: pyt_train_gpt_oss_20b + model_repo: GPT-OSS-20B + url: https://huggingface.co/openai/gpt-oss-20b + precision: BF16 + training_modes: [HF_finetune_lora] + - model: GPT OSS 120B + mad_tag: pyt_train_gpt_oss_120b + model_repo: GPT-OSS-120B + url: https://huggingface.co/openai/gpt-oss-120b + precision: BF16 + training_modes: [HF_finetune_lora] + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek V2 16B + mad_tag: primus_pyt_train_deepseek-v2 + model_repo: DeepSeek-V2 + url: https://huggingface.co/deepseek-ai/DeepSeek-V2 + precision: BF16 + training_modes: [pretrain] + - group: Qwen + tag: qwen + models: + - model: Qwen 3 8B + mad_tag: pyt_train_qwen3-8b + model_repo: Qwen3-8B + url: https://huggingface.co/Qwen/Qwen3-8B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Qwen 3 32B + mad_tag: pyt_train_qwen3-32b + model_repo: Qwen3-32 + url: https://huggingface.co/Qwen/Qwen3-32B + precision: BF16 + training_modes: [finetune_lora] + - model: Qwen 2.5 32B + mad_tag: pyt_train_qwen2.5-32b + model_repo: Qwen2.5-32B + url: https://huggingface.co/Qwen/Qwen2.5-32B + precision: BF16 + training_modes: [finetune_lora] + - model: Qwen 2.5 72B + mad_tag: pyt_train_qwen2.5-72b + model_repo: Qwen2.5-72B + url: https://huggingface.co/Qwen/Qwen2.5-72B + precision: BF16 + training_modes: [finetune_lora] + - model: Qwen 2 1.5B + mad_tag: pyt_train_qwen2-1.5b + model_repo: Qwen2-1.5B + url: https://huggingface.co/Qwen/Qwen2-1.5B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Qwen 2 7B + 
mad_tag: pyt_train_qwen2-7b + model_repo: Qwen2-7B + url: https://huggingface.co/Qwen/Qwen2-7B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - group: Stable Diffusion + tag: sd + models: + - model: Stable Diffusion XL + mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning + model_repo: SDXL + url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 + precision: BF16 + training_modes: [posttrain] + - group: Flux + tag: flux + models: + - model: FLUX.1-dev + mad_tag: pyt_train_flux + model_repo: Flux + url: https://huggingface.co/black-forest-labs/FLUX.1-dev + precision: BF16 + training_modes: [posttrain] + - group: NCF + tag: ncf + models: + - model: NCF + mad_tag: pyt_ncf_training + model_repo: + url: https://github.com/ROCm/FluxBenchmark + precision: FP32 + - group: DLRM + tag: dlrm + models: + - model: DLRM v2 + mad_tag: pyt_train_dlrm + model_repo: DLRM + url: https://github.com/AMD-AGI/DLRMBenchmark + training_modes: [pretrain] diff --git a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml index 852e1a970..8acf7e134 100644 --- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml @@ -1,5 +1,5 @@ docker: - pull_tag: rocm/primus:v25.10 + pull_tag: rocm/primus:v25.11 docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197 components: ROCm: 7.1.0 diff --git a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml index 6c0f09bd6..6e427f023 100644 --- a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml @@ -1,5 +1,5 @@ docker: - pull_tag: rocm/primus:v25.10 + pull_tag: rocm/primus:v25.11 docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197 components: ROCm: 7.1.0 @@ -25,8 +25,8 @@ model_groups: - group: DeepSeek tag: deepseek models: - - model: DeepSeek V2 16B - mad_tag: primus_pyt_train_deepseek-v2 - model_repo: DeepSeek-V2 - url: https://huggingface.co/deepseek-ai/DeepSeek-V2 + - model: DeepSeek V3 16B + mad_tag: primus_pyt_train_deepseek-v3-16b + model_repo: DeepSeek-V3 + url: https://huggingface.co/deepseek-ai/DeepSeek-V3 precision: BF16 diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst index bfd9ad3cc..e3f1955a2 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst @@ -52,7 +52,7 @@ accelerate training workloads: - {{ component_version }} {% endfor %} - .. _amd-megatron-lm-model-support-v2510: + .. _amd-megatron-lm-model-support-v25.11: Supported models ================ @@ -97,7 +97,7 @@ accelerate training workloads: Some models, such as Llama, require an external license agreement through a third party (for example, Meta). -.. _amd-megatron-lm-performance-measurements-v2510: +.. 
_amd-megatron-lm-performance-measurements-v25.11: Performance measurements ======================== @@ -129,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben `. This suite of tests will help you verify and fine-tune your system's configuration. -.. _mi300x-amd-megatron-lm-training-v2510: +.. _mi300x-amd-megatron-lm-training-v25.11: Environment setup ================= @@ -138,7 +138,7 @@ Use the following instructions to set up the environment, configure the script t reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker image. -.. _amd-megatron-lm-requirements-v2510: +.. _amd-megatron-lm-requirements-v25.11: Download the Docker image ------------------------- @@ -190,7 +190,7 @@ Download the Docker image The Docker container hosts a verified commit of ``__. -.. _amd-megatron-lm-environment-setup-v2510: +.. _amd-megatron-lm-environment-setup-v25.11: Configuration ============= @@ -200,39 +200,39 @@ Configuration Update the ``train_llama3.sh`` configuration script in the ``examples/llama`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b Update the ``train_llama2.sh`` configuration script in the ``examples/llama`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral`` directory of ``__ to configure your training run. - Options can also be passed as command line arguments as described in :ref:`Run training `. + Options can also be passed as command line arguments as described in :ref:`Run training `. .. note:: - See :ref:`Key options ` for more information on configuration options. + See :ref:`Key options ` for more information on configuration options. Multi-node configuration ------------------------ @@ -240,7 +240,7 @@ Multi-node configuration Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands. -.. _amd-megatron-lm-tokenizer-v2510: +.. 
_amd-megatron-lm-tokenizer-v25.11: Tokenizer --------- @@ -377,7 +377,7 @@ Download the dataset ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer. Remember to either pre-download the tokenizer or setup Hugging Face access - otherwise when needed -- see the :ref:`Tokenizer ` section. + otherwise when needed -- see the :ref:`Tokenizer ` section. .. note:: @@ -479,13 +479,13 @@ Download the dataset Ensure that the files are accessible inside the Docker container. -.. _amd-megatron-lm-run-training-v2510: +.. _amd-megatron-lm-run-training-v25.11: Run training ============ Use the following example commands to set up the environment, configure -:ref:`key options `, and run training on +:ref:`key options `, and run training on MI300X Series GPUs with the AMD Megatron-LM environment. Before starting training, export the following environment variables. @@ -920,7 +920,7 @@ Single node training RECOMPUTE_ACTIVATIONS=full \ CKPT_FORMAT=torch_dist -.. _amd-megatron-lm-multi-node-examples-v2510: +.. _amd-megatron-lm-multi-node-examples-v25.11: Multi-node training examples ---------------------------- @@ -971,7 +971,7 @@ training on 16 nodes, try the following command: sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh -.. _amd-megatron-lm-benchmark-test-vars-v2510: +.. _amd-megatron-lm-benchmark-test-vars-v25.11: Key options ----------- diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst index e7d0ebc12..964afb5a0 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst @@ -16,7 +16,7 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub ` * `Docker Hub `__ + * - v25.10 + - + * ROCm 7.1.0 + * PyTorch 2.10.0.dev20251112+rocm7.1 + - + * :doc:`Primus Megatron documentation ` + * :doc:`Megatron-LM (legacy) documentation ` + * `Docker Hub `__ + * - v25.9 - * ROCm 7.0.0 diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10.rst new file mode 100644 index 000000000..960fdca1b --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10.rst @@ -0,0 +1,1046 @@ +:orphan: + +.. meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +****************************************** +Training a model with Megatron-LM on ROCm +****************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm Megatron-LM + training performance documentation. See :doc:`../megatron-lm` for the latest version. + + For a unified training solution on AMD GPUs with ROCm, the `rocm/megatron-lm + `__ Docker Hub registry will be + deprecated soon in favor of `rocm/primus `__. + The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, + including Megatron-LM and :doc:`torchtitan <../primus-pytorch>`. + + Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow. + To learn how to migrate workloads from Megatron-LM to Primus with Megatron, + see :doc:`megatron-lm-primus-migration-guide`. 
+ +The `Megatron-LM framework for ROCm `_ is +a specialized fork of the robust Megatron-LM, designed to enable efficient +training of large-scale language models on AMD GPUs. By leveraging AMD +Instinctâ„¢ GPUs, Megatron-LM delivers enhanced scalability, performance, and +resource utilization for AI workloads. It is +purpose-built to support models like Llama, DeepSeek, and Mixtral, +enabling developers to train next-generation AI models more +efficiently. + +AMD provides ready-to-use Docker images for MI355X, MI350X, MI325X, and MI300X +GPUs containing essential components, including PyTorch, ROCm libraries, and +Megatron-LM utilities. It contains the following software components to +accelerate training workloads: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml + + .. tab-set:: + + .. tab-item:: {{ data.docker.pull_tag }} + :sync: {{ data.docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in data.docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + + .. _amd-megatron-lm-model-support-v2510: + + Supported models + ================ + + The following models are supported for training performance benchmarking with Megatron-LM and ROCm + on AMD Instinct MI300X Series GPUs. + Some instructions, commands, and training recommendations in this documentation might + vary by model -- select one to get started. + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +.. _amd-megatron-lm-performance-measurements-v2510: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `__ +page provides reference throughput and latency measurements for training +popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `__ + only reflects the latest version of this training benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-megatron-lm-training-v2510: + +Environment setup +================= + +Use the following instructions to set up the environment, configure the script to train models, and +reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker +image. + +.. _amd-megatron-lm-requirements-v2510: + +Download the Docker image +------------------------- + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml + + {% set docker = data.docker %} + 1. Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + 2. Launch the Docker container. + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} + +3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it. + + .. code-block:: shell + + docker start megatron_training_env + docker exec -it megatron_training_env bash + +4. **Megatron-LM backward compatibility setup** -- this Docker is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support. + To roll back to using Megatron-LM, follow these steps: + + .. code-block:: shell + + cd /workspace/Megatron-LM/ + pip uninstall megatron-core + pip install -e . + +The Docker container hosts a verified commit of +``__. + +.. _amd-megatron-lm-environment-setup-v2510: + +Configuration +============= + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b + + Update the ``train_llama3.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. 
container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + Update the ``train_llama2.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Multi-node configuration +------------------------ + +Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node +training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands. + +.. _amd-megatron-lm-tokenizer-v2510: + +Tokenizer +--------- + +You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples. +If the tokenizer is not found, it'll be downloaded if publicly available. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + If you do not have Llama 3.3 tokenizer locally, you need to use your + personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer. + See `Llama-3.3-70B-Instruct + `_. After you are + authorized, use your ``HF_TOKEN`` to download the tokenizer and set the + variable ``TOKENIZER_MODEL`` to the tokenizer path. + + .. code-block:: shell + + export HF_TOKEN= + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct" + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-8B" + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-70B" + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3" + +.. 
container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite" + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Download the Mixtral tokenizer. + + .. code-block:: shell + + mkdir tokenizer + cd tokenizer + export HF_TOKEN= + wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model + + Use the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL=tokenizer/tokenizer.model + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-7B" + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-72B" + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default + value is ``1`` for enabled. + + .. code-block:: bash + + MOCK_DATA=1 + +* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 + + DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +Download the dataset +^^^^^^^^^^^^^^^^^^^^ + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b pyt_megatron_lm_train_llama-3.1-70b-proxy + + For Llama models, use the `prepare_dataset.sh + `_ script + to prepare your dataset. + To download the dataset, set the ``DATASET`` variable to the dataset you'd + like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and + ``DATASET=bookcorpus``. + + .. code-block:: shell + + DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset + DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset + + ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer. + Remember to either pre-download the tokenizer or setup Hugging Face access + otherwise when needed -- see the :ref:`Tokenizer ` section. + + .. note:: + + When training set ``DATA_PATH`` to the specific file name prefix pointing to the ``.bin`` or ``.idx`` + as in the following example: + + .. code-block:: shell + + DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. 
code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. code-block:: shell + + mkdir mixtral-datasets + cd mixtral-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/mixtral-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b pyt_megatron_lm_train_qwen2.5-72b + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. 
code-block:: shell + + mkdir -p temp/qwen-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/qwen-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. _amd-megatron-lm-run-training-v2510: + +Run training +============ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +MI300X Series GPUs with the AMD Megatron-LM environment. + +Before starting training, export the following environment variables. + +.. tab-set:: + + .. tab-item:: MI355X and MI350X + + .. code-block:: shell + + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + export NVTE_CK_USES_BWD_V3=1 + + .. tab-item:: MI325X and MI300X + + .. code-block:: shell + + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + export NVTE_CK_USES_BWD_V3=1 + + # Set this on MI325X/MI300X only + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + +Single node training +-------------------- + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + To run the training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + TOKENIZER_MODEL=meta-llama/Llama-3.3-70B-Instruct \ + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MBS=2 \ + BS=16 \ + TE_FP8=0 \ + TP=1 \ + PP=1 \ + FSDP=1 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=512 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=10 \ + GEMM_TUNING=0 \ + bash examples/llama/train_llama3.sh + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + For Llama 3.1 8B BF16, use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=512 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=10 \ + GEMM_TUNING=1 \ + bash examples/llama/train_llama3.sh + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + +.. 
container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + To run the training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + + To run the training on a single node for Llama 3.1 70B FP8, use the + following command. + + .. note:: + + The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes + to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs + can support the full 70B model with FP8 precision on a single node. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + FSDP=1 \ + TOTAL_ITERS=10 \ + bash examples/llama/train_llama3.sh + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + FP8_WEIGHT_TRANSPOSE_CACHE=0 \ + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + FSDP=1 \ + TOTAL_ITERS=10 \ + NUM_LAYERS=40 \ + bash examples/llama/train_llama3.sh + + .. note:: + + The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes + to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs + can support the full 70B model with FP8 precision on a single node. + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b + + To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + + For Llama 2 7B BF16, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-2-70b + + To run the training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=7 \ + BS=56 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. 
container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + export NVTE_FUSED_ATTN_CK=0 + FORCE_BALANCE=true \ + RUN_ENV=cluster \ + MODEL_SIZE=671B \ + TRAIN_ITERS=50 \ + SEQ_LEN=4096 \ + NUM_LAYERS=3 \ + MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \ + PR=bf16 \ + TP=1 PP=1 ETP=1 EP=8 \ + GEMM_TUNING=1 \ + NVTE_CK_USES_BWD_V3=1 \ + USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \ + GPT_LAYER_IN_TE=true \ + bash examples/deepseek_v3/train_deepseekv3.sh + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + export NVTE_FUSED_ATTN_CK=0 + GEMM_TUNING=1 \ + PR=bf16 \ + MBS=4 \ + AC=none \ + SEQ_LEN=4096 \ + PAD_LEN=4096 \ + TRAIN_ITERS=20 \ + bash examples/deepseek_v2/train_deepseekv2.sh + + .. note:: + + Note that DeepSeek-V2-Lite is experiencing instability due to GPU memory access fault + for large iterations. + For stability, it's recommended to use Primus for this workload. + See :doc:`primus-megatron`. + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=0 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=none \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=4096 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x7B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel) with 4-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=4 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=full \ + NUM_LAYERS=4 \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=8192 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x22B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + To run training on a single node for Qwen 2.5 7B BF16, use the following + command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=0 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + + For FP8, use the following command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=1 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + To run the training on a single node for Qwen 2.5 72B BF16, use the following command. + + .. 
code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + FSDP=1 \ + CP=1 \ + PP=1 \ + MBS=3 \ + BS=24 \ + TE_FP8=0 \ + MODEL_SIZE=72 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-72B \ + RECOMPUTE_ACTIVATIONS=full \ + CKPT_FORMAT=torch_dist + +.. _amd-megatron-lm-multi-node-examples-v2510: + +Multi-node training examples +---------------------------- + +To run training on multiple nodes, launch the Docker container on each node. +For example, for Llama 3 using a two node setup (``NODE0`` as the master node), +use these commands. + +* On the master node ``NODE0``: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=0 \ + bash examples/llama/train_llama3.sh + +* On the worker node ``NODE1``: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=1 \ + bash examples/llama/train_llama3.sh + +Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is +provided in +``__ to +enable training at scale under a SLURM environment. For example, to run +training on 16 nodes, try the following command: + +.. code-block:: shell + + sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh + +.. _amd-megatron-lm-benchmark-test-vars-v2510: + +Key options +----------- + +The benchmark tests support the following sets of variables. + +``TEE_OUTPUT`` + ``1`` to enable training logs or ``0`` to disable. + +``TE_FP8`` + ``0`` for B16 or ``1`` for FP8 -- ``0`` by default. + +``GEMM_TUNING`` + ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels. + +``USE_FLASH_ATTN`` + ``1`` to enable Flash Attention. + +``FSDP`` + ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``, + ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled. + +``ENABLE_PROFILING`` + ``1`` to enable PyTorch profiling for performance analysis. + +``transformer-impl`` + ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE. + +``MODEL_SIZE`` + ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2, for example. + +``TOTAL_ITERS`` + The total number of iterations -- ``10`` by default. + +``MOCK_DATA`` + ``1`` to use mock data or ``0`` to use real data you provide. + +``MBS`` + Micro batch size. + +``BS`` + Global batch size. + +``TP`` / ``TP_SIZE`` + Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on. + +``EP`` / ``EP_SIZE`` + Expert parallel for MoE models. + +``SEQ_LENGTH`` + Input sequence length. + +``PR`` + Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs. + +``AC`` + Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default. + +``NUM_LAYERS`` + Use reduced number of layers as a proxy model. + +``RECOMPUTE_NUM_LAYERS`` + Number of layers used for checkpointing recompute. + +Previous versions +================= + +See :doc:`megatron-lm-history` to find documentation for previous releases +of the ``ROCm/megatron-lm`` Docker image. 
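+
+As a closing reference, the following minimal sketch shows how several of the
+:ref:`key options <amd-megatron-lm-benchmark-test-vars-v2510>` can be combined
+on one command line. The values are illustrative only -- they loosely mirror
+the single-node Llama 3.1 8B BF16 example above -- and are not a tuned
+configuration.
+
+.. code-block:: shell
+
+    # Illustrative only: combining key options with the Llama 3 training script
+    TEE_OUTPUT=1 \
+    GEMM_TUNING=1 \
+    MOCK_DATA=1 \
+    TE_FP8=0 \
+    MBS=2 \
+    BS=128 \
+    TP=1 \
+    SEQ_LENGTH=8192 \
+    MODEL_SIZE=8 \
+    TOTAL_ITERS=10 \
+    bash examples/llama/train_llama3.sh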
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10.rst new file mode 100644 index 000000000..bdbef04f3 --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10.rst @@ -0,0 +1,1081 @@ +:orphan: + +.. meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +******************************************** +Training a model with Primus and Megatron-LM +******************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm Megatron-LM + training performance documentation. See :doc:`../primus-megatron` for the latest version. + +`Primus `__ is a unified and flexible +training framework for AMD Instinct GPUs designed to support multiple training +engine backends -- including Megatron -- to deliver scalable, high-performance +model training. Performance acceleration is powered by `Primus Turbo +`__ and ROCm libraries. + +.. note:: + + For a unified training solution on AMD GPUs with ROCm, the `rocm/megatron-lm + `__ Docker Hub registry will be + deprecated soon in favor of `rocm/primus `__. + The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, + including Megatron-LM and :doc:`torchtitan `. + + Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM + training ` workflow. To learn how to migrate workloads from + Megatron-LM to Primus with Megatron, see + :doc:`megatron-lm-primus-migration-guide`. + +AMD provides a ready-to-use Docker images for MI355X, MI350X, +MI325X, and MI300X GPUs containing essential components for Primus, ROCm, and +Megatron-LM. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + .. tab-set:: + + .. tab-item:: {{ data.docker.pull_tag }} + :sync: {{ data.docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in data.docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + +.. _amd-primus-megatron-lm-model-support-v2510: + +Supported models +================ + +The following models are pre-optimized for performance on AMD Instinct GPUs. +Some instructions, commands, and training examples in this documentation +might vary by model -- select one to get started. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-primus-megatron-lm-training-v2510: + +Environment setup +================= + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + Use the following instructions to set up the environment, configure the script to train models, and + reproduce the benchmark results on AMD Instinct GPUs. + +.. _amd-primus-megatron-lm-requirements-v2510: + +Pull the Docker image + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set docker = data.docker %} + + 1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + 2. Launch the Docker container. + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + --shm-size 128G \ + --name primus_training_env \ + {{ docker.pull_tag }} + + Use these commands if you exit the ``primus_training_env`` container and need to return to it. + + .. code-block:: shell + + docker start primus_training_env + docker exec -it primus_training_env bash + +The Docker container hosts verified branch ``release/v25.10`` of the `Primus +`__ repository. + +.. _amd-primus-megatron-lm-environment-setup-v2510: + +Configuration +============= + +Primus defines a training configuration in YAML for each model in +`examples/megatron/configs `__. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + {% for model_group in model_groups %} + {% for model in model_group.models %} + .. container:: model-doc {{ model.mad_tag }} + + For example, to update training parameters for {{ model.model }}, you can + update ``examples/megatron/configs/{{ model.config_name }}``. Training + configuration YAML files for other models follow this naming convention. + + {% endfor %} + {% endfor %} + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default + value is ``true`` for enabled. + + .. code-block:: yaml + + mock_data: true + +* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset. + + .. code-block:: bash + + mock_data: false + train_data_path: /path/to/your/dataset + + Ensure that the files are accessible inside the Docker container. + +.. 
_amd-primus-megatron-lm-tokenizer-v2510: + +Tokenizer +--------- + +Set the ``HF_TOKEN`` environment variable with +right permissions to access the tokenizer for each model. + +.. code-block:: bash + + # Export your HF_TOKEN in the workspace + export HF_TOKEN= + +.. note:: + + In Primus, each model uses a tokenizer from Hugging Face. For example, Llama + 3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and + ``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model + `__ + definition. + +.. _amd-primus-megatron-lm-run-training-v2510: + +Run training +============ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +AMD Instinct GPUs using Primus with the Megatron backend. + +Single node training +-------------------- + +To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command: + +.. code-block:: shell + + pip install -r requirements.txt + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.3 70B. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run pre-training for Llama 3.3 70B BF16, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/llama3.3_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 6 \ + --global_batch_size 48 \ + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.3_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 2 \ + --global_batch_size 16 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.1 8B. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run pre-training for Llama 3.1 8B FP8, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid \ + --micro_batch_size 4 \ + --global_batch_size 512 \ + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + + For Llama 3.1 8B BF16, use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 4 \ + --global_batch_size 512 \ + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. 
code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 3.1 70B. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run pre-training for Llama 3.1 70B BF16, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 4 \ + --global_batch_size 32 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + + To run the training on a single node for Llama 3.1 70B FP8, use the following command. + + .. note:: + + The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes + to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs + can support the full 70B model with FP8 precision on a single node. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid \ + --no_fp8_weight_transpose_cache true \ + --micro_batch_size 3 \ + --global_batch_size 24 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --num_layers 40 \ + --fp8 hybrid \ + --no_fp8_weight_transpose_cache true + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 2 7B. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run pre-training for Llama 2 7B FP8, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid \ + --micro_batch_size 13 \ + --global_batch_size 416 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + + To run pre-training for Llama 2 7B BF16, run: + + .. tab-set:: + + .. 
tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 10 \ + --global_batch_size 640 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Llama 2 70B. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run pre-training for Llama 2 70B BF16, run: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/llama2_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 17 \ + --global_batch_size 272 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/llama2_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to DeepSeek-V3. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy, + use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/deepseek_v3-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --num_layers 3 \ + --moe_layer_freq 1 \ + --train_iters 50 \ + --micro_batch_size 8 \ + --global_batch_size 64 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/deepseek_v3-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --num_layers 3 \ + --moe_layer_freq 1 \ + --micro_batch_size 3 \ + --global_batch_size 192 \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to DeepSeek-V2-Lite. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16, + use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 12 \ + --global_batch_size 768 + + .. 
tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --global_batch_size 256 + +.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Mixtral 8x7B. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel), + use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 4 \ + --global_batch_size 256 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Mixtral 8x22B. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy, + use the following command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --num_layers 4 \ + --pipeline_model_parallel_size 1 \ + --micro_batch_size 2 \ + --global_batch_size 16 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. code-block:: shell + + # Set the variables for better performance + # only on MI325X and MI300X + export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 + export NVTE_CK_IS_V3_ATOMIC_FP32=1 + + EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --num_layers 4 \ + --pipeline_model_parallel_size 1 \ + --micro_batch_size 1 \ + --global_batch_size 16 + +.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b + + Once setup is complete, run the appropriate training command. + The following run commands are tailored to Qwen 2.5 7B. + See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + + To run training on a single node for Qwen 2.5 7B BF16, use the following + command: + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI350X + + .. code-block:: shell + + EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \ + bash examples/run_pretrain.sh \ + --train_iters 50 \ + --micro_batch_size 16 \ + --global_batch_size 768 + + .. tab-item:: MI300X + :sync: MI325X and MI300X + + .. 
code-block:: shell
+
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50
+
+   For FP8, use the following command.
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid \
+                --micro_batch_size 20 \
+                --global_batch_size 800
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Qwen 2.5 72B.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+
+   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 16 \
+                --global_batch_size 256
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/qwen2.5_72B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50
+
+.. _amd-primus-megatron-multi-node-examples-v2510:
+
+Multi-node training examples
+----------------------------
+
+Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
+training.
+
+To run training on multiple nodes, you can use the
+`run_slurm_pretrain.sh `__ script
+to launch the multi-node workload. Use the following steps to set up your environment:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set docker = data.docker %}
+   .. code-block:: shell
+
+      git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
+      cd Primus
+      git checkout release/v25.10
+      git submodule update --init --recursive
+
+      export DOCKER_IMAGE={{ docker.pull_tag }}
+      export HF_TOKEN=
+      export HSA_NO_SCRATCH_RECLAIM=1
+      export NVTE_CK_USES_BWD_V3=1
+      export NCCL_IB_HCA= # specify which RDMA interfaces to use for communication
+      export NCCL_SOCKET_IFNAME= # your network interface
+      export GLOO_SOCKET_IFNAME= # your network interface
+      export NCCL_IB_GID_INDEX=3 # Set the InfiniBand GID index for NCCL communication. Default is 3 for RoCE
+
+      # Set the variables for better performance
+      # only on MI325X and MI300X
+      export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+      export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+.. note::
+
+   * Make sure the correct network drivers are installed on the nodes. If running inside Docker,
+     either install the drivers inside the container or pass them through from the host when
+     creating the container.
+   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them.
+     However, because NICs can vary across clusters, it is recommended to explicitly export the
+     NCCL parameters for your cluster.
+   * To find your network interface, you can use ``ip a``.
+   * To find RDMA interfaces, you can use ``ibv_devices`` to list all the RDMA/IB devices.
+   * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v2510`) as appropriate.
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.1 8B.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+
+   To train Llama 3.1 8B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --global_batch_size 1024 \
+          --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 2 7B.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+
+   To train Llama 2 7B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --global_batch_size 2048 \
+          --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.1 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+
+   To train Llama 3.1 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 4 \
+          --global_batch_size 256 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 3.1 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 1 \
+          --global_batch_size 256 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 2 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+
+   To train Llama 2 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
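+      # Illustration of the rule above: --global_batch_size 640 assumes a
+      # single-node global batch size of 80, scaled by 8 nodes (8 x 80 = 640).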
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 10 \
+          --global_batch_size 640 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 2 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 1536 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.3 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+
+   To train Llama 3.3 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 4 \
+          --global_batch_size 256 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 3.3 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 1 \
+          --global_batch_size 256 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Mixtral 8x7B.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+
+   To train Mixtral 8x7B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
+      NNODES=8 \
+      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 256
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Qwen 2.5 72B.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+
+   To train Qwen 2.5 72B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
+      NNODES=8 \
+      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 8 \
+          --global_batch_size 512 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+.. _amd-primus-megatron-lm-benchmark-test-vars-v2510:
+
+Key options
+-----------
+
+The following are key options to take note of:
+
+fp8
+   ``hybrid`` enables FP8 GEMMs.
+
+use_torch_fsdp2
+   ``use_torch_fsdp2: 1`` enables torch fsdp-v2. If FSDP is enabled,
+   set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.
+
+profile
+   To enable PyTorch profiling, set these parameters:
+
+   .. code-block:: yaml
+
+      profile: true
+      use_pytorch_profiler: true
+      profile_step_end: 7
+      profile_step_start: 6
+
+train_iters
+   The total number of iterations (default: 50).
+
+mock_data
+   True by default.
+
+micro_batch_size
+   Micro batch size.
+
+global_batch_size
+   Global batch size.
+
+recompute_granularity
+   For activation checkpointing.
+
+num_layers
+   For using a reduced number of layers, as with proxy models.
+
+Known issues
+============
+
+The DeepSeek-V3 and Mixtral 8x22B proxy models might exit with an error
+due to a memory free issue. However, this does not impact training runs. All
+iterations (50 in this case) complete before the exit occurs, and the results
+are still available at the end.
+
+Further reading
+===============
+
+- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
+  Framework for Large Models on AMD GPUs `__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization `_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub `_.
+
+Previous versions
+=================
+
+See :doc:`megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
+
+This training environment now uses Primus with Megatron as the primary
+configuration. Limited support for the legacy ROCm Megatron-LM is still
+available; see the :doc:`../megatron-lm` documentation.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10.rst
new file mode 100644
index 000000000..d85390b7f
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10.rst
@@ -0,0 +1,448 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+****************************************
+Training a model with Primus and PyTorch
+****************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Primus PyTorch training
+   performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
+
+`Primus `__ is a unified and flexible
+LLM training framework that streamlines training on AMD Instinct GPUs
+through a modular, reproducible configuration paradigm.
+Primus now supports the PyTorch torchtitan backend.
+
+.. note::
+
+   For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
+   `__ Docker Hub registry will be
+   deprecated soon in favor of `rocm/primus `__.
+   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
+   including torchtitan and :doc:`Megatron-LM `.
+
+   Primus with the PyTorch torchtitan backend is designed to replace the
+   :doc:`ROCm PyTorch training ` workflow. See
+   :doc:`pytorch-training` for steps to run workloads without Primus.
+
+AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
+MI300X GPUs containing essential components for Primus and PyTorch training
+with Primus Turbo optimizations.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+   .. tab-set::
+
+      .. tab-item:: {{ data.docker.pull_tag }}
+         :sync: {{ data.docker.pull_tag }}
+
+         .. 
list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in data.docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+
+.. _amd-primus-pytorch-model-support-v2510:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. seealso:: + + For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models, + see the documentation :doc:`pytorch-training` (without Primus) + +.. _amd-primus-pytorch-performance-measurements-v2510: + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +This Docker image is optimized for specific model configurations outlined +below. Performance can vary for other training workloads, as AMD +doesn’t test configurations and run conditions outside those described. + +Pull the Docker image +===================== + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml + + Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ data.docker.pull_tag }} + +Run training +============ + +Once the setup is complete, choose between the following two workflows to start benchmarking training. +For fine-tuning workloads and multi-node training examples, see :doc:`pytorch-training` (without Primus). +For best performance on MI325X, MI350X, and MI355X GPUs, you might need to +tweak some configurations (such as batch sizes). + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml + + {% set docker = data.docker %} + {% set model_groups = data.model_groups %} + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + The following run command is tailored to {{ model.model }}. + See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model. + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. For example, use this command to run the performance benchmark test on the {{ model.model }} model + using one node with the {{ model.precision }} data type on the host machine. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{ model.mad_tag }} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the + model are collected in ``~/MAD/perf.csv``. + + {% endfor %} + {% endfor %} + + .. tab-item:: Primus benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + The following run commands are tailored to {{ model.model }}. + See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model. + + .. rubric:: Download the Docker image and required packages + + 1. 
Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + 2. Run the Docker container. + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 64G \ + --name training_env \ + {{ docker.pull_tag }} + + Use these commands if you exit the ``training_env`` container and need to return to it. + + .. code-block:: shell + + docker start training_env + docker exec -it training_env bash + + .. rubric:: Prepare training datasets and dependencies + + The following benchmarking examples require downloading models and datasets + from Hugging Face. To ensure successful access to gated repos, set your + ``HF_TOKEN``. + + .. code-block:: shell + + export HF_TOKEN=$your_personal_hugging_face_access_token + + .. rubric:: Pretraining + + To get started, navigate to the ``Primus`` directory in your container. + + .. code-block:: + + cd /workspace/Primus + + Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script + included with Primus with the appropriate options. + + .. rubric:: Benchmarking examples + + .. container:: model-doc primus_pyt_train_llama-3.1-8b + + Use the following command to run train Llama 3.1 8B with BF16 precision using Primus torchtitan. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 6 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 6 + + .. tab-item:: MI300X + :sync: MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 4 + + + To train Llama 3.1 8B with FP8 precision, use the following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 8 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 7 + + .. tab-item:: MI300X + :sync: MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 5 + + .. container:: model-doc primus_pyt_train_llama-3.1-70b + + Use the following command to run train Llama 3.1 70B with BF16 precision using Primus torchtitan. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 8 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 6 + + .. tab-item:: MI300X + :sync: MI300X + + .. 
code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 4 + + To train Llama 3.1 70B with FP8 precision, use the following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 6 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 5 + + .. tab-item:: MI300X + :sync: MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 3 + + .. container:: model-doc primus_pyt_train_deepseek-v2 + + Use the following command to run train DeepSeek V2 16B with BF16 precision using Primus torchtitan. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X and MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 16 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 10 + + .. tab-item:: MI300X + :sync: MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 8 + + To train DeepSeek V2 16B with FP8 precision, use the following command. + + .. tab-set:: + + .. tab-item:: MI355X and MI350X + :sync: MI355X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 16 + + .. tab-item:: MI325X + :sync: MI325X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 8 + + .. tab-item:: MI300X + :sync: MI300X + + .. code-block:: shell + + EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ + bash examples/run_pretrain.sh --training.local_batch_size 8 + {% endfor %} + {% endfor %} + +Further reading +=============== + +- For an introduction to Primus, see `Primus: A Lightweight, Unified Training + Framework for Large Models on AMD GPUs `__. + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization `_. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`pytorch-training-history` to find documentation for previous releases +of the ``ROCm/pytorch-training`` Docker image. 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst index 57e42aff4..87dda15e5 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst @@ -16,7 +16,7 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub < - Components - Resources - * - v25.10 (latest) + * - v25.11 - * ROCm 7.1.0 * PyTorch 2.10.0.dev20251112+rocm7.1 @@ -25,6 +25,15 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub < * :doc:`PyTorch training (legacy) documentation <../pytorch-training>` * `Docker Hub `__ + * - v25.10 + - + * ROCm 7.1.0 + * PyTorch 2.10.0.dev20251112+rocm7.1 + - + * :doc:`Primus PyTorch Training documentation ` + * :doc:`PyTorch training (legacy) documentation ` + * `Docker Hub `__ + * - v25.9 - * ROCm 7.0.0 diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10.rst new file mode 100644 index 000000000..1e7b471c1 --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10.rst @@ -0,0 +1,669 @@ +:orphan: + +.. meta:: + :description: How to train a model using PyTorch for ROCm. + :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker + +************************************** +Training a model with PyTorch on ROCm +************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm PyTorch training + performance benchmark documentation. See :doc:`../pytorch-training` for the latest version. + +.. note:: + + For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training + `__ Docker Hub registry will be + deprecated soon in favor of `rocm/primus `__. + The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks, + including torchtitan and :doc:`Megatron-LM <../primus-megatron>`. + + See :doc:`../primus-pytorch` for details. + +PyTorch is an open-source machine learning framework that is widely used for +model training with GPU-optimized components for transformer-based models. +The PyTorch for ROCm training Docker image provides a prebuilt optimized +environment for fine-tuning and pretraining a model on AMD Instinct MI325X +and MI300X GPUs. It includes the following software components to accelerate +training workloads: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml + + .. tab-set:: + + .. tab-item:: {{ data.docker.pull_tag }} + :sync: {{ data.docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in data.docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + +.. _amd-pytorch-training-model-support-v2510: + +Supported models +================ + +The following models are pre-optimized for performance on the AMD Instinct +MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and +training recommendations in this documentation might vary by model -- select +one to get started. + +.. 
datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. _amd-pytorch-training-supported-training-modes-v2510: + +The following table lists supported training modes per model. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. dropdown:: Supported training modes + + .. list-table:: + :header-rows: 1 + + * - Model + - Supported training modes + + {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if model.training_modes %} + * - {{ model.model }} + - ``{{ model.training_modes | join('``, ``') }}`` + + {% endif %} + {% endfor %} + {% endfor %} + + .. note:: + + Some model and fine-tuning combinations are not listed. This is + because the `upstream torchtune repository `__ + doesn't provide default YAML configurations for them. + For advanced usage, you can create a custom configuration to enable + unlisted fine-tuning methods by using an existing file in the + ``/workspace/torchtune/recipes/configs`` directory as a template. + +.. _amd-pytorch-training-performance-measurements-v2510: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `_ +page provides reference throughput and latency measurements for training +popular AI models. + +.. note:: + + The performance data presented in + `Performance results with AMD ROCm software `_ + should not be interpreted as the peak performance achievable by AMD + Instinct MI325X and MI300X GPUs or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +This Docker image is optimized for specific model configurations outlined +below. Performance can vary for other training workloads, as AMD +doesn’t test configurations and run conditions outside those described. + +Run training +============ + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml + + {% set docker = data.docker %} + {% set model_groups = data.model_groups %} + + Once the setup is complete, choose between two options to start benchmarking training: + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + The following run command is tailored to {{ model.model }}. + See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model. + + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + 2. For example, use this command to run the performance benchmark test on the {{ model.model }} model + using one node with the {{ model.precision }} data type on the host machine. + + .. 
code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{ model.mad_tag }} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the + model are collected in ``~/MAD/perf.csv``. + + {% endfor %} + {% endfor %} + + .. tab-item:: Standalone benchmarking + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + The following commands are tailored to {{ model.model }}. + See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model. + + {% endfor %} + {% endfor %} + + .. rubric:: Download the Docker image and required packages + + 1. Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + 2. Launch the Docker container. + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 64G \ + --name training_env \ + {{ docker.pull_tag }} + + Use these commands if you exit the ``training_env`` container and need to return to it. + + .. code-block:: shell + + docker start training_env + docker exec -it training_env bash + + 3. In the Docker container, clone the ``__ + repository and navigate to the benchmark scripts directory + ``/workspace/MAD/scripts/pytorch_train``. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD/scripts/pytorch_train + + .. rubric:: Prepare training datasets and dependencies + + 1. The following benchmarking examples require downloading models and datasets + from Hugging Face. To ensure successful access to gated repos, set your + ``HF_TOKEN``. + + .. code-block:: shell + + export HF_TOKEN=$your_personal_hugging_face_access_token + + 2. Run the setup script to install libraries and datasets needed for benchmarking. + + .. code-block:: shell + + ./pytorch_benchmark_setup.sh + + .. container:: model-doc pyt_train_llama-3.1-8b + + ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B: + + .. list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``accelerate`` + - `Hugging Face Accelerate `_ + + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 + + .. container:: model-doc pyt_train_llama-3.1-70b + + ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B: + + .. list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 + + * - ``torchdata`` + - `TorchData `__ + + * - ``tomli`` + - `Tomli `__ + + * - ``tiktoken`` + - `tiktoken `__ + + * - ``blobfile`` + - `blobfile `__ + + * - ``tabulate`` + - `tabulate `__ + + * - ``wandb`` + - `Weights & Biases `__ + + * - ``sentencepiece`` + - `SentencePiece `__ 0.2.0 + + * - ``tensorboard`` + - `TensorBoard `__ 2.18.0 + + .. container:: model-doc pyt_train_flux + + ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX: + + .. 
list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``accelerate`` + - `Hugging Face Accelerate `_ + + * - ``datasets`` + - `Hugging Face Datasets `__ 3.2.0 + + * - ``sentencepiece`` + - `SentencePiece `__ 0.2.0 + + * - ``tensorboard`` + - `TensorBoard `__ 2.18.0 + + * - ``csvkit`` + - `csvkit `__ 2.0.1 + + * - ``deepspeed`` + - `DeepSpeed `__ 0.16.2 + + * - ``diffusers`` + - `Hugging Face Diffusers `__ 0.31.0 + + * - ``GitPython`` + - `GitPython `__ 3.1.44 + + * - ``opencv-python-headless`` + - `opencv-python-headless `__ 4.10.0.84 + + * - ``peft`` + - `PEFT `__ 0.14.0 + + * - ``protobuf`` + - `Protocol Buffers `__ 5.29.2 + + * - ``pytest`` + - `PyTest `__ 8.3.4 + + * - ``python-dotenv`` + - `python-dotenv `__ 1.0.1 + + * - ``seaborn`` + - `Seaborn `__ 0.13.2 + + * - ``transformers`` + - `Transformers `__ 4.47.0 + + ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face: + + * `frank-chieng/chinese_architecture_siheyuan `__ + + {% for model_group in model_groups %} + {% for model in model_group.models %} + {% set training_modes = model.training_modes %} + {% set training_mode_descs = { + "pretrain": "Benchmark pre-training.", + "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision." + } %} + {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %} + {% if available_modes %} + + .. container:: model-doc {{ model.mad_tag }} + + .. rubric:: Pretraining + + To start the pre-training benchmark, use the following command with the + appropriate options. See the following list of options and their descriptions. + + {% if model.mad_tag == "pyt_train_dlrm" %} + + 1. Go to the DLRM directory. + + .. code-block:: shell + + cd /workspace/DLRMBenchmark + + 2. To run the single node training benchmark for DLRM-v2 with TF32 precision, + run the following script. + + .. code-block:: shell + + ./launch_training_single_node.sh + + To run with MAD within the Docker container, use the following command. + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t pretrain -m DLRM + + {% else %} + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \ + -m {{ model.model_repo }} \ + -p $datatype \ + -s $sequence_length + + {% if model.mad_tag == "pyt_train_flux" %} + .. container:: model-doc {{ model.mad_tag }} + + .. note:: + + Currently, FLUX models are not supported out-of-the-box on this Docker. + To use FLUX, refer to ``rocm/pytorch-training`` Docker: :doc:`pytorch-training-v25.6` + + Occasionally, downloading the Flux dataset might fail. In the event of this + error, manually download it from Hugging Face at + `black-forest-labs/FLUX.1-dev `_ + and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access + the required dataset. + {% endif %} + + .. list-table:: + :header-rows: 1 + + * - Name + - Options + - Description + + {% for mode in available_modes %} + * - {% if loop.first %}``$training_mode``{% endif %} + - ``{{ mode }}`` + - {{ training_mode_descs[mode] }} + {% endfor %} + + * - ``$datatype`` + - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %} + - Only Llama 3.1 8B supports FP8 precision. + + * - ``$sequence_length`` + - Sequence length for the language model. + - Between 2048 and 8192. 8192 by default. 
+ {% endif %} + {% endif %} + + {% set training_modes = model.training_modes %} + {% set training_mode_descs = { + "posttrain": "Benchmark post-training.", + } %} + {% set available_modes = training_modes | select("in", ["posttrain"]) | list %} + {% if available_modes %} + + .. container:: model-doc {{ model.mad_tag }} + + .. rubric:: Post-training + + To start the post-training benchmark, use the following command with the + appropriate options. See the following list of options and their descriptions. + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \ + -m {{ model.model_repo }} \ + -p $datatype \ + -s $sequence_length + + .. list-table:: + :header-rows: 1 + + * - Name + - Options + - Description + + {% for mode in available_modes %} + * - {% if loop.first %}``$training_mode``{% endif %} + - ``{{ mode }}`` + - {{ training_mode_descs[mode] }} + {% endfor %} + + * - ``$datatype`` + - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %} + - Only Llama 3.1 8B supports FP8 precision. + + * - ``$sequence_length`` + - Sequence length for the language model. + - Between 2048 and 8192. 8192 by default. + {% endif %} + + {% set training_mode_descs = { + "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).", + "finetune_lora": "LoRA fine-tuning (BF16 supported).", + "finetune_qlora": "QLoRA fine-tuning (BF16 supported).", + "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.", + } %} + {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %} + {% if available_modes %} + .. container:: model-doc {{ model.mad_tag }} + + .. rubric:: Fine-tuning + + To start the fine-tuning benchmark, use the following command with the + appropriate options. See the following list of options and their descriptions. + See :ref:`supported training modes `. + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t $training_mode \ + -m {{ model.model_repo }} \ + -p $datatype \ + -s $sequence_length + + .. list-table:: + :header-rows: 1 + + * - Name + - Options + - Description + + {% for mode in available_modes %} + * - {% if loop.first %}``$training_mode``{% endif %} + - ``{{ mode }}`` + - {{ training_mode_descs[mode] }} + {% endfor %} + + * - ``$datatype`` + - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %} + - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %} + + * - ``$sequence_length`` + - Between 2048 and 16384. + - Sequence length for the language model. + + {% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %} + .. note:: + + For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B), + use the following torchtune commit for compatibility: + + .. code-block:: shell + + git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e + + {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %} + .. note:: + + You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of + input tensor should be smaller than max_seq_len (4096)``. + This error indicates that an input sequence is longer than the model's maximum context window. + + Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096 + tokens in this case). 
You can resolve this by truncating the input or splitting + it into smaller chunks before passing it to the model. + + Note on reproducibility: The results in this guide are based on + commit ``b4c98ac`` from the upstream + ``__ repository. For the + latest updates, you can use the main branch. + + {% endif %} + {% endif %} + {% endfor %} + {% endfor %} + + .. rubric:: Benchmarking examples + + For examples of benchmarking commands, see ``__. + +.. _amd-pytorch-training-multinode-examples-v2510: + +Multi-node training +------------------- + +Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node +training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands. + +Pre-training +~~~~~~~~~~~~ + +Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B. + +To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository. + +.. code-block:: shell + + # In the MAD repository + cd scripts/pytorch_train + sbatch run_slurm_train.sh + +Fine-tuning +~~~~~~~~~~~ + +Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B. + +To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository. + +.. code-block:: shell + + huggingface-cli login # Get access to HF Llama model space + huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally + # In the MAD repository + cd scripts/pytorch_train + sbatch Torchtune_Multinode.sh + +.. note:: + + Information regarding benchmark setup: + + * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``. + * You can adjust the torchtune `YAML configuration file + `__ + if you're using a different model. + * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``. + * Set the ``mounting_paths`` inside the SLURM script. + +Once the run is finished, you can find the log files in the ``result_torchtune/`` directory. + +Further reading +=============== + +- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__. + +- To learn more about system settings and management practices to configure your system for + AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization `_. + +- For a list of other ready-made Docker images for AI with ROCm, see + `AMD Infinity Hub `_. + +Previous versions +================= + +See :doc:`pytorch-training-history` to find documentation for previous releases +of the ``ROCm/pytorch-training`` Docker image. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst index ed0c2a637..857d79f33 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst @@ -47,7 +47,7 @@ Megatron-LM. - {{ component_version }} {% endfor %} -.. _amd-primus-megatron-lm-model-support-v2510: +.. _amd-primus-megatron-lm-model-support-v25.11: Supported models ================ @@ -108,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben `. This suite of tests will help you verify and fine-tune your system's configuration. -.. _mi300x-amd-primus-megatron-lm-training-v2510: +.. 
_mi300x-amd-primus-megatron-lm-training-v25.11: Environment setup ================= @@ -118,7 +118,7 @@ Environment setup Use the following instructions to set up the environment, configure the script to train models, and reproduce the benchmark results on AMD Instinct GPUs. -.. _amd-primus-megatron-lm-requirements-v2510: +.. _amd-primus-megatron-lm-requirements-v25.11: Pull the Docker image @@ -157,16 +157,16 @@ Pull the Docker image docker start primus_training_env docker exec -it primus_training_env bash -The Docker container hosts verified branch ``release/v25.10`` of the `Primus -`__ repository. +The Docker container hosts verified commit ``c4c083de`` of the `Primus +`__ repository. -.. _amd-primus-megatron-lm-environment-setup-v2510: +.. _amd-primus-megatron-lm-environment-setup-v25.11: Configuration ============= Primus defines a training configuration in YAML for each model in -`examples/megatron/configs `__. +`examples/megatron/configs `__. .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml @@ -207,7 +207,7 @@ You can use either mock data or real data for training. Ensure that the files are accessible inside the Docker container. -.. _amd-primus-megatron-lm-tokenizer-v2510: +.. _amd-primus-megatron-lm-tokenizer-v25.11: Tokenizer --------- @@ -228,7 +228,7 @@ right permissions to access the tokenizer for each model. `__ definition. -.. _amd-primus-megatron-lm-run-training-v2510: +.. _amd-primus-megatron-lm-run-training-v25.11: Run training ============ @@ -252,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.3 70B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run pre-training for Llama 3.3 70B BF16, run: @@ -263,11 +263,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/llama3.3_70B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 6 \ - --global_batch_size 48 \ + EXP=examples/megatron/configs/MI355X/llama3.3_70B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -279,17 +276,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/llama3.3_70B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 2 \ - --global_batch_size 16 + EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.1 8B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run pre-training for Llama 3.1 8B FP8, run: @@ -300,12 +294,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. 
code-block:: shell - EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --fp8 hybrid \ - --micro_batch_size 4 \ - --global_batch_size 512 \ + EXP=examples/megatron/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -317,10 +307,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --fp8 hybrid + EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \ + bash ./examples/run_pretrain.sh For Llama 3.1 8B BF16, use the following command: @@ -331,11 +319,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 4 \ - --global_batch_size 512 \ + EXP=examples/megatron/configs/MI355X/llama3.1_BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -347,15 +332,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 + EXP=examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.1 70B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run pre-training for Llama 3.1 70B BF16, run: @@ -366,11 +350,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 4 \ - --global_batch_size 32 + EXP=examples/megatron/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -382,9 +363,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 + EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh To run the training on a single node for Llama 3.1 70B FP8, use the following command. @@ -401,13 +381,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --fp8 hybrid \ - --no_fp8_weight_transpose_cache true \ - --micro_batch_size 3 \ - --global_batch_size 24 + EXP=examples/megatron/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. 
tab-item:: MI300X :sync: MI325X and MI300X @@ -419,7 +394,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \ bash ./examples/run_pretrain.sh \ --train_iters 50 \ --num_layers 40 \ @@ -430,7 +405,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 7B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run pre-training for Llama 2 7B FP8, run: @@ -441,12 +416,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --fp8 hybrid \ - --micro_batch_size 13 \ - --global_batch_size 416 + EXP=examples/megatron/configs/MI355X/llama2_7B-FP8-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -458,10 +429,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --fp8 hybrid + EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \ + bash ./examples/run_pretrain.sh To run pre-training for Llama 2 7B BF16, run: @@ -472,11 +441,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 10 \ - --global_batch_size 640 + EXP=examples/megatron/configs/MI355X/llama2_7B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -488,15 +454,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 + EXP=examples/megatron/configs/MI300X/llama2_7B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 70B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run pre-training for Llama 2 70B BF16, run: @@ -507,11 +472,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/llama2_70B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 17 \ - --global_batch_size 272 + EXP=examples/megatron/configs/MI355X/llama2_70B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. 
tab-item:: MI300X :sync: MI325X and MI300X @@ -523,15 +485,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/llama2_70B-pretrain.yaml \ - bash ./examples/run_pretrain.sh \ - --train_iters 50 + EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \ + bash ./examples/run_pretrain.sh .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy Once setup is complete, run the appropriate training command. The following run commands are tailored to DeepSeek-V3. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy, use the following command: @@ -543,7 +504,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/deepseek_v3-pretrain.yaml \ + EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml \ bash examples/run_pretrain.sh \ --num_layers 3 \ --moe_layer_freq 1 \ @@ -561,19 +522,17 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/deepseek_v3-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/deepseek_v3-BF16-pretrain.yaml \ bash examples/run_pretrain.sh \ --num_layers 3 \ --moe_layer_freq 1 \ - --micro_batch_size 3 \ - --global_batch_size 192 \ --train_iters 50 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b Once setup is complete, run the appropriate training command. The following run commands are tailored to DeepSeek-V2-Lite. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16, use the following command: @@ -585,11 +544,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 12 \ - --global_batch_size 768 + EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -601,16 +557,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 \ - --global_batch_size 256 + EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh .. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b Once setup is complete, run the appropriate training command. The following run commands are tailored to Mixtral 8x7B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. 
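+      In most cases, switching to another supported model only means pointing ``EXP`` at
+      that model's configuration file; the run script stays the same. As an illustration
+      only, the following reuses the Llama 3.1 8B BF16 config shown earlier in this guide
+      (some models, such as the proxy configurations, also pass extra overrides, and
+      MI355X/MI350X systems use the matching config under the ``MI355X`` directory):
+
+      .. code-block:: shell
+
+         EXP=examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+         bash ./examples/run_pretrain.sh
+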
To run training on a single node for Mixtral 8x7B (MoE with expert parallel), use the following command: @@ -622,11 +576,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 4 \ - --global_batch_size 256 + EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -638,7 +589,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 @@ -646,7 +597,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Mixtral 8x22B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy, use the following command: @@ -658,13 +609,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 \ - --num_layers 4 \ - --pipeline_model_parallel_size 1 \ - --micro_batch_size 2 \ - --global_batch_size 16 + EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -676,7 +622,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \ bash examples/run_pretrain.sh \ --train_iters 50 \ --num_layers 4 \ @@ -688,7 +634,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the Once setup is complete, run the appropriate training command. The following run commands are tailored to Qwen 2.5 7B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run training on a single node for Qwen 2.5 7B BF16, use the following command: @@ -700,11 +646,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 \ - --micro_batch_size 16 \ - --global_batch_size 768 + EXP=examples/megatron/configs/MI355X/qwen2.5_7B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh .. 
tab-item:: MI300X :sync: MI325X and MI300X @@ -716,9 +659,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 + EXP=examples/megatron/configs/MI300X/qwen2.5_7B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh For FP8, use the following command. @@ -729,12 +671,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the .. code-block:: shell - EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 \ - --fp8 hybrid - --micro_batch_size 20 \ - --global_batch_size 800 + EXP=examples/megatron/configs/MI355X/qwen2.5_7B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh .. tab-item:: MI300X :sync: MI325X and MI300X @@ -746,16 +684,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 \ - --fp8 hybrid + EXP=examples/megatron/configs/MI300X/qwen2.5_7B-FP8-pretrain.yaml \ + bash examples/run_pretrain.sh .. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b Once setup is complete, run the appropriate training command. The following run commands are tailored to Qwen 2.5 72B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To run the training on a single node for Qwen 2.5 72B BF16, use the following command. @@ -782,11 +718,10 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1 export NVTE_CK_IS_V3_ATOMIC_FP32=1 - EXP=examples/megatron/configs/MI300X/qwen2.5_72B-pretrain.yaml \ - bash examples/run_pretrain.sh \ - --train_iters 50 + EXP=examples/megatron/configs/MI300X/qwen2.5_72B-BF16-pretrain.yaml \ + bash examples/run_pretrain.sh -.. _amd-primus-megatron-multi-node-examples-v2510: +.. _amd-primus-megatron-multi-node-examples-v25.11: Multi-node training examples ---------------------------- @@ -805,7 +740,7 @@ to launch the multi-node workload. Use the following steps to setup your environ git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git cd Primus - git checkout release/v25.10 + git checkout c4c083de64ba3e8f19ccc9629411267108931f9e git submodule update --init --recursive export DOCKER_IMAGE={{ docker.pull_tag }} @@ -828,13 +763,13 @@ to launch the multi-node workload. Use the following steps to setup your environ * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect. However, since NICs can vary accross different cluster, it is encouraged to explicitly export your NCCL parameters for the cluster. * To find your network interface, you can use ``ip a``. * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices. - * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v2510`) as appropriate. + * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v25.11`) as appropriate. .. 
container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.1 8B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To train Llama 3.1 8B FP8 on 8 nodes, run: @@ -843,16 +778,15 @@ to launch the multi-node workload. Use the following steps to setup your environ # Adjust the training parameters. # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case. NNODES=8 \ - EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \ bash ./examples/run_slurm_pretrain.sh \ --global_batch_size 1024 \ - --fp8 hybrid .. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 7B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To train Llama 2 7B FP8 on 8 nodes, run: @@ -861,16 +795,15 @@ to launch the multi-node workload. Use the following steps to setup your environ # Adjust the training parameters. # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case. NNODES=8 \ - EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \ bash ./examples/run_slurm_pretrain.sh \ --global_batch_size 2048 \ - --fp8 hybrid .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.1 70B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To train Llama 3.1 70B FP8 on 8 nodes, run: @@ -879,20 +812,18 @@ to launch the multi-node workload. Use the following steps to setup your environ # Adjust the training parameters. # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case. NNODES=8 \ - EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \ bash examples/run_slurm_pretrain.sh \ --micro_batch_size 4 \ --global_batch_size 256 \ --recompute_num_layers 80 \ - --no_fp8_weight_transpose_cache true \ - --fp8 hybrid To train Llama 3.1 70B BF16 on 8 nodes, run: .. code-block:: shell NNODES=8 \ - EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \ bash examples/run_slurm_pretrain.sh \ --micro_batch_size 1 \ --global_batch_size 256 \ @@ -902,7 +833,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 70B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To train Llama 2 70B FP8 on 8 nodes, run: @@ -911,20 +842,18 @@ to launch the multi-node workload. Use the following steps to setup your environ # Adjust the training parameters. 
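+      # As general guidance, micro_batch_size stays a per-GPU setting, while
+      # global_batch_size grows in proportion to the number of nodes.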
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case. NNODES=8 \ - EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama2_70B-FP8-pretrain.yaml \ bash examples/run_slurm_pretrain.sh \ --micro_batch_size 10 \ --global_batch_size 640 \ --recompute_num_layers 80 \ - --no_fp8_weight_transpose_cache true \ - --fp8 hybrid To train Llama 2 70B BF16 on 8 nodes, run: .. code-block:: shell NNODES=8 \ - EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \ bash ./examples/run_slurm_pretrain.sh \ --micro_batch_size 2 \ --global_batch_size 1536 \ @@ -934,7 +863,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 3.3 70B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To train Llama 3.3 70B FP8 on 8 nodes, run: @@ -943,20 +872,18 @@ to launch the multi-node workload. Use the following steps to setup your environ # Adjust the training parameters. # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case NNODES=8 \ - EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama3.3_70B-FP8-pretrain.yaml \ bash examples/run_slurm_pretrain.sh \ --micro_batch_size 4 \ --global_batch_size 256 \ --recompute_num_layers 80 \ - --no_fp8_weight_transpose_cache true \ - --fp8 hybrid To train Llama 3.3 70B BF16 on 8 nodes, run: .. code-block:: shell NNODES=8 \ - EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \ bash examples/run_slurm_pretrain.sh \ --micro_batch_size 1 \ --global_batch_size 256 \ @@ -966,7 +893,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 70B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To train Mixtral 8x7B BF16 on 8 nodes, run: @@ -975,7 +902,7 @@ to launch the multi-node workload. Use the following steps to setup your environ # Adjust the training parameters. # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case NNODES=8 \ - EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \ + EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \ bash examples/run_slurm_pretrain.sh \ --micro_batch_size 2 \ --global_batch_size 256 @@ -984,7 +911,7 @@ to launch the multi-node workload. Use the following steps to setup your environ Once setup is complete, run the appropriate training command. The following run commands are tailored to Llama 2 70B. - See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model. To train Qwen2.5 72B FP8 on 8 nodes, run: @@ -993,15 +920,13 @@ to launch the multi-node workload. Use the following steps to setup your environ # Adjust the training parameters. 
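+      # The batch sizes below assume 8 nodes; --recompute_num_layers trades extra
+      # compute for lower peak activation memory on this 72B model.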
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case NNODES=8 \ - EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \ + EXP=examples/megatron/configs/qwen2.5_72B-FP8-pretrain.yaml \ bash examples/run_slurm_pretrain.sh \ --micro_batch_size 8 \ --global_batch_size 512 \ --recompute_num_layers 80 \ - --no_fp8_weight_transpose_cache true \ - --fp8 hybrid -.. _amd-primus-megatron-lm-benchmark-test-vars-v2510: +.. _amd-primus-megatron-lm-benchmark-test-vars-v25.11: Key options ----------- diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst index 77318bdaa..5323adced 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst @@ -45,7 +45,7 @@ with Primus Turbo optimizations. - {{ component_version }} {% endfor %} -.. _amd-primus-pytorch-model-support-v2510: +.. _amd-primus-pytorch-model-support-v25.11: Supported models ================ @@ -91,7 +91,7 @@ vary by model -- select one to get started. For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models, see the documentation :doc:`pytorch-training` (without Primus) -.. _amd-primus-pytorch-performance-measurements-v2510: +.. _amd-primus-pytorch-performance-measurements-v25.11: System validation ================= @@ -146,7 +146,7 @@ tweak some configurations (such as batch sizes). .. container:: model-doc {{ model.mad_tag }} The following run command is tailored to {{ model.model }}. - See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model. 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local directory and install the required packages on the host machine. @@ -184,7 +184,7 @@ tweak some configurations (such as batch sizes). .. container:: model-doc {{ model.mad_tag }} The following run commands are tailored to {{ model.model }}. - See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model. + See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model. .. rubric:: Download the Docker image and required packages @@ -220,6 +220,9 @@ tweak some configurations (such as batch sizes). docker start training_env docker exec -it training_env bash + The Docker container hosts verified commit ``c4c083de`` of the `Primus + `__ repository. + .. rubric:: Prepare training datasets and dependencies The following benchmarking examples require downloading models and datasets @@ -255,7 +258,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 6 + bash examples/run_pretrain.sh .. tab-item:: MI325X :sync: MI325X @@ -263,7 +266,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 6 + bash examples/run_pretrain.sh --training.local_batch_size 6 .. tab-item:: MI300X :sync: MI300X @@ -271,8 +274,7 @@ tweak some configurations (such as batch sizes). .. 
code-block:: shell EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 4 - + bash examples/run_pretrain.sh To train Llama 3.1 8B with FP8 precision, use the following command. @@ -284,7 +286,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 8 + bash examples/run_pretrain.sh .. tab-item:: MI325X :sync: MI325X @@ -292,7 +294,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 7 + bash examples/run_pretrain.sh --training.local_batch_size 7 .. tab-item:: MI300X :sync: MI300X @@ -300,7 +302,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 5 + bash examples/run_pretrain.sh .. container:: model-doc primus_pyt_train_llama-3.1-70b @@ -314,7 +316,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 8 + bash examples/run_pretrain.sh .. tab-item:: MI325X :sync: MI325X @@ -322,7 +324,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 6 + bash examples/run_pretrain.sh --training.local_batch_size 6 .. tab-item:: MI300X :sync: MI300X @@ -330,7 +332,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 4 + bash examples/run_pretrain.sh To train Llama 3.1 70B with FP8 precision, use the following command. @@ -342,7 +344,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 6 + bash examples/run_pretrain.sh .. tab-item:: MI325X :sync: MI325X @@ -350,7 +352,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 5 + bash examples/run_pretrain.sh --training.local_batch_size 5 .. tab-item:: MI300X :sync: MI300X @@ -358,11 +360,11 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 3 + bash examples/run_pretrain.sh - .. container:: model-doc primus_pyt_train_deepseek-v2 + .. container:: model-doc primus_pyt_train_deepseek-v3-16b - Use the following command to run train DeepSeek V2 16B with BF16 precision using Primus torchtitan. + Use the following command to run train DeepSeek V3 16B with BF16 precision using Primus torchtitan. .. tab-set:: @@ -372,7 +374,7 @@ tweak some configurations (such as batch sizes). .. 
code-block:: shell EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 16 + bash examples/run_pretrain.sh .. tab-item:: MI325X :sync: MI325X @@ -380,7 +382,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 10 + bash examples/run_pretrain.sh --training.local_batch_size 10 .. tab-item:: MI300X :sync: MI300X @@ -388,35 +390,7 @@ tweak some configurations (such as batch sizes). .. code-block:: shell EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 8 - - To train DeepSeek V2 16B with FP8 precision, use the following command. - - .. tab-set:: - - .. tab-item:: MI355X and MI350X - :sync: MI355X - - .. code-block:: shell - - EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 16 - - .. tab-item:: MI325X - :sync: MI325X - - .. code-block:: shell - - EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 8 - - .. tab-item:: MI300X - :sync: MI300X - - .. code-block:: shell - - EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \ - bash examples/run_pretrain.sh --training.local_batch_size 8 + bash examples/run_pretrain.sh {% endfor %} {% endfor %} diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst index f1e8c7f09..f43dab087 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst @@ -43,7 +43,7 @@ training workloads: - {{ component_version }} {% endfor %} -.. _amd-pytorch-training-model-support-v2510: +.. _amd-pytorch-training-model-support-v25.11: Supported models ================ @@ -85,7 +85,7 @@ one to get started. -.. _amd-pytorch-training-supported-training-modes-v2510: +.. _amd-pytorch-training-supported-training-modes-v25.11: The following table lists supported training modes per model. @@ -120,7 +120,7 @@ The following table lists supported training modes per model. unlisted fine-tuning methods by using an existing file in the ``/workspace/torchtune/recipes/configs`` directory as a template. -.. _amd-pytorch-training-performance-measurements-v2510: +.. _amd-pytorch-training-performance-measurements-v25.11: Performance measurements ======================== @@ -176,7 +176,7 @@ Run training .. container:: model-doc {{ model.mad_tag }} The following run command is tailored to {{ model.model }}. - See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model. + See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model. 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local directory and install the required packages on the host machine. @@ -214,7 +214,7 @@ Run training .. container:: model-doc {{ model.mad_tag }} The following commands are tailored to {{ model.model }}. - See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model. + See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model. 
{% endfor %} {% endfor %} @@ -532,7 +532,7 @@ Run training To start the fine-tuning benchmark, use the following command with the appropriate options. See the following list of options and their descriptions. - See :ref:`supported training modes `. + See :ref:`supported training modes `. .. code-block:: shell @@ -597,7 +597,7 @@ Run training For examples of benchmarking commands, see ``__. -.. _amd-pytorch-training-multinode-examples-v2510: +.. _amd-pytorch-training-multinode-examples-v25.11: Multi-node training ------------------- diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 592ce9dc1..cb1b6c78d 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -75,8 +75,14 @@ subtrees: - entries: - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst title: Train a model with Primus and Megatron-LM + entries: + - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst + title: Train a model with Megatron-LM - file: how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst title: Train a model with Primus and PyTorch + entries: + - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst + title: Train a model with PyTorch - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst title: Train a model with JAX MaxText - file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry