From 98029db4eeb729d7ab709d79c03f867966b275a1 Mon Sep 17 00:00:00 2001 From: Peter Park Date: Thu, 21 Aug 2025 23:50:55 -0400 Subject: [PATCH] docs: Add Primus (Megatron) training Docker documentation (#5218) --- .wordlist.txt | 6 + .../megatron-lm-benchmark-models.yaml | 27 +- .../megatron-lm-v25.6-benchmark-models.yaml | 60 + .../primus-megatron-benchmark-models.yaml | 58 + .../training/benchmark-docker/megatron-lm.rst | 53 +- .../previous-versions/megatron-lm-history.rst | 12 +- .../megatron-lm-primus-migration-guide.rst | 175 +++ .../previous-versions/megatron-lm-v25.6.rst | 1041 +++++++++++++++++ .../benchmark-docker/primus-megatron.rst | 602 ++++++++++ docs/how-to/rocm-for-ai/training/index.rst | 2 + docs/sphinx/_toc.yml.in | 4 +- 11 files changed, 1994 insertions(+), 46 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml create mode 100644 docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst diff --git a/.wordlist.txt b/.wordlist.txt index 7b592fc91..9d0f4d6cc 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -116,6 +116,7 @@ Deprecations DevCap DirectX Dockerfile +Dockerized Doxygen dropless ELMo @@ -361,6 +362,7 @@ PowerEdge PowerShell Pretrained Pretraining +Primus Profiler's PyPi Pytest @@ -525,6 +527,7 @@ Xilinx Xnack Xteam YAML +YAMLs YML YModel ZeRO @@ -585,6 +588,7 @@ completers composable concretization config +configs conformant constructible convolutional @@ -795,7 +799,9 @@ preprocessing preprocessor prequantized prerequisites +pretrain pretraining +primus profiler profilers protobuf diff --git a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml index 77eaa5ba0..c743e00b6 100644 --- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml @@ -1,26 +1,15 @@ dockers: - - pull_tag: rocm/megatron-lm:v25.6_py312 - docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0 + - pull_tag: rocm/megatron-lm:v25.7_py310 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a components: - ROCm: 6.4.1 - PyTorch: 2.8.0a0+git7d205b2 - Python: 3.12 - Transformer Engine: 2.1.0.dev0+8c4a512 - hipBLASLt: 393e413 - Triton: 3.3.0 - RCCL: 2.23.4.7a84c5d - doc_name: Ubuntu 24.04 + Python 3.12 - - pull_tag: rocm/megatron-lm:v25.6_py310 - docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6 - components: - ROCm: 6.4.1 - PyTorch: 2.8.0a0+git7d205b2 + ROCm: 6.4.2 + Primus: v0.1.0-rc1 + PyTorch: 2.8.0a0+gitd06a406 Python: "3.10" - Transformer Engine: 2.1.0.dev0+8c4a512 - hipBLASLt: 393e413 + Transformer Engine: 2.1.0.dev0+ba586519 + hipBLASLt: 37ba1d36 Triton: 3.3.0 - RCCL: 2.23.4.7a84c5d - doc_name: Ubuntu 22.04 + Python 3.10 + RCCL: 2.22.3 model_groups: - group: Meta Llama tag: llama diff --git 
a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml new file mode 100644 index 000000000..77eaa5ba0 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml @@ -0,0 +1,60 @@ +dockers: + - pull_tag: rocm/megatron-lm:v25.6_py312 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0 + components: + ROCm: 6.4.1 + PyTorch: 2.8.0a0+git7d205b2 + Python: 3.12 + Transformer Engine: 2.1.0.dev0+8c4a512 + hipBLASLt: 393e413 + Triton: 3.3.0 + RCCL: 2.23.4.7a84c5d + doc_name: Ubuntu 24.04 + Python 3.12 + - pull_tag: rocm/megatron-lm:v25.6_py310 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6 + components: + ROCm: 6.4.1 + PyTorch: 2.8.0a0+git7d205b2 + Python: "3.10" + Transformer Engine: 2.1.0.dev0+8c4a512 + hipBLASLt: 393e413 + Triton: 3.3.0 + RCCL: 2.23.4.7a84c5d + doc_name: Ubuntu 22.04 + Python 3.10 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: pyt_megatron_lm_train_llama-3.3-70b + - model: Llama 3.1 8B + mad_tag: pyt_megatron_lm_train_llama-3.1-8b + - model: Llama 3.1 70B + mad_tag: pyt_megatron_lm_train_llama-3.1-70b + - model: Llama 3.1 70B (proxy) + mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy + - model: Llama 2 7B + mad_tag: pyt_megatron_lm_train_llama-2-7b + - model: Llama 2 70B + mad_tag: pyt_megatron_lm_train_llama-2-70b + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) + mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy + - model: DeepSeek-V2-Lite + mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: pyt_megatron_lm_train_mixtral-8x7b + - model: Mixtral 8x22B (proxy) + mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: pyt_megatron_lm_train_qwen2.5-7b + - model: Qwen 2.5 72B + mad_tag: pyt_megatron_lm_train_qwen2.5-72b diff --git a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml new file mode 100644 index 000000000..fec474f59 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml @@ -0,0 +1,58 @@ +dockers: + - pull_tag: rocm/megatron-lm:v25.7_py310 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a + components: + ROCm: 6.4.2 + Primus: v0.1.0-rc1 + PyTorch: 2.8.0a0+gitd06a406 + Python: "3.10" + Transformer Engine: 2.1.0.dev0+ba586519 + hipBLASLt: 37ba1d36 + Triton: 3.3.0 + RCCL: 2.22.3 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b + config_name: llama3.3_70B-pretrain.yaml + - model: Llama 3.1 70B + mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b + config_name: llama3.1_70B-pretrain.yaml + - model: Llama 3.1 8B + mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b + config_name: llama3.1_8B-pretrain.yaml + - model: Llama 2 7B + mad_tag: primus_pyt_megatron_lm_train_llama-2-7b + config_name: llama2_7B-pretrain.yaml 
+ - model: Llama 2 70B + mad_tag: primus_pyt_megatron_lm_train_llama-2-70b + config_name: llama2_70B-pretrain.yaml + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) + mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy + config_name: deepseek_v3-pretrain.yaml + - model: DeepSeek-V2-Lite + mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b + config_name: deepseek_v2_lite-pretrain.yaml + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b + config_name: mixtral_8x7B_v0.1-pretrain.yaml + - model: Mixtral 8x22B (proxy) + mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy + config_name: mixtral_8x22B_v0.1-pretrain.yaml + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b + config_name: primus_qwen2.5_7B-pretrain.yaml + - model: Qwen 2.5 72B + mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b + config_name: qwen2.5_72B-pretrain.yaml diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst index f9759c762..687cc514f 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst @@ -1,3 +1,5 @@ +:orphan: + .. meta:: :description: How to train a model using Megatron-LM for ROCm. :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch @@ -6,6 +8,14 @@ Training a model with Megatron-LM for ROCm ****************************************** +.. caution:: + + The ROCm Megatron-LM framework now has limited support with this Docker + environment; it now focuses on Primus with Megatron-Core. See :doc:`primus-megatron`. + + To learn how to migrate your existing workloads to Primus with Megatron-Core, + see :doc:`previous-versions/megatron-lm-primus-migration-guide`. + The `Megatron-LM framework for ROCm `_ is a specialized fork of the robust Megatron-LM, designed to enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD @@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM utilities. It contains the following software components to accelerate training workloads: +.. note:: + + This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with + Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release `. + .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml {% set dockers = data.dockers %} - {% if dockers|length > 1 %} .. tab-set:: - {% for docker in data.dockers %} + {% for docker in dockers %} .. tab-item:: ``{{ docker.pull_tag }}`` :sync: {{ docker.pull_tag }} @@ -42,28 +56,14 @@ workloads: {% endfor %} {% endfor %} - {% elif dockers|length == 1 %} - .. list-table:: - :header-rows: 1 - - * - Software component - - Version - - {% for component_name, component_version in docker.components %} - * - {{ component_name }} - - {{ component_version }} - - {% endfor %} - {% endif %} .. _amd-megatron-lm-model-support: - The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. - Supported models ================ - The following models are supported for training performance benchmarking with Megatron-LM and ROCm. 
+ The following models are supported for training performance benchmarking with Megatron-LM and ROCm + on AMD Instinct MI300X series accelerators. Some instructions, commands, and training recommendations in this documentation might vary by model -- select one to get started. @@ -177,7 +177,7 @@ Download the Docker image {% if dockers|length > 1 %} .. tab-set:: - {% for docker in data.dockers %} + {% for docker in dockers %} .. tab-item:: {{ docker.doc_name }} :sync: {{ docker.pull_tag }} @@ -227,10 +227,17 @@ Download the Docker image docker start megatron_training_env docker exec -it megatron_training_env bash -The Docker container includes a pre-installed, verified version of the ROCm -Megatron-LM development branch -``__, including necessary -training scripts. +4. **Megatron-LM backward compatibility setup** -- this Docker is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support. + To roll back to using Megatron-LM, follow these steps: + + .. code-block:: shell + + cd /workspace/Megatron-LM/ + pip uninstall megatron-core + pip install -e . + +The Docker container hosts +``__ at verified commit ``e8e9edc``. .. _amd-megatron-lm-environment-setup: diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst index 9dd1c8f2c..f4ed199ef 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst @@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub ` + * `Docker Hub (py310) `__ + + * - v25.6 - * ROCm 6.4.1 * PyTorch 2.8.0a0+git7d205b2 - - * :doc:`Documentation <../megatron-lm>` + * :doc:`Documentation ` * `Docker Hub (py312) `__ * `Docker Hub (py310) `__ diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst new file mode 100644 index 000000000..9275c1f39 --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst @@ -0,0 +1,175 @@ +:orphan: + +********************************************************************** +Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM +********************************************************************** + +Primus supports Megatron-Core as backend optimization library, +replacing ROCm Megatron-LM. This document outlines the steps to migrate +workload from ROCm Megatron-LM to Primus with the Megatron-Core backend. + +Model architecture +================== + +ROCm Megatron-LM defines model architecture parameters in the training scripts; +for example, the Llama 3 8B model parameters are defined in +`examples/llama/train_llama3.sh `__ +as shown below: + +.. code-block:: bash + + HIDDEN_SIZE=4096 + FFN_HIDDEN_SIZE=14336 + NUM_LAYERS=32 + NUM_HEADS=32 + NUM_KV_HEADS=8 + +Primus defines the model architecture through model YAML configuration files +inside the ``primus/configs/models/megatron/`` repository. For example, Llama 3 8B +model architecture parameters are defined in +`primus/configs/models/megatron/llama3_8B.yaml `__ +as shown below: + +.. 
code-block:: yaml + + bases: + - llama3_base.yaml + + tokenizer_type: Llama3Tokenizer + tokenizer_model: meta-llama/Llama-3.1-8B + + ffn_hidden_size: 14336 + hidden_size: 4096 + num_attention_heads: 32 + num_layers: 32 + num_query_groups: 8 + +Primus' model config files follow a hierarchical design, meaning that new model +config YAMLs can inherit existing model config files by importing them as +bases. For example, +`llama3.1_8B.yaml `__ +uses ``llama3_8B.yaml`` as a base config and overrides few parameters, as shown below. +In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value: + +.. code-block:: yaml + + bases: + - llama3_8B.yaml + + tokenizer_type: Llama3Tokenizer + tokenizer_model: meta-llama/Llama-3.1-8B + + max_position_embeddings: 131072 + +.. tip:: + + Primus provides ``llama_base.yaml`` as the base configuration, which can be + used as bases for additional model architectures. For example, + `mixtral_base.yaml `__ + and + `deepseek_v3_base.yaml `__ + define ``llama_base.yaml`` as its base. + + .. code-block:: yaml + + # Example mixtral_base.yaml: + + bases: + - llama_base.yaml + + init_method_std: 0.01 + rotary_base: 1000000 + qk_layernorm: false + + group_query_attention: true + num_query_groups: 8 + + # moe parameters + num_experts: 8 + moe_router_topk: 2 + moe_router_load_balancing_type: aux_loss + moe_aux_loss_coeff: 1e-2 + moe_grouped_gemm: true + moe_token_dispatcher_type: alltoall + +It is recommended to add a new ``${MODEL_NAME}_base.yaml`` to add a new +category of model and define new models on top of it. For example, to add +Qwen2.5 models in Primus, we define +`qwen2.5_base.yaml `__ +and build +`qwen2.5_7B.yaml `__ +and +`qwen2.5_72B.yaml `__ +using ``qwen2.5_base.yaml`` as the base config. + +Training parameters +=================== + +ROCm Megatron-LM also defines the training parameters, like batch size, +tensor-parallelism, precision, as so on, in the training scripts. For example, +Llama3 8B model parameters are defined in +`examples/llama/train_llama3.sh `__ +as shown below: + +.. code-block:: bash + + TP="${TP:-8}" + PP="${PP:-1}" + CP="${CP:-1}" + MBS="${MBS:-1}" + BS="${BS:-8}" + +Primus defines the training parameters in top-level YAML files -- see +`examples/megatron/configs/ +`__. +For example, the `llama3.1_8B-pretrain.yaml +`__ +configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override +the default training parameters in ``llama3.1_8B-pretrain.yaml``. + +.. code-block:: yaml + + # model to run + model: llama3.1_8B.yaml # Model architecture yaml + overrides: + # log + # disable_wandb: false + # disable_tensorboard: false + stderr_sink_level: DEBUG + + log_avg_skip_iterations: 2 + log_avg_reset_interval: 50 + + train_iters: 50 + micro_batch_size: 2 + global_batch_size: 128 + + seq_length: 8192 + max_position_embeddings: 8192 + + lr: 1.0e-5 + min_lr: 0.0 + lr_warmup_iters: 2 + lr_decay_iters: null + lr_decay_style: cosine + weight_decay: 0.1 + adam_beta1: 0.9 + adam_beta2: 0.95 + eod_mask_loss: true + init_method_std: 0.008 + norm_epsilon: 1.0e-6 + +Backward compatibility with Megatron-LM +======================================= + +The Dockerized environment used for Primus maintains compatibility with Megatron-LM with +limited support. To roll back to using Megatron-LM, follow these steps. + +.. code-block:: shell + + cd /workspace/Megatron-LM/ + pip uninstall megatron-core + pip install -e . 
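+
+To confirm the rollback took effect, you can check which ``megatron.core``
+installation Python resolves to -- a quick sanity check assuming the
+``/workspace/Megatron-LM`` layout described above:
+
+.. code-block:: shell
+
+   # After the editable install, this should print a path under /workspace/Megatron-LM
+   python -c "import megatron.core, os; print(os.path.dirname(megatron.core.__file__))"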
+ +Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as +usual. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst new file mode 100644 index 000000000..32d72311b --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst @@ -0,0 +1,1041 @@ +:orphan: + +.. meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +****************************************** +Training a model with Megatron-LM for ROCm +****************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm Megatron-LM + training performance documentation. See :doc:`../megatron-lm` for the latest version. + +The `Megatron-LM framework for ROCm `__ is +a specialized fork of the robust Megatron-LM, designed to enable efficient +training of large-scale language models on AMD GPUs. By leveraging AMD +Instinctâ„¢ MI300X series accelerators, Megatron-LM delivers enhanced +scalability, performance, and resource utilization for AI workloads. It is +purpose-built to support models like Llama, DeepSeek, and Mixtral, +enabling developers to train next-generation AI models more +efficiently. + +AMD provides ready-to-use Docker images for MI300X series accelerators containing +essential components, including PyTorch, ROCm libraries, and Megatron-LM +utilities. It contains the following software components to accelerate training +workloads: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml + + {% set dockers = data.dockers %} + {% if dockers|length > 1 %} + .. tab-set:: + + {% for docker in data.dockers %} + .. tab-item:: ``{{ docker.pull_tag }}`` + :sync: {{ docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + + {% endfor %} + {% endfor %} + {% elif dockers|length == 1 %} + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components %} + * - {{ component_name }} + - {{ component_version }} + + {% endfor %} + {% endif %} + + .. _amd-megatron-lm-model-support-v256: + + The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. + + Supported models + ================ + + The following models are supported for training performance benchmarking with Megatron-LM and ROCm. + Some instructions, commands, and training recommendations in this documentation might + vary by model -- select one to get started. + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+         <!-- Model selector: a "Model" button for each model group and a
+              "Model variant" button for each model defined in the YAML data above. -->
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +.. _amd-megatron-lm-performance-measurements-v256: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `__ +page provides reference throughput and latency measurements for training +popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `__ + only reflects the latest version of this training benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-megatron-lm-training-v256: + +Environment setup +================= + +Use the following instructions to set up the environment, configure the script to train models, and +reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker +image. + +.. _amd-megatron-lm-requirements-v256: + +Download the Docker image +------------------------- + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml + + {% set dockers = data.dockers %} + 1. Use the following command to pull the Docker image from Docker Hub. + + {% if dockers|length > 1 %} + .. tab-set:: + + {% for docker in data.dockers %} + .. tab-item:: {{ docker.doc_name }} + :sync: {{ docker.pull_tag }} + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + {% endfor %} + {% elif dockers|length == 1 %} + {% set docker = dockers[0] %} + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + {% endif %} + 2. Launch the Docker container. + + {% if dockers|length > 1 %} + .. tab-set:: + + {% for docker in data.dockers %} + .. tab-item:: {{ docker.doc_name }} + :sync: {{ docker.pull_tag }} + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} + + {% endfor %} + {% elif dockers|length == 1 %} + {% set docker = dockers[0] %} + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} + + {% endif %} + +3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it. + + .. 
code-block:: shell + + docker start megatron_training_env + docker exec -it megatron_training_env bash + +The Docker container includes a pre-installed, verified version of the ROCm +Megatron-LM development branch +``__, including necessary +training scripts. + +.. _amd-megatron-lm-environment-setup-v256: + +Configuration +============= + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b + + Update the ``train_llama3.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + Update the ``train_llama2.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Network interface +----------------- + +Update the network interface in the script to match your system's network interface. To +find your network interface, run the following (outside of any Docker container): + +.. code-block:: bash + + ip a + +Look for an active interface that has an IP address in the same subnet as +your other nodes. Then, update the following variables in the script, for +example: + +.. code-block:: bash + + export NCCL_SOCKET_IFNAME=ens50f0np0 + + export GLOO_SOCKET_IFNAME=ens50f0np0 + +.. _amd-megatron-lm-tokenizer-v256: + +Tokenizer +--------- + +You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples. +If the tokenizer is not found, it'll be downloaded if publicly available. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + If you do not have Llama 3.3 tokenizer locally, you need to use your + personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer. + See `Llama-3.3-70B-Instruct + `_. After you are + authorized, use your ``HF_TOKEN`` to download the tokenizer and set the + variable ``TOKENIZER_MODEL`` to the tokenizer path. + + .. code-block:: shell + + export HF_TOKEN= + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct" + +.. 
container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-8B" + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-70B" + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3" + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite" + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Download the Mixtral tokenizer. + + .. code-block:: shell + + mkdir tokenizer + cd tokenizer + export HF_TOKEN= + wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model + + Use the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL=tokenizer/tokenizer.model + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-7B" + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-72B" + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default + value is ``1`` for enabled. + + .. code-block:: bash + + MOCK_DATA=1 + +* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 + + DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +Download the dataset +^^^^^^^^^^^^^^^^^^^^ + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b pyt_megatron_lm_train_llama-3.1-70b-proxy + + For Llama models, use the `prepare_dataset.sh + `_ script + to prepare your dataset. + To download the dataset, set the ``DATASET`` variable to the dataset you'd + like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and + ``DATASET=bookcorpus``. + + .. 
code-block:: shell + + DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset + DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset + + ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer. + Remember to either pre-download the tokenizer or setup Hugging Face access + otherwise when needed -- see the :ref:`Tokenizer ` section. + + .. note:: + + When training set ``DATA_PATH`` to the specific file name prefix pointing to the ``.bin`` or ``.idx`` + as in the following example: + + .. code-block:: shell + + DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. 
code-block:: shell + + mkdir mixtral-datasets + cd mixtral-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/mixtral-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b pyt_megatron_lm_train_qwen2.5-72b + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. code-block:: shell + + mkdir -p temp/qwen-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/qwen-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +Multi-node configuration +------------------------ + +If you're running multi-node training, update the following environment variables. They can +also be passed as command line arguments. Refer to the following example configurations. + +* Change ``localhost`` to the master node's hostname: + + .. code-block:: shell + + MASTER_ADDR="${MASTER_ADDR:-localhost}" + +* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``): + + .. code-block:: shell + + NNODES="${NNODES:-1}" + +* Set the rank of each node (0 for master, 1 for the first worker node, and so on): + + .. code-block:: shell + + NODE_RANK="${NODE_RANK:-0}" + +* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an + NFS directory) for multi-node runs: + + .. code-block:: shell + + DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs + +* For multi-node runs, make sure the correct network drivers are installed on the nodes. If + inside a Docker container, either install the drivers inside the Docker container or pass the network + drivers from the host while creating the Docker container. + + .. code-block:: shell + + # Specify which RDMA interfaces to use for communication + export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + +.. _amd-megatron-lm-run-training-v256: + +Run training +============ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +MI300X series accelerators with the AMD Megatron-LM environment. + +Single node training +-------------------- + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + To run the training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. 
code-block:: shell + + TOKENIZER_MODEL=meta-llama/Llama-3.3-70B-Instruct \ + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MBS=2 \ + BS=16 \ + TE_FP8=0 \ + TP=1 \ + PP=1 \ + FSDP=1 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + For Llama 3.1 8B BF16, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + To run the training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b-proxy + + To run the training on a single node for Llama 3.1 70B with proxy, use the following command. + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + FSDP=1 \ + TOTAL_ITERS=10 \ + NUM_LAYERS=40 \ + bash examples/llama/train_llama3.sh + + .. note:: + + Use two or more nodes to run the *full* Llama 70B model with FP8 precision. + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b + + To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + + For Llama 2 7B BF16, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-2-70b + + To run the training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. 
code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=7 \ + BS=56 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + export NVTE_FUSED_ATTN_CK=0 + FORCE_BALANCE=true \ + RUN_ENV=cluster \ + MODEL_SIZE=671B \ + TRAIN_ITERS=50 \ + SEQ_LEN=4096 \ + NUM_LAYERS=3 \ + MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \ + PR=bf16 \ + TP=1 PP=1 ETP=1 EP=8 \ + GEMM_TUNING=1 \ + NVTE_CK_USES_BWD_V3=1 \ + USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \ + GPT_LAYER_IN_TE=true \ + bash examples/deepseek_v3/train_deepseekv3.sh + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + export NVTE_FUSED_ATTN_CK=0 + GEMM_TUNING=1 \ + PR=bf16 \ + MBS=4 \ + AC=none \ + SEQ_LEN=4096 \ + PAD_LEN=4096 \ + TRAIN_ITERS=50 \ + bash examples/deepseek_v2/train_deepseekv2.sh + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=0 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=none \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=4096 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x7B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel) with 4-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=4 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=full \ + NUM_LAYERS=4 \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=8192 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x22B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + To run training on a single node for Qwen 2.5 7B BF16, use the following + command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=0 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + + For FP8, use the following command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=1 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + To run the training on a single node for Qwen 2.5 72B BF16, use the following command. 
+ + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + FSDP=1 \ + CP=1 \ + PP=1 \ + MBS=3 \ + BS=24 \ + TE_FP8=0 \ + MODEL_SIZE=72 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-72B \ + RECOMPUTE_ACTIVATIONS=full \ + CKPT_FORMAT=torch_dist + +Multi-node training examples +---------------------------- + +To run training on multiple nodes, launch the Docker container on each node. +For example, for Llama 3 using a two node setup (``NODE0`` as the master node), +use these commands. + +* On the master node ``NODE0``: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=0 \ + bash examples/llama/train_llama3.sh + +* On the worker node ``NODE1``: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=1 \ + bash examples/llama/train_llama3.sh + +Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is +provided in +``__ to +enable training at scale under a SLURM environment. For example, to run +training on 16 nodes, try the following command: + +.. code-block:: shell + + sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh + +.. _amd-megatron-lm-benchmark-test-vars-v256: + +Key options +----------- + +The benchmark tests support the following sets of variables. + +``TEE_OUTPUT`` + ``1`` to enable training logs or ``0`` to disable. + +``TE_FP8`` + ``0`` for B16 or ``1`` for FP8 -- ``0`` by default. + +``GEMM_TUNING`` + ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels. + +``USE_FLASH_ATTN`` + ``1`` to enable Flash Attention. + +``FSDP`` + ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``, + ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled. + +``ENABLE_PROFILING`` + ``1`` to enable PyTorch profiling for performance analysis. + +``transformer-impl`` + ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE. + +``MODEL_SIZE`` + ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2, for example. + +``TOTAL_ITERS`` + The total number of iterations -- ``10`` by default. + +``MOCK_DATA`` + ``1`` to use mock data or ``0`` to use real data you provide. + +``MBS`` + Micro batch size. + +``BS`` + Global batch size. + +``TP`` / ``TP_SIZE`` + Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on. + +``EP`` / ``EP_SIZE`` + Expert parallel for MoE models. + +``SEQ_LENGTH`` + Input sequence length. + +``PR`` + Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs. + +``AC`` + Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default. + +``NUM_LAYERS`` + Use reduced number of layers as a proxy model. + +``RECOMPUTE_NUM_LAYERS`` + Number of layers used for checkpointing recompute. + +Previous versions +================= + +See :doc:`megatron-lm-history` to find documentation for previous releases +of the ``ROCm/megatron-lm`` Docker image. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst new file mode 100644 index 000000000..0a80c7c9b --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst @@ -0,0 +1,602 @@ +.. 
meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +********************************************** +Training a model with Primus and Megatron-Core +********************************************** + +`Primus `__ is a unified and flexible +LLM training framework designed to streamline training. It streamlines LLM +training on AMD Instinct accelerators using a modular, reproducible configuration paradigm. +Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core. + +.. note:: + + Primus with the Megatron-Core backend is intended to replace ROCm + Megatron-LM in this Dockerized training environment. To learn how to migrate + workloads from Megatron-LM to Primus with Megatron-Core, see + :doc:`previous-versions/megatron-lm-primus-migration-guide`. + +For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators +containing essential components for Primus and Megatron-Core. + +.. note:: + + This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with + Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release `. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set dockers = data.dockers %} + {% set docker = dockers[0] %} + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + +.. _amd-primus-megatron-lm-model-support: + +Supported models +================ + +The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. +Some instructions, commands, and training examples in this documentation might +vary by model -- select one to get started. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+         <!-- Model selector: a "Model" button for each model group and a
+              "Model variant" button for each model defined in the YAML data above. -->
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-primus-megatron-lm-training: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set dockers = data.dockers %} + {% set docker = dockers[0] %} + + Environment setup + ================= + + Use the following instructions to set up the environment, configure the script to train models, and + reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image. + + .. _amd-primus-megatron-lm-requirements: + + Download the Docker image + ------------------------- + + 1. Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + 2. Launch the Docker container. + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + --shm-size 128G \ + --name primus_training_env \ + {{ docker.pull_tag }} + +3. Use these commands if you exit the ``primus_training_env`` container and need to return to it. + + .. code-block:: shell + + docker start primus_training_env + docker exec -it primus_training_env bash + +The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus +`__ repository. + +.. _amd-primus-megatron-lm-environment-setup: + +Configuration +============= + +Primus defines a training configuration in YAML for each model in +`examples/megatron/configs `__. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + {% for model_group in model_groups %} + {% for model in model_group.models %} + .. container:: model-doc {{ model.mad_tag }} + + To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``. + Note that training configuration YAML files for other models follow this naming convention. + + {% endfor %} + {% endfor %} + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default + value is ``true`` for enabled. + + .. code-block:: yaml + + mock_data: true + +* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset. + + .. code-block:: bash + + mock_data: false + train_data_path: /path/to/your/dataset + + Ensure that the files are accessible inside the Docker container. + +.. 
_amd-primus-megatron-lm-tokenizer: + +Tokenizer +--------- + +In Primus, each model uses a tokenizer from Hugging Face. For example, Llama +3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and +``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model +`__ +definition. As such, you need to set the ``HF_TOKEN`` environment variable with +right permissions to access the tokenizer for each model. + +.. code-block:: bash + + # Export your HF_TOKEN in the workspace + export HF_TOKEN= + +.. _amd-primus-megatron-lm-run-training: + +Run training +============ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +MI300X series accelerators with the AMD Megatron-LM environment. + +Single node training +-------------------- + +To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command: + +.. code-block:: shell + + pip install -r requirements.txt + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + +Once setup is complete, run the appropriate training command. + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b + + To run pre-training for Llama 3.3 70B BF16, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --micro_batch_size 2 \ + --global_batch_size 16 \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b + + To run pre-training for Llama 3.1 8B FP8, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + + For Llama 3.1 8B BF16, use the following command: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b + + To run pre-training for Llama 3.1 70B BF16, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + + To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --num_layers 40 \ + --fp8 hybrid \ + --no_fp8_weight_transpose_cache true + + .. note:: + + Use two or more nodes to run the *full* Llama 70B model with FP8 precision. + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b + + To run pre-training for Llama 2 7B FP8, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + + To run pre-training for Llama 2 7B BF16, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b + + To run pre-training for Llama 2 70B BF16, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + use the following command: + + .. 
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with a 3-layer proxy,
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
      bash examples/run_pretrain.sh \
      --num_layers 3 \
      --moe_layer_freq 1 \
      --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b

   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
      bash examples/run_pretrain.sh \
      --global_batch_size 256 \
      --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy

   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with a 4-layer proxy,
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
      bash examples/run_pretrain.sh \
      --num_layers 4 \
      --pipeline_model_parallel_size 1 \
      --micro_batch_size 1 \
      --global_batch_size 16 \
      --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

   To run training on a single node for Qwen 2.5 7B BF16, use the following
   command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

   For FP8, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh \
      --train_iters 50 \
      --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To run training on a single node for Qwen 2.5 72B BF16, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

Multi-node training examples
----------------------------

To run training on multiple nodes, use the
`run_slurm_pretrain.sh `__
script to launch the multi-node workload. Use the following steps to set up your environment:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}

   .. code-block:: shell

      cd /workspace/Primus/
      export DOCKER_IMAGE={{ docker.pull_tag }}
      export HF_TOKEN=
      export HSA_NO_SCRATCH_RECLAIM=1
      export NVTE_CK_USES_BWD_V3=1
      export NCCL_IB_HCA= # specify which RDMA interfaces to use for communication
      export NCCL_SOCKET_IFNAME= # your network interface
      export GLOO_SOCKET_IFNAME= # your network interface
      export NCCL_IB_GID_INDEX=3 # InfiniBand GID index for NCCL communication; the default is 3 for RoCE

.. note::

   * Make sure the correct network drivers are installed on the nodes. If you are running inside a Docker container, either install the drivers inside the container or pass them through from the host when creating the container.
   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, because NICs can vary across clusters, it's recommended to explicitly export the NCCL parameters for your cluster.
   * To find your network interface, use ``ip a``.
   * To find RDMA interfaces, use ``ibv_devices`` to list all the RDMA/IB devices.
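
Before exporting the NCCL and Gloo variables above, it can help to confirm what the RDMA devices and
network interfaces are actually called on each node. The following is a minimal sketch using standard
system tools (``ibv_devices``, ``ibv_devinfo``, and ``ip``); the device and interface names in the
example exports are placeholders for your cluster.

.. code-block:: shell

   # Run on every node before launching a multi-node job.

   # List RDMA devices; the names reported here (for example, mlx5_0) are what
   # NCCL_IB_HCA expects.
   ibv_devices

   # Check that the corresponding ports are ACTIVE.
   ibv_devinfo | grep -E "hca_id|state"

   # List network interfaces and addresses; pick the interface on the cluster
   # fabric for NCCL_SOCKET_IFNAME and GLOO_SOCKET_IFNAME.
   ip -brief addr show

   # Example exports (placeholder names; adjust to your cluster):
   export NCCL_IB_HCA=mlx5_0,mlx5_1
   export NCCL_SOCKET_IFNAME=ens51np0
   export GLOO_SOCKET_IFNAME=ens51np0
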
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   To train Llama 3.3 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 4 \
      --global_batch_size 256 \
      --recompute_num_layers 80 \
      --no_fp8_weight_transpose_cache true \
      --fp8 hybrid

   To train Llama 3.3 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 1 \
      --global_batch_size 256 \
      --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   To train Llama 3.1 8B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters. For example, global_batch_size is 8 * the single-node batch size for 8 nodes in this case.
      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
      --global_batch_size 1024 \
      --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   To train Llama 3.1 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 4 \
      --global_batch_size 256 \
      --recompute_num_layers 80 \
      --no_fp8_weight_transpose_cache true \
      --fp8 hybrid

   To train Llama 3.1 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 1 \
      --global_batch_size 256 \
      --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   To train Llama 2 7B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters. For example, global_batch_size is 8 * the single-node batch size for 8 nodes in this case.
      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   To train Llama 2 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 10 \
      --global_batch_size 640 \
      --recompute_num_layers 80 \
      --no_fp8_weight_transpose_cache true \
      --fp8 hybrid

   To train Llama 2 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
      --micro_batch_size 2 \
      --global_batch_size 1536 \
      --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To train Mixtral 8x7B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 2 \
      --global_batch_size 256

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To train Qwen 2.5 72B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 8 \
      --global_batch_size 512 \
      --recompute_num_layers 80 \
      --no_fp8_weight_transpose_cache true \
      --fp8 hybrid
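
A common adjustment when moving from one node to several, as noted in the comments above, is to scale
the global batch size with the node count. The following is a minimal sketch of that bookkeeping; the
single-node batch size of 128 is a placeholder rather than a tuned value, although with 8 nodes it
reproduces the ``--global_batch_size 1024`` used in the Llama 3.1 8B example above.

.. code-block:: shell

   # Hypothetical wrapper: scale the global batch size linearly with the number
   # of nodes before launching run_slurm_pretrain.sh.
   NNODES=8
   SINGLE_NODE_GBS=128   # placeholder single-node global batch size
   GLOBAL_BATCH_SIZE=$((NNODES * SINGLE_NODE_GBS))

   NNODES="${NNODES}" EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
     bash ./examples/run_slurm_pretrain.sh \
     --global_batch_size "${GLOBAL_BATCH_SIZE}" \
     --fp8 hybrid
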
.. _amd-primus-megatron-lm-benchmark-test-vars:

Key options
-----------

The following are key options to take note of:

fp8
   ``hybrid`` enables FP8 GEMMs.

use_torch_fsdp2
   ``use_torch_fsdp2: 1`` enables PyTorch FSDP v2. If FSDP is enabled,
   set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.

profile
   To enable PyTorch profiling, set these parameters:

   .. code-block:: yaml

      profile: true
      use_pytorch_profiler: true
      profile_step_end: 7
      profile_step_start: 6

train_iters
   The total number of training iterations (default: 50).

mock_data
   ``true`` by default.

micro_batch_size
   Micro batch size.

global_batch_size
   Global batch size.

recompute_granularity
   For activation checkpointing.

num_layers
   For using a reduced number of layers, as with proxy models.

Previous versions
=================

See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.

This training environment now uses Primus with Megatron as the primary
configuration. Limited support for the legacy ROCm Megatron-LM is still
available. For instructions on using ROCm Megatron-LM, see the
:doc:`megatron-lm` document.
diff --git a/docs/how-to/rocm-for-ai/training/index.rst b/docs/how-to/rocm-for-ai/training/index.rst
index 13213c2e9..7f2ce1d97 100644
--- a/docs/how-to/rocm-for-ai/training/index.rst
+++ b/docs/how-to/rocm-for-ai/training/index.rst
@@ -21,6 +21,8 @@ In this guide, you'll learn about:
 
 - Training a model
 
+  - :doc:`With Primus (Megatron-LM backend) `
+
   - :doc:`With Megatron-LM `
 
   - :doc:`With PyTorch `
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 8560f0c68..db786f0c4 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -44,8 +44,8 @@ subtrees:
         title: Training
         subtrees:
           - entries:
-            - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-              title: Train a model with Megatron-LM
+            - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+              title: Train a model with Primus and Megatron-Core
             - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
               title: Train a model with PyTorch
             - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext