Mirror of https://github.com/ROCm/ROCm.git, synced 2026-01-09 22:58:17 -05:00
docs: Add Primus (Megatron) training Docker documentation (#5218)
Spelling wordlist:

@@ -116,6 +116,7 @@ Deprecations
 DevCap
 DirectX
 Dockerfile
+Dockerized
 Doxygen
 dropless
 ELMo
@@ -361,6 +362,7 @@ PowerEdge
 PowerShell
 Pretrained
 Pretraining
+Primus
 Profiler's
 PyPi
 Pytest
@@ -525,6 +527,7 @@ Xilinx
 Xnack
 Xteam
 YAML
+YAMLs
 YML
 YModel
 ZeRO
@@ -585,6 +588,7 @@ completers
 composable
 concretization
 config
+configs
 conformant
 constructible
 convolutional
@@ -795,7 +799,9 @@ preprocessing
 preprocessor
 prequantized
 prerequisites
+pretrain
 pretraining
+primus
 profiler
 profilers
 protobuf
/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml:

@@ -1,26 +1,15 @@
 dockers:
-  - pull_tag: rocm/megatron-lm:v25.6_py312
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
     components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
-      Python: 3.12
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
-      Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 24.04 + Python 3.12
-  - pull_tag: rocm/megatron-lm:v25.6_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
-    components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
       Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
       Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 22.04 + Python 3.10
+      RCCL: 2.22.3
 model_groups:
   - group: Meta Llama
     tag: llama
New data file for the archived v25.6 documentation (@@ -0,0 +1,60 @@):

dockers:
  - pull_tag: rocm/megatron-lm:v25.6_py312
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
    components:
      ROCm: 6.4.1
      PyTorch: 2.8.0a0+git7d205b2
      Python: 3.12
      Transformer Engine: 2.1.0.dev0+8c4a512
      hipBLASLt: 393e413
      Triton: 3.3.0
      RCCL: 2.23.4.7a84c5d
    doc_name: Ubuntu 24.04 + Python 3.12
  - pull_tag: rocm/megatron-lm:v25.6_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
    components:
      ROCm: 6.4.1
      PyTorch: 2.8.0a0+git7d205b2
      Python: "3.10"
      Transformer Engine: 2.1.0.dev0+8c4a512
      hipBLASLt: 393e413
      Triton: 3.3.0
      RCCL: 2.23.4.7a84c5d
    doc_name: Ubuntu 22.04 + Python 3.10
model_groups:
  - group: Meta Llama
    tag: llama
    models:
      - model: Llama 3.3 70B
        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
      - model: Llama 3.1 8B
        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
      - model: Llama 3.1 70B
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
      - model: Llama 3.1 70B (proxy)
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
      - model: Llama 2 7B
        mad_tag: pyt_megatron_lm_train_llama-2-7b
      - model: Llama 2 70B
        mad_tag: pyt_megatron_lm_train_llama-2-70b
  - group: DeepSeek
    tag: deepseek
    models:
      - model: DeepSeek-V3 (proxy)
        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
      - model: DeepSeek-V2-Lite
        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
  - group: Mistral AI
    tag: mistral
    models:
      - model: Mixtral 8x7B
        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
      - model: Mixtral 8x22B (proxy)
        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
  - group: Qwen
    tag: qwen
    models:
      - model: Qwen 2.5 7B
        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
      - model: Qwen 2.5 72B
        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
New file, /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml (@@ -0,0 +1,58 @@):

dockers:
  - pull_tag: rocm/megatron-lm:v25.7_py310
    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
    components:
      ROCm: 6.4.2
      Primus: v0.1.0-rc1
      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
      Transformer Engine: 2.1.0.dev0+ba586519
      hipBLASLt: 37ba1d36
      Triton: 3.3.0
      RCCL: 2.22.3
model_groups:
  - group: Meta Llama
    tag: llama
    models:
      - model: Llama 3.3 70B
        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
        config_name: llama3.3_70B-pretrain.yaml
      - model: Llama 3.1 70B
        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
        config_name: llama3.1_70B-pretrain.yaml
      - model: Llama 3.1 8B
        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
        config_name: llama3.1_8B-pretrain.yaml
      - model: Llama 2 7B
        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
        config_name: llama2_7B-pretrain.yaml
      - model: Llama 2 70B
        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
        config_name: llama2_70B-pretrain.yaml
  - group: DeepSeek
    tag: deepseek
    models:
      - model: DeepSeek-V3 (proxy)
        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
        config_name: deepseek_v3-pretrain.yaml
      - model: DeepSeek-V2-Lite
        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
        config_name: deepseek_v2_lite-pretrain.yaml
  - group: Mistral AI
    tag: mistral
    models:
      - model: Mixtral 8x7B
        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
        config_name: mixtral_8x7B_v0.1-pretrain.yaml
      - model: Mixtral 8x22B (proxy)
        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
        config_name: mixtral_8x22B_v0.1-pretrain.yaml
  - group: Qwen
    tag: qwen
    models:
      - model: Qwen 2.5 7B
        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
        config_name: primus_qwen2.5_7B-pretrain.yaml
      - model: Qwen 2.5 72B
        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
        config_name: qwen2.5_72B-pretrain.yaml
how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst:

@@ -1,3 +1,5 @@
+:orphan:
+
 .. meta::
    :description: How to train a model using Megatron-LM for ROCm.
    :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
@@ -6,6 +8,14 @@
 Training a model with Megatron-LM for ROCm
 ******************************************
+
+.. caution::
+
+   The ROCm Megatron-LM framework now has limited support in this Docker
+   environment, which focuses on Primus with Megatron-Core. See :doc:`primus-megatron`.
+
+   To learn how to migrate your existing workloads to Primus with Megatron-Core,
+   see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
 
 The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
 a specialized fork of the robust Megatron-LM, designed to enable efficient
 training of large-scale language models on AMD GPUs. By leveraging AMD
@@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM
 utilities. It contains the following software components to accelerate training
 workloads:
 
+.. note::
+
+   This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
+   Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
+
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
 
    {% set dockers = data.dockers %}
-   {% if dockers|length > 1 %}
    .. tab-set::
 
-      {% for docker in data.dockers %}
+      {% for docker in dockers %}
       .. tab-item:: ``{{ docker.pull_tag }}``
          :sync: {{ docker.pull_tag }}
 
@@ -42,28 +56,14 @@ workloads:
 
          {% endfor %}
       {% endfor %}
-   {% elif dockers|length == 1 %}
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      {% for component_name, component_version in docker.components %}
-      * - {{ component_name }}
-        - {{ component_version }}
-
-      {% endfor %}
-   {% endif %}
 
 .. _amd-megatron-lm-model-support:
 
-The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
-
 Supported models
 ================
 
-The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
+The following models are supported for training performance benchmarking with Megatron-LM and ROCm
+on AMD Instinct MI300X series accelerators.
 Some instructions, commands, and training recommendations in this documentation might
 vary by model -- select one to get started.
 
@@ -177,7 +177,7 @@ Download the Docker image
    {% if dockers|length > 1 %}
    .. tab-set::
 
-      {% for docker in data.dockers %}
+      {% for docker in dockers %}
       .. tab-item:: {{ docker.doc_name }}
          :sync: {{ docker.pull_tag }}
 
@@ -227,10 +227,17 @@ Download the Docker image
       docker start megatron_training_env
       docker exec -it megatron_training_env bash
 
-The Docker container includes a pre-installed, verified version of the ROCm
-Megatron-LM development branch
-`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
-training scripts.
+4. **Megatron-LM backward compatibility setup** -- this Docker is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support.
+   To roll back to using Megatron-LM, follow these steps:
+
+   .. code-block:: shell
+
+      cd /workspace/Megatron-LM/
+      pip uninstall megatron-core
+      pip install -e .
+
+The Docker container hosts
+`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__ at verified commit ``e8e9edc``.
 
 .. _amd-megatron-lm-environment-setup:
 
|||||||
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - v25.6 (latest)
|
* - v25.7 (latest)
|
||||||
|
-
|
||||||
|
* ROCm
|
||||||
|
* PyTorch
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../megatron-lm>`
|
||||||
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__
|
||||||
|
|
||||||
|
* - v25.6
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* PyTorch 2.8.0a0+git7d205b2
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../megatron-lm>`
|
* :doc:`Documentation <megatron-lm-v25.6>`
|
||||||
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
||||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
||||||
|
|
||||||
|
|||||||
New file, how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst (@@ -0,0 +1,175 @@):

:orphan:

**********************************************************************
Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
**********************************************************************

Primus supports Megatron-Core as a backend optimization library,
replacing ROCm Megatron-LM. This document outlines the steps to migrate
workloads from ROCm Megatron-LM to Primus with the Megatron-Core backend.

Model architecture
==================

ROCm Megatron-LM defines model architecture parameters in the training scripts;
for example, the Llama 3 8B model parameters are defined in
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh#L117>`__
as shown below:

.. code-block:: bash

   HIDDEN_SIZE=4096
   FFN_HIDDEN_SIZE=14336
   NUM_LAYERS=32
   NUM_HEADS=32
   NUM_KV_HEADS=8

Primus defines the model architecture through model YAML configuration files
in the ``primus/configs/models/megatron/`` directory of the repository. For example, the Llama 3 8B
model architecture parameters are defined in
`primus/configs/models/megatron/llama3_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3_8B.yaml>`__
as shown below:

.. code-block:: yaml

   bases:
     - llama3_base.yaml

   tokenizer_type: Llama3Tokenizer
   tokenizer_model: meta-llama/Llama-3.1-8B

   ffn_hidden_size: 14336
   hidden_size: 4096
   num_attention_heads: 32
   num_layers: 32
   num_query_groups: 8

Primus model configuration files follow a hierarchical design, meaning that new model
config YAMLs can inherit from existing model config files by importing them as
bases. For example,
`llama3.1_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
uses ``llama3_8B.yaml`` as a base config and overrides a few parameters, as shown below.
In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value:

.. code-block:: yaml

   bases:
     - llama3_8B.yaml

   tokenizer_type: Llama3Tokenizer
   tokenizer_model: meta-llama/Llama-3.1-8B

   max_position_embeddings: 131072

.. tip::

   Primus provides ``llama_base.yaml`` as a base configuration, which can be
   used as a base for additional model architectures. For example,
   `mixtral_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/mixtral_base.yaml>`__
   and
   `deepseek_v3_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/deepseek_v3_base.yaml>`__
   define ``llama_base.yaml`` as their base.

   .. code-block:: yaml

      # Example mixtral_base.yaml:

      bases:
        - llama_base.yaml

      init_method_std: 0.01
      rotary_base: 1000000
      qk_layernorm: false

      group_query_attention: true
      num_query_groups: 8

      # moe parameters
      num_experts: 8
      moe_router_topk: 2
      moe_router_load_balancing_type: aux_loss
      moe_aux_loss_coeff: 1e-2
      moe_grouped_gemm: true
      moe_token_dispatcher_type: alltoall

To add a new category of model, it is recommended to create a new
``${MODEL_NAME}_base.yaml`` and define new models on top of it. For example, to add
Qwen2.5 models in Primus, we define
`qwen2.5_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_base.yaml>`__
and build
`qwen2.5_7B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_7B.yaml>`__
and
`qwen2.5_72B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_72B.yaml>`__
using ``qwen2.5_base.yaml`` as the base config.
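To make the inheritance concrete, the following is a minimal, illustrative Python
sketch of how a model YAML with ``bases`` can be resolved. This is not Primus's
actual loader, which may differ in merge order and validation; the sketch assumes
PyYAML and a local checkout of the Primus repository:

.. code-block:: python

   # Illustrative only -- resolve a model YAML by shallow-merging its bases.
   import yaml
   from pathlib import Path

   CONFIG_DIR = Path("primus/configs/models/megatron")

   def load_model_config(name: str) -> dict:
       """Load a model YAML, applying files listed under `bases` first."""
       raw = yaml.safe_load((CONFIG_DIR / name).read_text())
       merged: dict = {}
       for base in raw.pop("bases", []):
           merged.update(load_model_config(base))  # bases apply first
       merged.update(raw)                          # the child overrides its bases
       return merged

   # llama3.1_8B.yaml inherits llama3_8B.yaml but overrides
   # max_position_embeddings with 131072.
   cfg = load_model_config("llama3.1_8B.yaml")
   print(cfg["max_position_embeddings"])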
Training parameters
===================

ROCm Megatron-LM also defines the training parameters, like batch size,
tensor parallelism, precision, and so on, in the training scripts. For example,
Llama 3 8B training parameters are defined in
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh>`__
as shown below:

.. code-block:: bash

   TP="${TP:-8}"
   PP="${PP:-1}"
   CP="${CP:-1}"
   MBS="${MBS:-1}"
   BS="${BS:-8}"

Primus defines the training parameters in top-level YAML files -- see
`examples/megatron/configs/ <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
For example, the `llama3.1_8B-pretrain.yaml
<https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override
the default training parameters in ``llama3.1_8B-pretrain.yaml``.

.. code-block:: yaml

   # model to run
   model: llama3.1_8B.yaml # Model architecture yaml

   overrides:
     # log
     # disable_wandb: false
     # disable_tensorboard: false
     stderr_sink_level: DEBUG

     log_avg_skip_iterations: 2
     log_avg_reset_interval: 50

     train_iters: 50
     micro_batch_size: 2
     global_batch_size: 128

     seq_length: 8192
     max_position_embeddings: 8192

     lr: 1.0e-5
     min_lr: 0.0
     lr_warmup_iters: 2
     lr_decay_iters: null
     lr_decay_style: cosine
     weight_decay: 0.1
     adam_beta1: 0.9
     adam_beta2: 0.95
     eod_mask_loss: true
     init_method_std: 0.008
     norm_epsilon: 1.0e-6
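These defaults can also be overridden on the command line when launching a run
with ``examples/run_pretrain.sh``, using flags that mirror the YAML keys. For
example:

.. code-block:: shell

   # Override micro_batch_size and train_iters without editing the YAML.
   EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
   bash ./examples/run_pretrain.sh \
     --micro_batch_size 2 \
     --train_iters 50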
Backward compatibility with Megatron-LM
=======================================

The Dockerized environment used for Primus maintains compatibility with Megatron-LM with
limited support. To roll back to using Megatron-LM, follow these steps:

.. code-block:: shell

   cd /workspace/Megatron-LM/
   pip uninstall megatron-core
   pip install -e .

Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as
usual.
File diff suppressed because it is too large.
New file, how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst (@@ -0,0 +1,602 @@):

.. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch

**********************************************
Training a model with Primus and Megatron-Core
**********************************************

`Primus <https://github.com/AMD-AIG-AIMA/Primus>`__ is a unified and flexible
LLM training framework that streamlines training on AMD Instinct accelerators
using a modular, reproducible configuration paradigm. Primus is
backend-agnostic and supports multiple training engines -- including Megatron-Core.

.. note::

   Primus with the Megatron-Core backend is intended to replace ROCm
   Megatron-LM in this Dockerized training environment. To learn how to migrate
   workloads from Megatron-LM to Primus with Megatron-Core, see
   :doc:`previous-versions/megatron-lm-primus-migration-guide`.

For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
containing essential components for Primus and Megatron-Core.

.. note::

   This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
   Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}
   .. list-table::
      :header-rows: 1

      * - Software component
        - Version

      {% for component_name, component_version in docker.components.items() %}
      * - {{ component_name }}
        - {{ component_version }}

      {% endfor %}

.. _amd-primus-megatron-lm-model-support:

Supported models
================

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
Some instructions, commands, and training examples in this documentation might
vary by model -- select one to get started.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set model_groups = data.model_groups %}
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            {% endfor %}
          </div>
        </div>

        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model variant</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            {% set models = model_group.models %}
            {% for model in models %}
            {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
            {% endfor %}
            {% endfor %}
          </div>
        </div>
      </div>

.. note::

   Some models, such as Llama, require an external license agreement through
   a third party (for example, Meta).

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
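For example, a quick spot check of NUMA auto-balancing, one of the settings
covered in the system validation guide, might look like the following sketch
(assuming a Linux host and root access):

.. code-block:: shell

   # 0 means NUMA auto-balancing is disabled, which is typically
   # recommended for GPU training workloads.
   cat /proc/sys/kernel/numa_balancing

   # Disable it if it is enabled (requires root):
   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'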
.. _mi300x-amd-primus-megatron-lm-training:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}

   Environment setup
   =================

   Use the following instructions to set up the environment, configure the script to train models, and
   reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.

   .. _amd-primus-megatron-lm-requirements:

   Download the Docker image
   -------------------------

   1. Use the following command to pull the Docker image from Docker Hub.

      .. code-block:: shell

         docker pull {{ docker.pull_tag }}

   2. Launch the Docker container.

      .. code-block:: shell

         docker run -it \
           --device /dev/dri \
           --device /dev/kfd \
           --device /dev/infiniband \
           --network host --ipc host \
           --group-add video \
           --cap-add SYS_PTRACE \
           --security-opt seccomp=unconfined \
           --privileged \
           -v $HOME:$HOME \
           --shm-size 128G \
           --name primus_training_env \
           {{ docker.pull_tag }}

   3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.

      .. code-block:: shell

         docker start primus_training_env
         docker exec -it primus_training_env bash

   The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus
   <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.

.. _amd-primus-megatron-lm-environment-setup:

Configuration
=============

Primus defines a training configuration in YAML for each model in
`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set model_groups = data.model_groups %}
   {% for model_group in model_groups %}
   {% for model in model_group.models %}
   .. container:: model-doc {{ model.mad_tag }}

      To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
      Note that training configuration YAML files for other models follow this naming convention.

   {% endfor %}
   {% endfor %}

.. note::

   See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.

Dataset options
---------------

You can use either mock data or real data for training.

* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
  value is ``true`` (enabled).

  .. code-block:: yaml

     mock_data: true

* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.

  .. code-block:: yaml

     mock_data: false
     train_data_path: /path/to/your/dataset

  Ensure that the files are accessible inside the Docker container, as shown in the example after this list.
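For example, one way to make a host-side dataset visible inside the container is
to add another bind mount when launching it. This is an illustrative sketch; the
paths and image tag below are placeholders:

.. code-block:: shell

   # Illustrative only -- mount the host dataset directory into the container
   # at the same path that train_data_path references.
   docker run -it \
     -v /path/to/your/dataset:/path/to/your/dataset \
     --name primus_training_env \
     <training_docker_image>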
.. _amd-primus-megatron-lm-tokenizer:

Tokenizer
---------

In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
``tokenizer_type: Llama3Tokenizer``, defined in the `llama3.1-8B model
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
definition. As such, you need to set the ``HF_TOKEN`` environment variable with
the right permissions to access the tokenizer for each model.

.. code-block:: bash

   # Export your HF_TOKEN in the workspace
   export HF_TOKEN=<your_hftoken>
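To confirm the token can actually fetch a gated tokenizer before starting a long
run, a quick check such as the following can help. This is illustrative and
assumes the ``transformers`` package is available in the container:

.. code-block:: shell

   # Fails fast if HF_TOKEN cannot access the gated tokenizer.
   python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B')" && echo "Tokenizer access OK"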
.. _amd-primus-megatron-lm-run-training:

Run training
============

Use the following example commands to set up the environment, configure
:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
MI300X series accelerators in the Primus training environment.

Single node training
--------------------

To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup commands:

.. code-block:: shell

   pip install -r requirements.txt
   export HSA_NO_SCRATCH_RECLAIM=1
   export NVTE_CK_USES_BWD_V3=1

Once setup is complete, run the appropriate training command.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   To run pre-training for Llama 3.3 70B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 16 \
        --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   To run pre-training for Llama 3.1 8B FP8, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --train_iters 50 \
        --fp8 hybrid

   For Llama 3.1 8B BF16, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   To run pre-training for Llama 3.1 70B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --train_iters 50

   To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --train_iters 50 \
        --num_layers 40 \
        --fp8 hybrid \
        --no_fp8_weight_transpose_cache true

   .. note::

      Use two or more nodes to run the *full* Llama 70B model with FP8 precision.

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   To run pre-training for Llama 2 7B FP8, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
      bash ./examples/run_pretrain.sh \
        --train_iters 50 \
        --fp8 hybrid

   To run pre-training for Llama 2 7B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
      bash ./examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   To run pre-training for Llama 2 70B BF16, run:

   .. code-block:: shell

      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --num_layers 3 \
        --moe_layer_freq 1 \
        --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b

   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --global_batch_size 256 \
        --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy

   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --num_layers 4 \
        --pipeline_model_parallel_size 1 \
        --micro_batch_size 1 \
        --global_batch_size 16 \
        --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

   To run training on a single node for Qwen 2.5 7B BF16, use the following
   command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

   For FP8, use the following command.

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --train_iters 50 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

Multi-node training examples
----------------------------

To run training on multiple nodes, you can use the
`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
script to launch the multi-node workload. Use the following steps to set up your environment:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}

   .. code-block:: shell

      cd /workspace/Primus/
      export DOCKER_IMAGE={{ docker.pull_tag }}
      export HF_TOKEN=<your_HF_token>
      export HSA_NO_SCRATCH_RECLAIM=1
      export NVTE_CK_USES_BWD_V3=1
      export NCCL_IB_HCA=<your_NCCL_IB_HCA>                # specify which RDMA interfaces to use for communication
      export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME>  # your network interface
      export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME>  # your network interface
      export NCCL_IB_GID_INDEX=3                           # InfiniBand GID index for NCCL communication; the default is 3 for RoCE

.. note::

   * Make sure the correct network drivers are installed on the nodes. If running inside Docker, either install the drivers inside the Docker container or pass the network drivers from the host when creating the container.
   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across clusters, it is encouraged to explicitly export the NCCL parameters for your cluster, as in the example after this note.
   * To find your network interface, you can use ``ip a``.
   * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
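For illustration, a filled-in set of exports might look like the following. The
interface names are hypothetical placeholders; substitute the devices reported
by ``ibv_devices`` and ``ip a`` on your cluster:

.. code-block:: shell

   # Hypothetical device names -- replace with your cluster's actual interfaces.
   export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3
   export NCCL_SOCKET_IFNAME=ens50f0np0
   export GLOO_SOCKET_IFNAME=ens50f0np0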
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   To train Llama 3.3 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.3 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   To train Llama 3.1 8B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters, e.g., global_batch_size = 8 * single-node batch size for 8 nodes in this case.
      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --global_batch_size 1024 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   To train Llama 3.1 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.1 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   To train Llama 2 7B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters, e.g., global_batch_size = 8 * single-node batch size for 8 nodes in this case.
      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   To train Llama 2 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 10 \
        --global_batch_size 640 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 2 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 1536 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To train Mixtral 8x7B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 256

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To train Qwen2.5 72B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 8 \
        --global_batch_size 512 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

.. _amd-primus-megatron-lm-benchmark-test-vars:

Key options
-----------

The following are key options to take note of:

fp8
   ``hybrid`` enables FP8 GEMMs.

use_torch_fsdp2
   ``use_torch_fsdp2: 1`` enables PyTorch FSDP v2. If FSDP is enabled,
   set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.
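   For example, an illustrative configuration fragment with the three
   settings shown together:

   .. code-block:: yaml

      use_torch_fsdp2: 1
      use_distributed_optimizer: false
      overlap_param_gather: false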
profile
   To enable PyTorch profiling, set these parameters:

   .. code-block:: yaml

      profile: true
      use_pytorch_profiler: true
      profile_step_start: 6
      profile_step_end: 7

train_iters
   The total number of training iterations (default: 50).

mock_data
   ``true`` by default.

micro_batch_size
   The batch size processed by each GPU in a single forward/backward pass.

global_batch_size
   The effective total batch size across all GPUs per optimizer step.

recompute_granularity
   Controls activation checkpointing.

num_layers
   Runs a reduced number of layers, as with the proxy models above.

Previous versions
=================

See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.

This training environment now uses Primus with Megatron-Core as the primary
configuration. Limited support for the legacy ROCm Megatron-LM is still
available. For instructions on using ROCm Megatron-LM, see the
:doc:`megatron-lm` document.
how-to/rocm-for-ai/training/index.rst:

@@ -21,6 +21,8 @@ In this guide, you'll learn about:
 
 - Training a model
 
+  - :doc:`With Primus (Megatron-LM backend) <benchmark-docker/primus-megatron>`
+
   - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
 
   - :doc:`With PyTorch <benchmark-docker/pytorch-training>`
Sphinx table of contents:

@@ -44,8 +44,8 @@ subtrees:
           title: Training
           subtrees:
             - entries:
-              - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-                title: Train a model with Megatron-LM
+              - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+                title: Train a model with Primus and Megatron-Core
               - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
                 title: Train a model with PyTorch
               - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst