diff --git a/.wordlist.txt b/.wordlist.txt index 1d332031a..42fbdd16e 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -713,11 +713,13 @@ linearized linter linux llvm +lm localscratch logits lossy macOS matchers +megatron microarchitecture migraphx migratable @@ -789,6 +791,7 @@ quantile quantizer quasirandom queueing +qwen radeon rccl rdc diff --git a/docs/conf.py b/docs/conf.py index dfd48d80c..db73355ed 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -105,10 +105,22 @@ article_pages = [ {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]}, - {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]}, - {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]}, - {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]}, @@ -120,7 +132,16 @@ article_pages = [ {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6", "os": 
["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml index ab1996911..77eaa5ba0 100644 --- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml @@ -1,29 +1,60 @@ -megatron-lm_benchmark: - model_groups: - - group: Meta Llama - tag: llama - models: +dockers: + - pull_tag: rocm/megatron-lm:v25.6_py312 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0 + components: + ROCm: 6.4.1 + PyTorch: 2.8.0a0+git7d205b2 + Python: 3.12 + Transformer Engine: 2.1.0.dev0+8c4a512 + hipBLASLt: 393e413 + Triton: 3.3.0 + RCCL: 2.23.4.7a84c5d + doc_name: Ubuntu 24.04 + Python 3.12 + - pull_tag: rocm/megatron-lm:v25.6_py310 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6 + components: + ROCm: 6.4.1 + PyTorch: 2.8.0a0+git7d205b2 + Python: "3.10" + Transformer Engine: 2.1.0.dev0+8c4a512 + hipBLASLt: 393e413 + Triton: 3.3.0 + RCCL: 2.23.4.7a84c5d + doc_name: Ubuntu 22.04 + Python 3.10 +model_groups: + - group: Meta Llama + tag: llama + models: - model: Llama 3.3 70B mad_tag: pyt_megatron_lm_train_llama-3.3-70b - model: Llama 3.1 8B mad_tag: pyt_megatron_lm_train_llama-3.1-8b - model: Llama 3.1 70B mad_tag: pyt_megatron_lm_train_llama-3.1-70b + - model: Llama 3.1 70B (proxy) + mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy - model: Llama 2 7B mad_tag: pyt_megatron_lm_train_llama-2-7b - model: Llama 2 70B mad_tag: pyt_megatron_lm_train_llama-2-70b - - group: DeepSeek - tag: deepseek - models: - - model: DeepSeek-V3 + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy - model: DeepSeek-V2-Lite mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b - - group: Mistral AI - tag: mistral - models: + - group: Mistral AI + tag: mistral + models: - model: Mixtral 8x7B mad_tag: pyt_megatron_lm_train_mixtral-8x7b - - model: Mixtral 8x22B + - model: Mixtral 8x22B (proxy) mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: pyt_megatron_lm_train_qwen2.5-7b + - model: Qwen 2.5 72B + mad_tag: pyt_megatron_lm_train_qwen2.5-72b diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml 
b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml new file mode 100644 index 000000000..ab1996911 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml @@ -0,0 +1,29 @@ +megatron-lm_benchmark: + model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: pyt_megatron_lm_train_llama-3.3-70b + - model: Llama 3.1 8B + mad_tag: pyt_megatron_lm_train_llama-3.1-8b + - model: Llama 3.1 70B + mad_tag: pyt_megatron_lm_train_llama-3.1-70b + - model: Llama 2 7B + mad_tag: pyt_megatron_lm_train_llama-2-7b + - model: Llama 2 70B + mad_tag: pyt_megatron_lm_train_llama-2-70b + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 + mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy + - model: DeepSeek-V2-Lite + mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: pyt_megatron_lm_train_mixtral-8x7b + - model: Mixtral 8x22B + mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst index 42df5e895..b26cc522a 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst @@ -17,6 +17,7 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub 1 %} + .. tab-set:: + + {% for docker in data.dockers %} + .. tab-item:: ``{{ docker.pull_tag }}`` + :sync: {{ docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + + {% endfor %} + {% endfor %} + {% elif dockers|length == 1 %} + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components %} + * - {{ component_name }} + - {{ component_version }} + + {% endfor %} + {% endif %} + + .. _amd-megatron-lm-model-support: + + The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. + Supported models ================ @@ -73,8 +67,7 @@ The following models are pre-optimized for performance on AMD Instinct MI300X se Some instructions, commands, and training recommendations in this documentation might vary by model -- select one to get started. - {% set model_groups = data["megatron-lm_benchmark"].model_groups %} - + {% set model_groups = data.model_groups %} .. raw:: html
@@ -82,7 +75,7 @@ The following models are pre-optimized for performance on AMD Instinct MI300X se
Model
{% for model_group in model_groups %} -
{{ model_group.group }}
+
{{ model_group.group }}
{% endfor %}
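As an illustrative aside (not part of this change set), the component tables rendered from ``data.dockers`` earlier in this file's changes can be spot-checked against a pulled image. This sketch assumes the image exposes ROCm under ``/opt/rocm`` and ships ``python3`` with the listed packages, which is conventional for these containers but not stated in the diff.

.. code-block:: shell

   # Spot-check that a pulled tag reports the versions listed in the components table.
   docker run --rm rocm/megatron-lm:v25.6_py312 bash -c '
     cat /opt/rocm/.info/version                              # ROCm release, e.g. 6.4.1-xxxxx
     python3 -c "import torch; print(torch.__version__)"      # PyTorch build
     python3 -c "import triton; print(triton.__version__)"    # Triton
   '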
@@ -155,42 +148,77 @@ image. Download the Docker image ------------------------- -1. Use the following command to pull the Docker image from Docker Hub. +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml - .. tab-set:: + {% set dockers = data.dockers %} + 1. Use the following command to pull the Docker image from Docker Hub. - .. tab-item:: Ubuntu 24.04 + Python 3.12 - :sync: py312 + {% if dockers|length > 1 %} + .. tab-set:: - .. code-block:: shell + {% for docker in data.dockers %} + .. tab-item:: {{ docker.doc_name }} + :sync: {{ docker.pull_tag }} - docker pull rocm/megatron-lm:v25.5_py312 + .. code-block:: shell - .. tab-item:: Ubuntu 22.04 + Python 3.10 - :sync: py310 + docker pull {{ docker.pull_tag }} - .. code-block:: shell + {% endfor %} + {% elif dockers|length == 1 %} + {% set docker = dockers[0] %} + .. code-block:: shell - docker pull rocm/megatron-lm:v25.5_py310 + docker pull {{ docker.pull_tag }} -2. Launch the Docker container. + {% endif %} + 2. Launch the Docker container. - .. tab-set:: + {% if dockers|length > 1 %} + .. tab-set:: - .. tab-item:: Ubuntu 24.04 + Python 3.12 - :sync: py312 + {% for docker in data.dockers %} + .. tab-item:: {{ docker.doc_name }} + :sync: {{ docker.pull_tag }} - .. code-block:: shell + .. code-block:: shell - docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py312 + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} + {% endfor %} + {% elif dockers|length == 1 %} + {% set docker = dockers[0] %} + .. code-block:: shell - .. tab-item:: Ubuntu 22.04 + Python 3.10 - :sync: py310 + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} - .. code-block:: shell - - docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py310 + {% endif %} 3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it. @@ -348,6 +376,22 @@ If the tokenizer is not found, it'll be downloaded if publicly available. TOKENIZER_MODEL=tokenizer/tokenizer.model +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-7B" + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. 
code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-72B" + Dataset options --------------- @@ -373,7 +417,7 @@ You can use either mock data or real data for training. Download the dataset ^^^^^^^^^^^^^^^^^^^^ -.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b pyt_megatron_lm_train_llama-3.1-70b-proxy For Llama models, use the `prepare_dataset.sh `_ script @@ -412,8 +456,8 @@ Download the dataset wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json - wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin - wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. @@ -437,8 +481,8 @@ Download the dataset wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json - wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin - wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. @@ -448,8 +492,6 @@ Download the dataset DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored - Ensure that the files are accessible inside the Docker container. - .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy If you don't already have the dataset, download the Mixtral dataset using the following @@ -472,6 +514,27 @@ Download the dataset Ensure that the files are accessible inside the Docker container. +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b pyt_megatron_lm_train_qwen2.5-72b + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. 
code-block:: shell + + mkdir -p temp/qwen-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/qwen-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + Multi-node configuration ------------------------ @@ -512,27 +575,17 @@ also be passed as command line arguments. Refer to the following example configu # Specify which RDMA interfaces to use for communication export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 -Getting started -=============== - -The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate -system performance, conduct training benchmarks, and achieve superior -performance for models like Llama, DeepSeek, and Mixtral. This container should not be -expected to provide generalized performance across all training workloads. You -can expect the container to perform in the model configurations described in -the following section, but other configurations are not validated by AMD. - .. _amd-megatron-lm-run-training: Run training ------------- +============ Use the following example commands to set up the environment, configure :ref:`key options `, and run training on MI300X series accelerators with the AMD Megatron-LM environment. Single node training -^^^^^^^^^^^^^^^^^^^^ +-------------------- .. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b @@ -541,7 +594,20 @@ Single node training .. code-block:: shell - TEE_OUTPUT=1 RECOMPUTE=1 SEQ_LENGTH=8192 MBS=2 BS=16 TE_FP8=0 TP=1 PP=1 FSDP=1 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh + TOKENIZER_MODEL=meta-llama/Llama-3.3-70B-Instruct \ + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MBS=2 \ + BS=16 \ + TE_FP8=0 \ + TP=1 \ + PP=1 \ + FSDP=1 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh .. note:: @@ -550,8 +616,6 @@ Single node training parallelism, MCore's distributed optimizer, gradient accumulation fusion, or FP16. - Currently, FSDP is only compatible with BF16 precision. - .. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the @@ -559,13 +623,29 @@ Single node training .. code-block:: shell - TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh For Llama 3.1 8B BF16, use the following command: .. code-block:: shell - TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh .. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b @@ -574,7 +654,18 @@ Single node training .. 
code-block:: shell - TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh .. note:: @@ -583,7 +674,36 @@ Single node training parallelism, MCore's distributed optimizer, gradient accumulation fusion, or FP16. - Currently, FSDP is only compatible with BF16 precision. +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b-proxy + + To run the training on a single node for Llama 3.1 70B with proxy, use the following command. + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + FSDP=1 \ + TOTAL_ITERS=10 \ + NUM_LAYERS=40 \ + bash examples/llama/train_llama3.sh + + .. note:: + + Use two or more nodes to run the *full* Llama 70B model with FP8 precision. + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. .. container:: model-doc pyt_megatron_lm_train_llama-2-7b @@ -592,13 +712,29 @@ Single node training .. code-block:: shell - TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh For Llama 2 7B BF16, use the following command: .. code-block:: shell - TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh .. container:: model-doc pyt_megatron_lm_train_llama-2-70b @@ -607,7 +743,18 @@ Single node training .. code-block:: shell - TEE_OUTPUT=1 MBS=7 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=7 \ + BS=56 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh .. note:: @@ -616,8 +763,6 @@ Single node training parallelism, MCore's distributed optimizer, gradient accumulation fusion, or FP16. - Currently, FSDP is only compatible with BF16 precision. - .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, @@ -625,7 +770,8 @@ Single node training .. code-block:: shell - FORCE_BANLANCE=true \ + export NVTE_FUSED_ATTN_CK=0 + FORCE_BALANCE=true \ RUN_ENV=cluster \ MODEL_SIZE=671B \ TRAIN_ITERS=50 \ @@ -647,7 +793,15 @@ Single node training .. code-block:: shell - GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh + export NVTE_FUSED_ATTN_CK=0 + GEMM_TUNING=1 \ + PR=bf16 \ + MBS=4 \ + AC=none \ + SEQ_LEN=4096 \ + PAD_LEN=4096 \ + TRAIN_ITERS=50 \ + bash examples/deepseek_v2/train_deepseekv2.sh .. 
container:: model-doc pyt_megatron_lm_train_mixtral-8x7b @@ -656,7 +810,24 @@ Single node training .. code-block:: shell - RECOMPUTE_NUM_LAYERS=0 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=none PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=4096 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x7B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=0 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=none \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=4096 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x7B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh .. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy @@ -665,10 +836,85 @@ Single node training .. code-block:: shell - RECOMPUTE_NUM_LAYERS=4 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=full NUM_LAYERS=4 PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=8192 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x22B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=4 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=full \ + NUM_LAYERS=4 \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=8192 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x22B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh -Multi-node training -^^^^^^^^^^^^^^^^^^^ +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + To run training on a single node for Qwen 2.5 7B BF16, use the following + command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=0 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + + For FP8, use the following command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=1 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + To run the training on a single node for Qwen 2.5 72B BF16, use the following command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + FSDP=1 \ + CP=1 \ + PP=1 \ + MBS=3 \ + BS=24 \ + TE_FP8=0 \ + MODEL_SIZE=72 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-72B \ + RECOMPUTE_ACTIVATIONS=full \ + CKPT_FORMAT=torch_dist + +Multi-node training examples +---------------------------- To run training on multiple nodes, launch the Docker container on each node. For example, for Llama 3 using a two node setup (``NODE0`` as the master node), @@ -678,13 +924,33 @@ use these commands. .. code-block:: shell - TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=0 \ + bash examples/llama/train_llama3.sh * On the worker node ``NODE1``: .. 
code-block:: shell - TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=1 \ + bash examples/llama/train_llama3.sh Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is provided in diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst index 95af83a27..b67d1ac3a 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst @@ -12,23 +12,23 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub ` * `Docker Hub `__ * - 25.4 - - 6.3.0 - - 0.4.31 + - + * ROCm 6.3.0 + * JAX 0.4.31 - * :doc:`Documentation ` * `Docker Hub `__ diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst index 291d32157..9dd1c8f2c 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst @@ -11,37 +11,49 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub ` - * `Docker Hub `__ + * `Docker Hub (py312) `__ + * `Docker Hub (py310) `__ + + * - v25.5 + - + * ROCm 6.3.4 + * PyTorch 2.8.0a0+gite2f9759 + - + * :doc:`Documentation ` + * `Docker Hub (py312) `__ + * `Docker Hub (py310) `__ * - v25.4 - - 6.3.0 - - 2.7.0a0+git637433 + - + * ROCm 6.3.0 + * PyTorch 2.7.0a0+git637433 - * :doc:`Documentation ` * `Docker Hub `__ * - v25.3 - - 6.3.0 - - 2.7.0a0+git637433 + - + * ROCm 6.3.0 + * PyTorch 2.7.0a0+git637433 - * :doc:`Documentation ` * `Docker Hub `__ * - v24.12-dev - - 6.1.0 - - 2.4.0 + - + * ROCm 6.1.0 + * PyTorch 2.4.0 - * :doc:`Documentation ` * `Docker Hub `__ diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5.rst new file mode 100644 index 000000000..f2a81b7c6 --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5.rst @@ -0,0 +1,775 @@ +:orphan: + +.. meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +****************************************** +Training a model with Megatron-LM for ROCm +****************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm Megatron-LM + training performance documentation. See :doc:`../megatron-lm` for the latest version. + +The `Megatron-LM framework for ROCm `_ is +a specialized fork of the robust Megatron-LM, designed to enable efficient +training of large-scale language models on AMD GPUs. By leveraging AMD +Instinctâ„¢ MI300X series accelerators, Megatron-LM delivers enhanced +scalability, performance, and resource utilization for AI workloads. 
It is +purpose-built to support models like Llama, DeepSeek, and Mixtral, +enabling developers to train next-generation AI models more +efficiently. + +AMD provides a ready-to-use Docker image for MI300X series accelerators containing +essential components, including PyTorch, ROCm libraries, and Megatron-LM +utilities. It contains the following software components to accelerate training +workloads: + ++--------------------------+--------------------------------+ +| Software component | Version | ++==========================+================================+ +| ROCm | 6.3.4 | ++--------------------------+--------------------------------+ +| PyTorch | 2.8.0a0+gite2f9759 | ++--------------------------+--------------------------------+ +| Python | 3.12 or 3.10 | ++--------------------------+--------------------------------+ +| Transformer Engine | 1.13.0+bb061ade | ++--------------------------+--------------------------------+ +| Flash Attention | 3.0.0 | ++--------------------------+--------------------------------+ +| hipBLASLt | 0.13.0-4f18bf6 | ++--------------------------+--------------------------------+ +| Triton | 3.3.0 | ++--------------------------+--------------------------------+ +| RCCL | 2.22.3 | ++--------------------------+--------------------------------+ + +Megatron-LM provides the following key features to train large language models efficiently: + +- Transformer Engine (TE) + +- APEX + +- GEMM tuning + +- Torch.compile + +- 3D parallelism: TP + SP + CP + +- Distributed optimizer + +- Flash Attention (FA) 3 + +- Fused kernels + +- Pre-training + +.. _amd-megatron-lm-model-support-v255: + +The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml + + Supported models + ================ + + The following models are supported for training performance benchmarking with Megatron-LM and ROCm. + Some instructions, commands, and training recommendations in this documentation might + vary by model -- select one to get started. + + {% set model_groups = data["megatron-lm_benchmark"].model_groups %} + + .. raw:: html + +
+
+
Model
+
+ {% for model_group in model_groups %} +
{{ model_group.group }}
+ {% endfor %} +
+
+ +
+
Model variant
+
+ {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + {% if models|length % 3 == 0 %} +
{{ model.model }}
+ {% else %} +
{{ model.model }}
+ {% endif %} + {% endfor %} + {% endfor %} +
+
+
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +.. _amd-megatron-lm-performance-measurements-v255: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `__ +page provides reference throughput and latency measurements for training +popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `__ + only reflects the latest version of this training benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-megatron-lm-training-v255: + +Environment setup +================= + +Use the following instructions to set up the environment, configure the script to train models, and +reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker +image. + +.. _amd-megatron-lm-requirements-v255: + +Download the Docker image +------------------------- + +1. Use the following command to pull the Docker image from Docker Hub. + + .. tab-set:: + + .. tab-item:: Ubuntu 24.04 + Python 3.12 + :sync: py312 + + .. code-block:: shell + + docker pull rocm/megatron-lm:v25.5_py312 + + .. tab-item:: Ubuntu 22.04 + Python 3.10 + :sync: py310 + + .. code-block:: shell + + docker pull rocm/megatron-lm:v25.5_py310 + +2. Launch the Docker container. + + .. tab-set:: + + .. tab-item:: Ubuntu 24.04 + Python 3.12 + :sync: py312 + + .. code-block:: shell + + docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py312 + + + .. tab-item:: Ubuntu 22.04 + Python 3.10 + :sync: py310 + + .. code-block:: shell + + docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py310 + +3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it. + + .. code-block:: shell + + docker start megatron_training_env + docker exec -it megatron_training_env bash + +The Docker container includes a pre-installed, verified version of the ROCm +Megatron-LM development branch +``__, including necessary +training scripts. + +.. _amd-megatron-lm-environment-setup-v255: + +Configuration +============= + +.. 
container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b + + Update the ``train_llama3.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + Update the ``train_llama2.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Network interface +----------------- + +Update the network interface in the script to match your system's network interface. To +find your network interface, run the following (outside of any Docker container): + +.. code-block:: bash + + ip a + +Look for an active interface that has an IP address in the same subnet as +your other nodes. Then, update the following variables in the script, for +example: + +.. code-block:: bash + + export NCCL_SOCKET_IFNAME=ens50f0np0 + + export GLOO_SOCKET_IFNAME=ens50f0np0 + +.. _amd-megatron-lm-tokenizer-v255: + +Tokenizer +--------- + +You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples. +If the tokenizer is not found, it'll be downloaded if publicly available. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + If you do not have Llama 3.3 tokenizer locally, you need to use your + personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer. + See `Llama-3.3-70B-Instruct + `_. After you are + authorized, use your ``HF_TOKEN`` to download the tokenizer and set the + variable ``TOKENIZER_MODEL`` to the tokenizer path. + + .. code-block:: shell + + export HF_TOKEN= + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct" + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-8B" + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + The training script uses the ``HuggingFaceTokenizer``. 
Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-70B" + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3" + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite" + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Download the Mixtral tokenizer. + + .. code-block:: shell + + mkdir tokenizer + cd tokenizer + export HF_TOKEN= + wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model + + Use the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL=tokenizer/tokenizer.model + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default + value is ``1`` for enabled. + + .. code-block:: bash + + MOCK_DATA=1 + +* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 + + DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +Download the dataset +^^^^^^^^^^^^^^^^^^^^ + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + For Llama models, use the `prepare_dataset.sh + `_ script + to prepare your dataset. + To download the dataset, set the ``DATASET`` variable to the dataset you'd + like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and + ``DATASET=bookcorpus``. + + .. code-block:: shell + + DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset + DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset + + ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer. + Remember to either pre-download the tokenizer or setup Hugging Face access + otherwise when needed -- see the :ref:`Tokenizer ` section. + + .. note:: + + When training set ``DATA_PATH`` to the specific file name prefix pointing to the ``.bin`` or ``.idx`` + as in the following example: + + .. code-block:: shell + + DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. 
code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. code-block:: shell + + mkdir mixtral-datasets + cd mixtral-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/mixtral-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +Multi-node configuration +------------------------ + +If you're running multi-node training, update the following environment variables. They can +also be passed as command line arguments. 
Refer to the following example configurations. + +* Change ``localhost`` to the master node's hostname: + + .. code-block:: shell + + MASTER_ADDR="${MASTER_ADDR:-localhost}" + +* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``): + + .. code-block:: shell + + NNODES="${NNODES:-1}" + +* Set the rank of each node (0 for master, 1 for the first worker node, and so on): + + .. code-block:: shell + + NODE_RANK="${NODE_RANK:-0}" + +* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an + NFS directory) for multi-node runs: + + .. code-block:: shell + + DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs + +* For multi-node runs, make sure the correct network drivers are installed on the nodes. If + inside a Docker container, either install the drivers inside the Docker container or pass the network + drivers from the host while creating the Docker container. + + .. code-block:: shell + + # Specify which RDMA interfaces to use for communication + export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + +Getting started +=============== + +The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate +system performance, conduct training benchmarks, and achieve superior +performance for models like Llama, DeepSeek, and Mixtral. This container should not be +expected to provide generalized performance across all training workloads. You +can expect the container to perform in the model configurations described in +the following section, but other configurations are not validated by AMD. + +.. _amd-megatron-lm-run-training-v255: + +Run training +------------ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +MI300X series accelerators with the AMD Megatron-LM environment. + +Single node training +^^^^^^^^^^^^^^^^^^^^ + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + To run the training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 RECOMPUTE=1 SEQ_LENGTH=8192 MBS=2 BS=16 TE_FP8=0 TP=1 PP=1 FSDP=1 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + + Currently, FSDP is only compatible with BF16 precision. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. code-block:: shell + + TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh + + For Llama 3.1 8B BF16, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + To run the training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. 
code-block:: shell + + TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + + Currently, FSDP is only compatible with BF16 precision. + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b + + To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. code-block:: shell + + TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh + + For Llama 2 7B BF16, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-2-70b + + To run the training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 MBS=7 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + + Currently, FSDP is only compatible with BF16 precision. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + FORCE_BANLANCE=true \ + RUN_ENV=cluster \ + MODEL_SIZE=671B \ + TRAIN_ITERS=50 \ + SEQ_LEN=4096 \ + NUM_LAYERS=3 \ + MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \ + PR=bf16 \ + TP=1 PP=1 ETP=1 EP=8 \ + GEMM_TUNING=1 \ + NVTE_CK_USES_BWD_V3=1 \ + USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \ + GPT_LAYER_IN_TE=true \ + bash examples/deepseek_v3/train_deepseekv3.sh + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + RECOMPUTE_NUM_LAYERS=0 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=none PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=4096 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x7B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel) with 4-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. 
code-block:: shell + + RECOMPUTE_NUM_LAYERS=4 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=full NUM_LAYERS=4 PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=8192 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x22B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh + +Multi-node training +^^^^^^^^^^^^^^^^^^^ + +To run training on multiple nodes, launch the Docker container on each node. +For example, for Llama 3 using a two node setup (``NODE0`` as the master node), +use these commands. + +* On the master node ``NODE0``: + + .. code-block:: shell + + TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh + +* On the worker node ``NODE1``: + + .. code-block:: shell + + TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh + +Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is +provided in +``__ to +enable training at scale under a SLURM environment. For example, to run +training on 16 nodes, try the following command: + +.. code-block:: shell + + sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh + +.. _amd-megatron-lm-benchmark-test-vars-v255: + +Key options +----------- + +The benchmark tests support the following sets of variables. + +``TEE_OUTPUT`` + ``1`` to enable training logs or ``0`` to disable. + +``TE_FP8`` + ``0`` for B16 or ``1`` for FP8 -- ``0`` by default. + +``GEMM_TUNING`` + ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels. + +``USE_FLASH_ATTN`` + ``1`` to enable Flash Attention. + +``FSDP`` + ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``, + ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled. + +``ENABLE_PROFILING`` + ``1`` to enable PyTorch profiling for performance analysis. + +``transformer-impl`` + ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE. + +``MODEL_SIZE`` + ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2, for example. + +``TOTAL_ITERS`` + The total number of iterations -- ``10`` by default. + +``MOCK_DATA`` + ``1`` to use mock data or ``0`` to use real data you provide. + +``MBS`` + Micro batch size. + +``BS`` + Global batch size. + +``TP`` / ``TP_SIZE`` + Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on. + +``EP`` / ``EP_SIZE`` + Expert parallel for MoE models. + +``SEQ_LENGTH`` + Input sequence length. + +``PR`` + Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs. + +``AC`` + Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default. + +``NUM_LAYERS`` + Use reduced number of layers as a proxy model. + +``RECOMPUTE_NUM_LAYERS`` + Number of layers used for checkpointing recompute. + +Previous versions +================= + +See :doc:`megatron-lm-history` to find documentation for previous releases +of the ``ROCm/megatron-lm`` Docker image. 
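Several dataset sections above end with a reminder to ensure the downloaded files are accessible inside the Docker container. As a minimal, illustrative check (not taken from the diff), you can list the dataset directory from inside the running container; the container name comes from the launch step, while the path and file names below follow the Qwen example and should be adjusted to match your ``DATA_DIR``.

.. code-block:: shell

   # Confirm the dataset referenced by DATA_DIR is visible inside the container.
   docker exec megatron_training_env ls -lh /qwen-datasets
   # Expect the tokenized pair used for training, for example:
   #   wudao_qwenbpe_text_document.bin
   #   wudao_qwenbpe_text_document.idx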
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst index afeecaf01..1535f1d43 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst @@ -11,37 +11,39 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub < .. list-table:: :header-rows: 1 - :stub-columns: 1 * - Image version - - ROCm version - - PyTorch version + - Components - Resources * - v25.6 - - 6.3.4 - - 2.8.0a0+git7d205b2 + - + * ROCm 6.3.4 + * PyTorch 2.8.0a0+git7d205b2 - * :doc:`Documentation <../pytorch-training>` * `Docker Hub `__ * - v25.5 - - 6.3.4 - - 2.7.0a0+git637433 + - + * ROCm 6.3.4 + * PyTorch 2.7.0a0+git637433 - * :doc:`Documentation ` * `Docker Hub `__ * - v25.4 - - 6.3.0 - - 2.7.0a0+git637433 + - + * ROCm 6.3.0 + * PyTorch 2.7.0a0+git637433 - * :doc:`Documentation ` * `Docker Hub `__ * - v25.3 - - 6.3.0 - - 2.7.0a0+git637433 + - + * ROCm 6.3.0 + * PyTorch 2.7.0a0+git637433 - * :doc:`Documentation ` * `Docker Hub `__
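The version-history tables above pair each previous image tag with its Docker Hub entry. As a hedged convenience sketch (not part of the changed pages), a specific historical tag can be pulled and its digest compared against the corresponding Docker Hub listing.

.. code-block:: shell

   # Pull a previous tag listed in the history table and show its digest
   # for cross-checking against the Docker Hub entry.
   docker pull rocm/megatron-lm:v25.5_py312
   docker images --digests rocm/megatron-lm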