diff --git a/.wordlist.txt b/.wordlist.txt index 09236fa95..4eb5df599 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -918,6 +918,7 @@ toolchain toolchains toolset toolsets +torchtitan torchvision tqdm tracebacks diff --git a/docs/conf.py b/docs/conf.py index 6f3979312..6e7fa5e61 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -124,11 +124,15 @@ article_pages = [ {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]}, + {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]}, {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]}, diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml new file mode 100644 index 000000000..df0a198d5 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml @@ -0,0 +1,120 @@ +unified_docker: + latest: + pull_tag: rocm/pytorch-training:v25.6 + docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags + rocm_version: 6.4.1 + pytorch_version: 2.8.0a0+git7d205b2 + python_version: 3.10.17 + transformer_engine_version: 1.14.0+2f85f5f2 + flash_attention_version: 3.0.0.post1 + hipblaslt_version: 0.15.0-8c6919d + triton_version: 3.3.0 +model_groups: + - group: Pre-training + tag: pre-training + models: + - model: Llama 3.1 8B + mad_tag: pyt_train_llama-3.1-8b + model_repo: Llama-3.1-8B + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: BF16 + training_modes: [pretrain] + - model: Llama 3.1 70B + mad_tag: pyt_train_llama-3.1-70b + model_repo: Llama-3.1-70B + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct + precision: BF16 + training_modes: [pretrain] + - model: FLUX.1-dev + mad_tag: pyt_train_flux + model_repo: Flux + url: https://huggingface.co/black-forest-labs/FLUX.1-dev + precision: BF16 + training_modes: [pretrain] + - group: Fine-tuning + tag: fine-tuning + models: + - model: Llama 4 Scout 17B-16E + mad_tag: pyt_train_llama-4-scout-17b-16e + 
model_repo: Llama-4-17B_16E + url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.3 70B + mad_tag: pyt_train_llama-3.3-70b + model_repo: Llama-3.3-70B + url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct + precision: BF16 + training_modes: [finetune_fw, finetune_lora, finetune_qlora] + - model: Llama 3.2 1B + mad_tag: pyt_train_llama-3.2-1b + model_repo: Llama-3.2-1B + url: https://huggingface.co/meta-llama/Llama-3.2-1B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.2 3B + mad_tag: pyt_train_llama-3.2-3b + model_repo: Llama-3.2-3B + url: https://huggingface.co/meta-llama/Llama-3.2-3B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.2 Vision 11B + mad_tag: pyt_train_llama-3.2-vision-11b + model_repo: Llama-3.2-Vision-11B + url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision + precision: BF16 + training_modes: [finetune_fw] + - model: Llama 3.2 Vision 90B + mad_tag: pyt_train_llama-3.2-vision-90b + model_repo: Llama-3.2-Vision-90B + url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision + precision: BF16 + training_modes: [finetune_fw] + - model: Llama 3.1 8B + mad_tag: pyt_train_llama-3.1-8b + model_repo: Llama-3.1-8B + url: https://huggingface.co/meta-llama/Llama-3.1-8B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3.1 70B + mad_tag: pyt_train_llama-3.1-70b + model_repo: Llama-3.1-70B + url: https://huggingface.co/meta-llama/Llama-3.1-70B + precision: BF16 + training_modes: [finetune_fw, finetune_lora, finetune_qlora] + - model: Llama 3.1 405B + mad_tag: pyt_train_llama-3.1-405b + model_repo: Llama-3.1-405B + url: https://huggingface.co/meta-llama/Llama-3.1-405B + precision: BF16 + training_modes: [finetune_qlora, HF_finetune_lora] + - model: Llama 3 8B + mad_tag: pyt_train_llama-3-8b + model_repo: Llama-3-8B + url: https://huggingface.co/meta-llama/Meta-Llama-3-8B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 3 70B + mad_tag: pyt_train_llama-3-70b + model_repo: Llama-3-70B + url: https://huggingface.co/meta-llama/Meta-Llama-3-70B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 2 7B + mad_tag: pyt_train_llama-2-7b + model_repo: Llama-2-7B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_fw, finetune_lora, finetune_qlora] + - model: Llama 2 13B + mad_tag: pyt_train_llama-2-13b + model_repo: Llama-2-13B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Llama 2 70B + mad_tag: pyt_train_llama-2-70b + model_repo: Llama-2-70B + url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 + precision: BF16 + training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora] diff --git a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml index df0a198d5..dc19843be 100644 --- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml @@ -1,38 +1,17 @@ -unified_docker: - latest: - pull_tag: rocm/pytorch-training:v25.6 - docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags - rocm_version: 6.4.1 - pytorch_version: 
2.8.0a0+git7d205b2 - python_version: 3.10.17 - transformer_engine_version: 1.14.0+2f85f5f2 - flash_attention_version: 3.0.0.post1 - hipblaslt_version: 0.15.0-8c6919d - triton_version: 3.3.0 +dockers: + - pull_tag: rocm/pytorch-training:v25.7 + docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712 + components: + ROCm: 6.4.2 + PyTorch: 2.8.0a0+gitd06a406 + Python: 3.10.18 + Transformer Engine: 2.2.0.dev0+94e53dd8 + Flash Attention: 3.0.0.post1 + hipBLASLt: 1.1.0-4b9a52edfc + Triton: 3.3.0 model_groups: - - group: Pre-training - tag: pre-training - models: - - model: Llama 3.1 8B - mad_tag: pyt_train_llama-3.1-8b - model_repo: Llama-3.1-8B - url: https://huggingface.co/meta-llama/Llama-3.1-8B - precision: BF16 - training_modes: [pretrain] - - model: Llama 3.1 70B - mad_tag: pyt_train_llama-3.1-70b - model_repo: Llama-3.1-70B - url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct - precision: BF16 - training_modes: [pretrain] - - model: FLUX.1-dev - mad_tag: pyt_train_flux - model_repo: Flux - url: https://huggingface.co/black-forest-labs/FLUX.1-dev - precision: BF16 - training_modes: [pretrain] - - group: Fine-tuning - tag: fine-tuning + - group: Meta Llama + tag: llama models: - model: Llama 4 Scout 17B-16E mad_tag: pyt_train_llama-4-scout-17b-16e @@ -75,19 +54,19 @@ model_groups: model_repo: Llama-3.1-8B url: https://huggingface.co/meta-llama/Llama-3.1-8B precision: BF16 - training_modes: [finetune_fw, finetune_lora] + training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain] - model: Llama 3.1 70B mad_tag: pyt_train_llama-3.1-70b model_repo: Llama-3.1-70B - url: https://huggingface.co/meta-llama/Llama-3.1-70B + url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct precision: BF16 - training_modes: [finetune_fw, finetune_lora, finetune_qlora] + training_modes: [pretrain, finetune_fw, finetune_lora] - model: Llama 3.1 405B mad_tag: pyt_train_llama-3.1-405b model_repo: Llama-3.1-405B url: https://huggingface.co/meta-llama/Llama-3.1-405B precision: BF16 - training_modes: [finetune_qlora, HF_finetune_lora] + training_modes: [finetune_qlora] - model: Llama 3 8B mad_tag: pyt_train_llama-3-8b model_repo: Llama-3-8B @@ -117,4 +96,67 @@ model_groups: model_repo: Llama-2-70B url: https://github.com/meta-llama/llama-models/tree/main/models/llama2 precision: BF16 - training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora] + training_modes: [finetune_lora, finetune_qlora] + - group: OpenAI + tag: openai + models: + - model: GPT OSS 20B + mad_tag: pyt_train_gpt_oss_20b + model_repo: GPT-OSS-20B + url: https://huggingface.co/openai/gpt-oss-20b + precision: BF16 + training_modes: [HF_finetune_lora] + - model: GPT OSS 120B + mad_tag: pyt_train_gpt_oss_120b + model_repo: GPT-OSS-120B + url: https://huggingface.co/openai/gpt-oss-120b + precision: BF16 + training_modes: [HF_finetune_lora] + - group: Qwen + tag: qwen + models: + - model: Qwen 3 8B + mad_tag: pyt_train_qwen3-8b + model_repo: Qwen3-8B + url: https://huggingface.co/Qwen/Qwen3-8B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Qwen 3 32B + mad_tag: pyt_train_qwen3-32b + model_repo: Qwen3-32 + url: https://huggingface.co/Qwen/Qwen3-32B + precision: BF16 + training_modes: [finetune_lora] + - model: Qwen 2.5 32B + mad_tag: pyt_train_qwen2.5-32b + model_repo: Qwen2.5-32B + url: https://huggingface.co/Qwen/Qwen2.5-32B + precision: BF16 + training_modes: [finetune_lora] + - model: Qwen 2.5 72B 
+ mad_tag: pyt_train_qwen2.5-72b + model_repo: Qwen2.5-72B + url: https://huggingface.co/Qwen/Qwen2.5-72B + precision: BF16 + training_modes: [finetune_lora] + - model: Qwen 2 1.5B + mad_tag: pyt_train_qwen2-1.5b + model_repo: Qwen2-1.5B + url: https://huggingface.co/Qwen/Qwen2-1.5B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - model: Qwen 2 7B + mad_tag: pyt_train_qwen2-7b + model_repo: Qwen2-7B + url: https://huggingface.co/Qwen/Qwen2-7B + precision: BF16 + training_modes: [finetune_fw, finetune_lora] + - group: Flux + tag: flux + models: + - model: FLUX.1-dev + mad_tag: pyt_train_flux + model_repo: Flux + url: https://huggingface.co/black-forest-labs/FLUX.1-dev + precision: BF16 + training_modes: [pretrain] diff --git a/docs/how-to/deep-learning-rocm.rst b/docs/how-to/deep-learning-rocm.rst index 16dad363c..fb1d55a3c 100644 --- a/docs/how-to/deep-learning-rocm.rst +++ b/docs/how-to/deep-learning-rocm.rst @@ -23,93 +23,92 @@ The table below summarizes information about ROCm-enabled deep learning framewor - Installation options - GitHub - * - `PyTorch `_ + * - `PyTorch `__ - .. raw:: html - + - - - `Docker image `_ - - `Wheels package `_ - - `ROCm Base Docker image `_ - - `Upstream Docker file `_ + - `Docker image `__ + - `Wheels package `__ + - `ROCm Base Docker image `__ + - `Upstream Docker file `__ - .. raw:: html - + - - * - `TensorFlow `_ + + * - `TensorFlow `__ - .. raw:: html - + - - - `Docker image `_ - - `Wheels package `_ + - `Docker image `__ + - `Wheels package `__ - .. raw:: html - + - * - `JAX `_ + * - `JAX `__ - .. raw:: html - + - - - `Docker image `_ + - `Docker image `__ - .. raw:: html - + - - * - `verl `_ + + * - `verl `__ - .. raw:: html - + - - - `Docker image `_ + - `Docker image `__ - .. raw:: html - + - * - `Stanford Megatron-LM `_ + * - `Stanford Megatron-LM `__ - .. raw:: html - + - - - `Docker image `_ + - `Docker image `__ - .. raw:: html - + - - * - `DGL `_ + + * - `DGL `__ - .. raw:: html - + - - - `Docker image `_ + - `Docker image `__ - .. raw:: html - + - * - `Megablocks `_ + * - `Megablocks `__ - .. raw:: html - + - - - `Docker image `_ + - `Docker image `__ - .. raw:: html - + - - * - `Taichi `_ + + * - `Taichi `__ - .. raw:: html - + - - - `Docker image `_ - - `Wheels package `_ + - `Docker image `__ + - `Wheels package `__ - .. raw:: html - - + Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization through the following guides. @@ -124,10 +123,3 @@ through the following guides. * :doc:`Use ROCm for AI inference optimization ` - - - - - - - diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst index 9e0f4443a..34df0359d 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst @@ -46,7 +46,7 @@ vLLM inference performance testing - {{ unified_docker.hipblaslt_version }} With this Docker image, you can quickly test the :ref:`expected -inference performance numbers ` for +inference performance numbers ` for MI300X series accelerators. What's new @@ -219,7 +219,7 @@ system's configuration. ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``. 
- Although the :ref:`available models ` are preconfigured + Although the :ref:`available models ` are preconfigured to collect latency and throughput performance data, you can also change the benchmarking parameters. See the standalone benchmarking tab for more information. diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst index 02c992620..9f3bd608d 100644 --- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst +++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst @@ -39,7 +39,7 @@ vLLM inference performance testing - {{ unified_docker.hipblaslt_version }} With this Docker image, you can quickly test the :ref:`expected -inference performance numbers ` for +inference performance numbers ` for MI300X series accelerators. What's new @@ -208,7 +208,7 @@ system's configuration. and ``{{ model.mad_tag }}_serving.csv``. Although the :ref:`available models - ` are preconfigured to collect + ` are preconfigured to collect offline throughput and online serving performance data, you can also change the benchmarking parameters. See the standalone benchmarking tab for more information. diff --git a/docs/how-to/rocm-for-ai/install.rst b/docs/how-to/rocm-for-ai/install.rst index 6847d06b4..cb949cb31 100644 --- a/docs/how-to/rocm-for-ai/install.rst +++ b/docs/how-to/rocm-for-ai/install.rst @@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L `. If you’re using a Radeon GPU for graphics-accelerated applications, refer to the -`Radeon installation instructions `_. +`Radeon installation instructions `_. -You can install ROCm on :ref:`compatible systems ` via your Linux +You can install ROCm on :doc:`compatible systems ` via your Linux distribution's package manager. See the following documentation resources to get started: * :doc:`ROCm installation overview ` diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst index a9d99378e..c18b1dfea 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst @@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI -workloads. It is purpose-built to :ref:`support models ` +workloads. It is purpose-built to :ref:`support models ` like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater efficiency. See the GitHub repository at ``__. @@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e - Pre-training -.. _amd-megatron-lm-model-support: +.. _amd-megatron-lm-model-support-24-12: The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator. 
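The anchor renames in the hunks above (for example, ``amd-megatron-lm-model-support`` to ``amd-megatron-lm-model-support-24-12``) are needed because Sphinx reference labels must be unique across the whole documentation set; once several archived versions of a page coexist, each page needs its own suffix. As a quick sanity check for remaining collisions, a sketch like the following can be run from the repository root (it assumes anchors are written as ``.. _label:`` at the start of a line and that GNU ``grep`` is available):

.. code-block:: shell

   # Print any anchor defined on more than one line under docs/; duplicates
   # make :ref: targets ambiguous and trigger Sphinx duplicate-label warnings.
   grep -rhoP '^\.\. _\K[\w.-]+(?=:)' docs/ | sort | uniq -d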
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst index 3a2f23322..e039aff8a 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst @@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e - Pre-training -.. _amd-megatron-lm-model-support: +.. _amd-megatron-lm-model-support-25-3: The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator. @@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t .. tab-item:: Llama :sync: llama - To train any of the Llama 2 models that :ref:`this Docker image supports `, use the ``Llama2Tokenizer``. + To train any of the Llama 2 models that :ref:`this Docker image supports `, use the ``Llama2Tokenizer``. To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``. Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable. @@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t .. tab-item:: DeepSeek V2 :sync: deepseek - To train any of the DeepSeek V2 models that :ref:`this Docker image supports `, use the ``DeepSeekV2Tokenizer``. + To train any of the DeepSeek V2 models that :ref:`this Docker image supports `, use the ``DeepSeekV2Tokenizer``. Multi-node training ^^^^^^^^^^^^^^^^^^^ diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst index 76e5eb716..9d7c7ecd6 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst @@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e - Pre-training -.. _amd-megatron-lm-model-support: +.. _amd-megatron-lm-model-support-25-4: The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. @@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``. .. tab-item:: Llama :sync: llama - To train any of the Llama 2 models that :ref:`this Docker image supports `, use the ``Llama2Tokenizer`` + To train any of the Llama 2 models that :ref:`this Docker image supports `, use the ``Llama2Tokenizer`` or the default ``HuggingFaceTokenizer``. To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``. @@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``. .. tab-item:: DeepSeek V2 :sync: deepseek - To train any of the DeepSeek V2 models that :ref:`this Docker image supports `, use the ``DeepSeekV2Tokenizer``. + To train any of the DeepSeek V2 models that :ref:`this Docker image supports `, use the ``DeepSeekV2Tokenizer``. 
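Across these tokenizer tabs, the common step is exporting the tokenizer settings before launching training. With the ``HuggingFaceTokenizer``, for example, pointing ``TOKENIZER_MODEL`` at a Hugging Face checkpoint typically looks like the sketch below (the model id is an illustrative choice; substitute any Llama 3 or 3.1 model the image supports, and note that gated Meta repositories also require an access token):

.. code-block:: shell

   export HF_TOKEN=<your_hugging_face_token>        # needed for gated Meta checkpoints
   export TOKENIZER_MODEL=meta-llama/Llama-3.1-8B   # Hugging Face model id used by HuggingFaceTokenizer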
Multi-node training
^^^^^^^^^^^^^^^^^^^
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
index 1535f1d43..07d640159 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
      - Components
      - Resources

+   * - v25.7
+     -
+       * ROCm 6.4.2
+       * PyTorch 2.8.0a0+gitd06a406
+     -
+       * :doc:`Documentation <../pytorch-training>`
+       * `Docker Hub `__
+
    * - v25.6
      -
        * ROCm 6.3.4
        * PyTorch 2.8.0a0+git7d205b2
      -
-       * :doc:`Documentation <../pytorch-training>`
+       * :doc:`Documentation `
        * `Docker Hub `__

    * - v25.5
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
index a43297657..e68a1092b 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:

       ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
new file mode 100644
index 000000000..f9bc57a43
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
@@ -0,0 +1,456 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of the ROCm PyTorch
+   training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The `PyTorch for ROCm training Docker `_
+(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators.
It includes the following software components to accelerate +training workloads: + ++--------------------------+--------------------------------+ +| Software component | Version | ++==========================+================================+ +| ROCm | 6.3.4 | ++--------------------------+--------------------------------+ +| PyTorch | 2.8.0a0+git7d205b2 | ++--------------------------+--------------------------------+ +| Python | 3.10.17 | ++--------------------------+--------------------------------+ +| Transformer Engine | 1.14.0+2f85f5f2 | ++--------------------------+--------------------------------+ +| Flash Attention | 3.0.0.post1 | ++--------------------------+--------------------------------+ +| hipBLASLt | 0.15.0-8c6919d | ++--------------------------+--------------------------------+ +| Triton | 3.3.0 | ++--------------------------+--------------------------------+ + +.. _amd-pytorch-training-model-support-v256: + +Supported models +================ + +The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml + + {% set unified_docker = data.unified_docker.latest %} + {% set model_groups = data.model_groups %} + + .. raw:: html + +
+      <!-- Workload/model selector widget; the HTML markup was lost in
+           extraction. It renders a "Workload" dropdown listing
+           {{ model_group.group }} for each model group and a "Model" dropdown
+           listing {{ model.model }} for each model in the group. -->
+ + .. note:: + + Some models require an external license agreement through a third party (for example, Meta). + + .. _amd-pytorch-training-performance-measurements-v256: + + Performance measurements + ======================== + + To evaluate performance, the + `Performance results with AMD ROCm software `_ + page provides reference throughput and latency measurements for training + popular AI models. + + .. note:: + + The performance data presented in + `Performance results with AMD ROCm software `_ + should not be interpreted as the peak performance achievable by AMD + Instinct MI325X and MI300X accelerators or ROCm software. + + System validation + ================= + + Before running AI workloads, it's important to validate that your AMD hardware is configured + correctly and performing optimally. + + If you have already validated your system settings, including aspects like NUMA auto-balancing, you + can skip this step. Otherwise, complete the procedures in the :ref:`System validation and + optimization ` guide to properly configure your system settings + before starting training. + + To test for optimal performance, consult the recommended :ref:`System health benchmarks + `. This suite of tests will help you verify and fine-tune your + system's configuration. + + This Docker image is optimized for specific model configurations outlined + below. Performance can vary for other training workloads, as AMD + doesn’t validate configurations and run conditions outside those described. + + Benchmarking + ============ + + Once the setup is complete, choose between two options to start benchmarking: + + .. tab-set:: + + .. tab-item:: MAD-integrated benchmarking + + Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt + + {% for model_group in model_groups %} + {% for model in model_group.models %} + + .. container:: model-doc {{ model.mad_tag }} + + For example, use this command to run the performance benchmark test on the {{ model.model }} model + using one GPU with the {{ model.precision }} data type on the host machine. + + .. code-block:: shell + + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{ model.mad_tag }} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 + + MAD launches a Docker container with the name + ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the + model are collected in the following path: ``~/MAD/perf.csv``. + + {% endfor %} + {% endfor %} + + .. tab-item:: Standalone benchmarking + + .. rubric:: Download the Docker image and required packages + + Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ unified_docker.pull_tag }} + + Run the Docker container. + + .. code-block:: shell + + docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }} + + Use these commands if you exit the ``training_env`` container and need to return to it. + + .. 
code-block:: shell + + docker start training_env + docker exec -it training_env bash + + In the Docker container, clone the ``__ + repository and navigate to the benchmark scripts directory + ``/workspace/MAD/scripts/pytorch_train``. + + .. code-block:: shell + + git clone https://github.com/ROCm/MAD + cd MAD/scripts/pytorch_train + + .. rubric:: Prepare training datasets and dependencies + + The following benchmarking examples require downloading models and datasets + from Hugging Face. To ensure successful access to gated repos, set your + ``HF_TOKEN``. + + .. code-block:: shell + + export HF_TOKEN=$your_personal_hugging_face_access_token + + Run the setup script to install libraries and datasets needed for benchmarking. + + .. code-block:: shell + + ./pytorch_benchmark_setup.sh + + .. container:: model-doc pyt_train_llama-3.1-8b + + ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B: + + .. list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``accelerate`` + - `Hugging Face Accelerate `_ + + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 + + .. container:: model-doc pyt_train_llama-3.1-70b + + ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B: + + .. list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 + + * - ``torchdata`` + - `TorchData `_ + + * - ``tomli`` + - `Tomli `_ + + * - ``tiktoken`` + - `tiktoken `_ + + * - ``blobfile`` + - `blobfile `_ + + * - ``tabulate`` + - `tabulate `_ + + * - ``wandb`` + - `Weights & Biases `_ + + * - ``sentencepiece`` + - `SentencePiece `_ 0.2.0 + + * - ``tensorboard`` + - `TensorBoard `_ 2.18.0 + + .. container:: model-doc pyt_train_flux + + ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX: + + .. list-table:: + :header-rows: 1 + + * - Library + - Reference + + * - ``accelerate`` + - `Hugging Face Accelerate `_ + + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 + + * - ``sentencepiece`` + - `SentencePiece `_ 0.2.0 + + * - ``tensorboard`` + - `TensorBoard `_ 2.18.0 + + * - ``csvkit`` + - `csvkit `_ 2.0.1 + + * - ``deepspeed`` + - `DeepSpeed `_ 0.16.2 + + * - ``diffusers`` + - `Hugging Face Diffusers `_ 0.31.0 + + * - ``GitPython`` + - `GitPython `_ 3.1.44 + + * - ``opencv-python-headless`` + - `opencv-python-headless `_ 4.10.0.84 + + * - ``peft`` + - `PEFT `_ 0.14.0 + + * - ``protobuf`` + - `Protocol Buffers `_ 5.29.2 + + * - ``pytest`` + - `PyTest `_ 8.3.4 + + * - ``python-dotenv`` + - `python-dotenv `_ 1.0.1 + + * - ``seaborn`` + - `Seaborn `_ 0.13.2 + + * - ``transformers`` + - `Transformers `_ 4.47.0 + + ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face: + + * `bghira/pseudo-camera-10k `_ + + {% for model_group in model_groups %} + {% for model in model_group.models %} + {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %} + + .. container:: model-doc {{ model.mad_tag }} + + .. rubric:: Pretraining + + To start the pre-training benchmark, use the following command with the + appropriate options. See the following list of options and their descriptions. + + .. code-block:: shell + + ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length + + .. 
list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
+               * - ``$datatype``
+                 - ``BF16`` or ``FP8``
+                 - Only Llama 3.1 8B supports FP8 precision.
+               {% else %}
+               * - ``$datatype``
+                 - ``BF16``
+                 - Only Llama 3.1 8B supports FP8 precision.
+               {% endif %}
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. note::
+
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev `_
+                  and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+            {% endif %}
+
+            {% if model_group.tag == "fine-tuning" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. rubric:: Fine-tuning
+
+               To start the fine-tuning benchmark, use the following command with the
+               appropriate options. See the following list of options and their descriptions.
+
+               .. code-block:: shell
+
+                  ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
+
+               .. list-table::
+                  :header-rows: 1
+
+                  * - Name
+                    - Options
+                    - Description
+
+                  * - ``$training_mode``
+                    - ``finetune_fw``
+                    - Full weight fine-tuning (BF16 supported)
+
+                  * -
+                    - ``finetune_lora``
+                    - LoRA fine-tuning (BF16 supported)
+
+                  * -
+                    - ``finetune_qlora``
+                    - QLoRA fine-tuning (BF16 supported)
+
+                  * -
+                    - ``HF_finetune_lora``
+                    - LoRA fine-tuning with Hugging Face PEFT
+
+                  * - ``$datatype``
+                    - ``BF16``
+                    - All models support BF16.
+
+                  * - ``$sequence_length``
+                    - Between 2048 and 16384.
+                    - Sequence length for the language model.
+
+               .. note::
+
+                  {{ model.model }} currently supports the following fine-tuning methods:
+
+                  {% for method in model.training_modes %}
+                  * ``{{ method }}``
+                  {% endfor %}
+                  {% if model.training_modes|length < 4 %}
+
+                  The upstream `torchtune `_ repository
+                  does not currently provide YAML configuration files for other combinations
+                  of model and fine-tuning method. However, you can still configure your own
+                  YAML files to enable support for fine-tuning methods not listed here by
+                  following existing patterns in the
+                  ``/workspace/torchtune/recipes/configs`` directory.
+                  {% endif %}
+            {% endif %}
+            {% endfor %}
+            {% endfor %}
+
+   .. rubric:: Benchmarking examples
+
+   For examples of benchmarking commands, see ``__.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide `__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization `_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub `_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
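As a concrete end-to-end example for this v25.6 image: from ``/workspace/MAD/scripts/pytorch_train`` inside the container, a single-node run combining the flags documented above might look like this (the model, training mode, and sequence length are example choices drawn from the supported combinations listed earlier):

.. code-block:: shell

   # Full-weight fine-tuning of Llama 3.1 8B in BF16 with a 4096-token sequence length.
   ./pytorch_benchmark_report.sh -t finetune_fw -m Llama-3.1-8B -p BF16 -s 4096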
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst index 46b9daf2f..e7258e07b 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst @@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm PyTorch is an open-source machine learning framework that is widely used for model training with GPU-optimized components for transformer-based models. -The `PyTorch for ROCm training Docker `_ -(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a -model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate -training workloads: +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml -+--------------------------+--------------------------------+ -| Software component | Version | -+==========================+================================+ -| ROCm | 6.3.4 | -+--------------------------+--------------------------------+ -| PyTorch | 2.8.0a0+git7d205b2 | -+--------------------------+--------------------------------+ -| Python | 3.10.17 | -+--------------------------+--------------------------------+ -| Transformer Engine | 1.14.0+2f85f5f2 | -+--------------------------+--------------------------------+ -| Flash Attention | 3.0.0.post1 | -+--------------------------+--------------------------------+ -| hipBLASLt | 0.15.0-8c6919d | -+--------------------------+--------------------------------+ -| Triton | 3.3.0 | -+--------------------------+--------------------------------+ + {% set dockers = data.dockers %} + {% set docker = dockers[0] %} + The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__ + (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a + model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate + training workloads: + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} .. _amd-pytorch-training-model-support: @@ -38,26 +35,27 @@ Supported models ================ The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators. +Some instructions, commands, and training recommendations in this documentation might +vary by model -- select one to get started. .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml - {% set unified_docker = data.unified_docker.latest %} + {% set unified_docker = data.dockers[0] %} {% set model_groups = data.model_groups %} - .. raw:: html
-      <!-- Selector widget: "Workload" and "Model" dropdown labels -->
+      <!-- Selector widget: dropdown labels renamed to "Model group" and
+           "Model variant"; the {{ model_group.group }} and {{ model.model }}
+           option loops are unchanged. (Widget HTML markup lost in extraction.) -->

@@ -73,84 +71,116 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
- .. note:: - Some models require an external license agreement through a third party (for example, Meta). + .. _amd-pytorch-training-supported-training-modes: - .. _amd-pytorch-training-performance-measurements: + The following table lists supported training modes per model. - Performance measurements - ======================== + .. dropdown:: Supported training modes - To evaluate performance, the + .. list-table:: + :header-rows: 1 + + * - Model + - Supported training modes + + {% for model_group in model_groups %} + {% set models = model_group.models %} + {% for model in models %} + * - {{ model.model }} + - ``{{ model.training_modes | join('``, ``') }}`` + + {% endfor %} + {% endfor %} + + .. note:: + + Some model and fine-tuning combinations are not listed. This is + because the `upstream torchtune repository `__ + doesn't provide default YAML configurations for them. + For advanced usage, you can create a custom configuration to enable + unlisted fine-tuning methods by using an existing file in the + ``/workspace/torchtune/recipes/configs`` directory as a template. + +.. _amd-pytorch-training-performance-measurements: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `_ +page provides reference throughput and latency measurements for training +popular AI models. + +.. note:: + + The performance data presented in `Performance results with AMD ROCm software `_ - page provides reference throughput and latency measurements for training - popular AI models. + should not be interpreted as the peak performance achievable by AMD + Instinct MI325X and MI300X accelerators or ROCm software. - .. note:: +System validation +================= - The performance data presented in - `Performance results with AMD ROCm software `_ - should not be interpreted as the peak performance achievable by AMD - Instinct MI325X and MI300X accelerators or ROCm software. +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. - System validation - ================= +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. - Before running AI workloads, it's important to validate that your AMD hardware is configured - correctly and performing optimally. +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. - If you have already validated your system settings, including aspects like NUMA auto-balancing, you - can skip this step. Otherwise, complete the procedures in the :ref:`System validation and - optimization ` guide to properly configure your system settings - before starting training. +This Docker image is optimized for specific model configurations outlined +below. Performance can vary for other training workloads, as AMD +doesn’t test configurations and run conditions outside those described. - To test for optimal performance, consult the recommended :ref:`System health benchmarks - `. This suite of tests will help you verify and fine-tune your - system's configuration. +Run training +============ - This Docker image is optimized for specific model configurations outlined - below. 
Performance can vary for other training workloads, as AMD - doesn’t validate configurations and run conditions outside those described. +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml - Benchmarking - ============ + {% set unified_docker = data.dockers[0] %} + {% set model_groups = data.model_groups %} - Once the setup is complete, choose between two options to start benchmarking: + Once the setup is complete, choose between two options to start benchmarking training: .. tab-set:: .. tab-item:: MAD-integrated benchmarking - Clone the ROCm Model Automation and Dashboarding (``__) repository to a local - directory and install the required packages on the host machine. + 1. Clone the ROCm Model Automation and Dashboarding (``__) repository to a local + directory and install the required packages on the host machine. - .. code-block:: shell + .. code-block:: shell - git clone https://github.com/ROCm/MAD - cd MAD - pip install -r requirements.txt + git clone https://github.com/ROCm/MAD + cd MAD + pip install -r requirements.txt {% for model_group in model_groups %} {% for model in model_group.models %} .. container:: model-doc {{ model.mad_tag }} - For example, use this command to run the performance benchmark test on the {{ model.model }} model - using one GPU with the {{ model.precision }} data type on the host machine. + 2. For example, use this command to run the performance benchmark test on the {{ model.model }} model + using one node with the {{ model.precision }} data type on the host machine. - .. code-block:: shell + .. code-block:: shell - export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" - madengine run \ - --tags {{ model.mad_tag }} \ - --keep-model-dir \ - --live-output \ - --timeout 28800 + export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models" + madengine run \ + --tags {{ model.mad_tag }} \ + --keep-model-dir \ + --live-output \ + --timeout 28800 - MAD launches a Docker container with the name - ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the - model are collected in the following path: ``~/MAD/perf.csv``. + MAD launches a Docker container with the name + ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the + model are collected in ``~/MAD/perf.csv``. {% endfor %} {% endfor %} @@ -159,222 +189,213 @@ The following models are pre-optimized for performance on the AMD Instinct MI325 .. rubric:: Download the Docker image and required packages - Use the following command to pull the Docker image from Docker Hub. + 1. Use the following command to pull the Docker image from Docker Hub. - .. code-block:: shell + .. code-block:: shell - docker pull {{ unified_docker.pull_tag }} + docker pull {{ unified_docker.pull_tag }} - Run the Docker container. + 2. Run the Docker container. - .. code-block:: shell + .. 
code-block:: shell - docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }} + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 64G \ + --name training_env \ + {{ unified_docker.pull_tag }} - Use these commands if you exit the ``training_env`` container and need to return to it. + Use these commands if you exit the ``training_env`` container and need to return to it. - .. code-block:: shell + .. code-block:: shell - docker start training_env - docker exec -it training_env bash + docker start training_env + docker exec -it training_env bash - In the Docker container, clone the ``__ - repository and navigate to the benchmark scripts directory - ``/workspace/MAD/scripts/pytorch_train``. + 3. In the Docker container, clone the ``__ + repository and navigate to the benchmark scripts directory + ``/workspace/MAD/scripts/pytorch_train``. - .. code-block:: shell + .. code-block:: shell - git clone https://github.com/ROCm/MAD - cd MAD/scripts/pytorch_train + git clone https://github.com/ROCm/MAD + cd MAD/scripts/pytorch_train .. rubric:: Prepare training datasets and dependencies - The following benchmarking examples require downloading models and datasets - from Hugging Face. To ensure successful access to gated repos, set your - ``HF_TOKEN``. + 1. The following benchmarking examples require downloading models and datasets + from Hugging Face. To ensure successful access to gated repos, set your + ``HF_TOKEN``. - .. code-block:: shell + .. code-block:: shell - export HF_TOKEN=$your_personal_hugging_face_access_token + export HF_TOKEN=$your_personal_hugging_face_access_token - Run the setup script to install libraries and datasets needed for benchmarking. + 2. Run the setup script to install libraries and datasets needed for benchmarking. - .. code-block:: shell + .. code-block:: shell - ./pytorch_benchmark_setup.sh + ./pytorch_benchmark_setup.sh - .. container:: model-doc pyt_train_llama-3.1-8b + .. container:: model-doc pyt_train_llama-3.1-8b - ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B: + ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B: - .. list-table:: - :header-rows: 1 + .. list-table:: + :header-rows: 1 - * - Library - - Reference + * - Library + - Reference - * - ``accelerate`` - - `Hugging Face Accelerate `_ + * - ``accelerate`` + - `Hugging Face Accelerate `_ - * - ``datasets`` - - `Hugging Face Datasets `_ 3.2.0 + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 - .. container:: model-doc pyt_train_llama-3.1-70b + .. container:: model-doc pyt_train_llama-3.1-70b - ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B: + ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B: - .. list-table:: - :header-rows: 1 + .. 
list-table:: + :header-rows: 1 - * - Library - - Reference + * - Library + - Reference - * - ``datasets`` - - `Hugging Face Datasets `_ 3.2.0 + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 - * - ``torchdata`` - - `TorchData `_ + * - ``torchdata`` + - `TorchData `_ - * - ``tomli`` - - `Tomli `_ + * - ``tomli`` + - `Tomli `_ - * - ``tiktoken`` - - `tiktoken `_ + * - ``tiktoken`` + - `tiktoken `_ - * - ``blobfile`` - - `blobfile `_ + * - ``blobfile`` + - `blobfile `_ - * - ``tabulate`` - - `tabulate `_ + * - ``tabulate`` + - `tabulate `_ - * - ``wandb`` - - `Weights & Biases `_ + * - ``wandb`` + - `Weights & Biases `_ - * - ``sentencepiece`` - - `SentencePiece `_ 0.2.0 + * - ``sentencepiece`` + - `SentencePiece `_ 0.2.0 - * - ``tensorboard`` - - `TensorBoard `_ 2.18.0 + * - ``tensorboard`` + - `TensorBoard `_ 2.18.0 - .. container:: model-doc pyt_train_flux + .. container:: model-doc pyt_train_flux - ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX: + ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX: - .. list-table:: - :header-rows: 1 + .. list-table:: + :header-rows: 1 - * - Library - - Reference + * - Library + - Reference - * - ``accelerate`` - - `Hugging Face Accelerate `_ + * - ``accelerate`` + - `Hugging Face Accelerate `_ - * - ``datasets`` - - `Hugging Face Datasets `_ 3.2.0 + * - ``datasets`` + - `Hugging Face Datasets `_ 3.2.0 - * - ``sentencepiece`` - - `SentencePiece `_ 0.2.0 + * - ``sentencepiece`` + - `SentencePiece `_ 0.2.0 - * - ``tensorboard`` - - `TensorBoard `_ 2.18.0 + * - ``tensorboard`` + - `TensorBoard `_ 2.18.0 - * - ``csvkit`` - - `csvkit `_ 2.0.1 + * - ``csvkit`` + - `csvkit `_ 2.0.1 - * - ``deepspeed`` - - `DeepSpeed `_ 0.16.2 + * - ``deepspeed`` + - `DeepSpeed `_ 0.16.2 - * - ``diffusers`` - - `Hugging Face Diffusers `_ 0.31.0 + * - ``diffusers`` + - `Hugging Face Diffusers `_ 0.31.0 - * - ``GitPython`` - - `GitPython `_ 3.1.44 + * - ``GitPython`` + - `GitPython `_ 3.1.44 - * - ``opencv-python-headless`` - - `opencv-python-headless `_ 4.10.0.84 + * - ``opencv-python-headless`` + - `opencv-python-headless `_ 4.10.0.84 - * - ``peft`` - - `PEFT `_ 0.14.0 + * - ``peft`` + - `PEFT `_ 0.14.0 - * - ``protobuf`` - - `Protocol Buffers `_ 5.29.2 + * - ``protobuf`` + - `Protocol Buffers `_ 5.29.2 - * - ``pytest`` - - `PyTest `_ 8.3.4 + * - ``pytest`` + - `PyTest `_ 8.3.4 - * - ``python-dotenv`` - - `python-dotenv `_ 1.0.1 + * - ``python-dotenv`` + - `python-dotenv `_ 1.0.1 - * - ``seaborn`` - - `Seaborn `_ 0.13.2 + * - ``seaborn`` + - `Seaborn `_ 0.13.2 - * - ``transformers`` - - `Transformers `_ 4.47.0 + * - ``transformers`` + - `Transformers `_ 4.47.0 - ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face: + ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face: - * `bghira/pseudo-camera-10k `_ + * `bghira/pseudo-camera-10k `_ {% for model_group in model_groups %} {% for model in model_group.models %} - {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %} + {% set training_modes = model.training_modes %} + {% set training_mode_descs = { + "pretrain": "Benchmark pre-training.", + "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision." + } %} + {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %} + {% if available_modes %} .. container:: model-doc {{ model.mad_tag }} - .. rubric:: Pretraining + .. 
rubric:: Pre-training
+
+               To start the pre-training benchmark, use the following command with the
+               appropriate options. See the following list of options and their descriptions.
+
+               .. code-block:: shell
+
+                  ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                     -m {{ model.model_repo }} \
+                     -p $datatype \
+                     -s $sequence_length

             {% if model.mad_tag == "pyt_train_flux" %}
             .. container:: model-doc {{ model.mad_tag }}

                .. note::

+                  Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
+                  To use FLUX, refer to the previous version of the ``pytorch-training`` Docker:
+                  :doc:`previous-versions/pytorch-training-v25.6`.
+
                   Occasionally, downloading the Flux dataset might fail. In the event of this
                   error, manually download it from Hugging Face at
                   `black-forest-labs/FLUX.1-dev `_
                   and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
                   the required dataset.
             {% endif %}

             .. list-table::
                :header-rows: 1
@@ -383,53 +404,143 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
                  - Options
                  - Description

-                * - ``$training_mode``
-                  - ``finetune_fw``
-                  - Full weight fine-tuning (BF16 supported)
-
-                * -
-                  - ``finetune_lora``
-                  - LoRA fine-tuning (BF16 supported)
-
-                * -
-                  - ``finetune_qlora``
-                  - QLoRA fine-tuning (BF16 supported)
-
-                * -
-                  - ``HF_finetune_lora``
-                  - LoRA fine-tuning with Hugging Face PEFT
+                {% for mode in available_modes %}
+                * - {% if loop.first %}``$training_mode``{% endif %}
+                  - ``{{ mode }}``
+                  - {{ training_mode_descs[mode] }}
+                {% endfor %}

                 * - ``$datatype``
-                  - ``BF16``
-                  - All models support BF16.
+                  - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                  - Only Llama 3.1 8B supports FP8 precision.
+
+                * - ``$sequence_length``
+                  - Between 2048 and 8192. 8192 by default.
+                  - Sequence length for the language model.
+                {% endif %}
+
+            {% set training_mode_descs = {
+               "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+               "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+               "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+               "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+            } %}
+            {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+            {% if available_modes %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. rubric:: Fine-tuning
+
+               To start the fine-tuning benchmark, use the following command with the
+               appropriate options.
See the following list of options and their descriptions.
+               See :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
+
+               .. code-block:: shell
+
+                  ./pytorch_benchmark_report.sh -t $training_mode \
+                     -m {{ model.model_repo }} \
+                     -p $datatype \
+                     -s $sequence_length
+
+               .. list-table::
+                  :header-rows: 1
+
+                  * - Name
+                    - Options
+                    - Description
+
+                  {% for mode in available_modes %}
+                  * - {% if loop.first %}``$training_mode``{% endif %}
+                    - ``{{ mode }}``
+                    - {{ training_mode_descs[mode] }}
+                  {% endfor %}
+
+                  * - ``$datatype``
+                    - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+                    - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}

                   * - ``$sequence_length``
                     - Between 2048 and 16384.
                     - Sequence length for the language model.

+               {% if model.mad_tag in ["pyt_train_llama-3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
                .. note::

-                  {{ model.model }} currently supports the following fine-tuning methods:
+                  For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+                  use the following torchtune commit for compatibility:

-                  {% for method in model.training_modes %}
-                  * ``{{ method }}``
-                  {% endfor %}
-                  {% if model.training_modes|length < 4 %}
+                  .. code-block:: shell
+
+                     git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+               {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+               .. note::
+
+                  You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+                  input tensor should be smaller than max_seq_len (4096)``.
+                  This error indicates that an input sequence is longer than the model's maximum context window.
+
+                  Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+                  tokens in this case). You can resolve this by truncating the input or splitting
+                  it into smaller chunks before passing it to the model.
+
+                  Note on reproducibility: The results in this guide are based on
+                  commit ``b4c98ac`` from the upstream
+                  ``__ repository. For the
+                  latest updates, you can use the main branch.

-                  The upstream `torchtune `_ repository
-                  does not currently provide YAML configuration files for other combinations of
-                  model to fine-tuning method
-                  However, you can still configure your own YAML files to enable support for
-                  fine-tuning methods not listed here by following existing patterns in the
-                  ``/workspace/torchtune/recipes/configs`` directory.
                {% endif %}
                {% endif %}
                {% endfor %}
                {% endfor %}

-      .. rubric:: Benchmarking examples
+   .. rubric:: Benchmarking examples

-      For examples of benchmarking commands, see ``__.
+   For examples of benchmarking commands, see ``__.
+
+Multi-node training
+-------------------
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch run_slurm_train.sh
+
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+..
code-block:: shell + + huggingface-cli login # Get access to HF Llama model space + huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally + # In the MAD repository + cd scripts/pytorch_train + sbatch Torchtune_Multinode.sh + +.. note:: + + Information regarding benchmark setup: + + * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``. + * You can adjust the torchtune `YAML configuration file + `__ + if you're using a different model. + * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``. + * Set the ``mounting_paths`` inside the SLURM script. + +Once the run is finished, you can find the log files in the ``result_torchtune/`` directory. Further reading ===============
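For the multi-node fine-tuning launch described above, the end-to-end submission flow on a SLURM cluster is sketched below (``sbatch`` and ``squeue`` are standard SLURM commands, not specific to this repository; the exact log file names under ``result_torchtune/`` depend on the script's settings):

.. code-block:: shell

   sbatch Torchtune_Multinode.sh   # submit the job; SLURM prints "Submitted batch job <jobid>"
   squeue -u $USER                 # check that the job is pending or running across nodes
   ls result_torchtune/            # once the run finishes, inspect the log files here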