mirror of https://github.com/ROCm/ROCm.git (synced 2026-02-01 09:55:00 -05:00)

Compare commits: 1 commit

| Author | SHA1 | Date |
|---|---|---|
|  | 69360270f5 |  |

@@ -39,6 +39,7 @@ additional licenses. Please review individual repositories for more information.
 | [hipBLASLt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/LICENSE.md) |
 | [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
 | [hipCUB](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub/) | [Custom](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipcub/LICENSE.txt) |
+| [hipDNN](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipdnn/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipdnn/LICENSE.md) |
 | [hipFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/LICENSE.md) |
 | [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
 | [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |

@@ -138,14 +138,12 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.8", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.11", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.8", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.11", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},

@@ -156,12 +154,10 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.8", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.11", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.8", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.11", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},

@@ -1,13 +1,15 @@
 docker:
-  pull_tag: rocm/primus:v26.1
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
 components:
   ROCm: 7.1.0
+  Primus: 0.3.0
+  Primus Turbo: 0.1.1
   PyTorch: 2.10.0.dev20251112+rocm7.1
   Python: "3.10"
-  Transformer Engine: 2.6.0.dev0+f141f34b
+  Transformer Engine: 2.4.0.dev0+32e2d1d4
   Flash Attention: 2.8.3
-  hipBLASLt: 34459f66ea
+  hipBLASLt: 1.2.0-09ab7153e2
   Triton: 3.4.0
   RCCL: 2.27.7
 model_groups:

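The hunk above swaps the pinned image and component set for this data file. A quick way to check that a pulled image actually matches the pinned components is to query versions inside the container. This is a minimal sketch, not part of the diff, assuming the `rocm/primus:v25.10` tag from the new side of the hunk and that `python` is on the container's PATH:

```shell
# Illustrative spot-check of pinned components (assumptions noted per line).
docker run --rm rocm/primus:v25.10 /bin/bash -c '
  python -c "import torch; print(\"PyTorch:\", torch.__version__)"  # expect 2.10.0.dev20251112+rocm7.1
  python --version                                                  # expect Python 3.10
'
```
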
@@ -1,47 +0,0 @@
-docker:
-  pull_tag: rocm/primus:v25.11
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3
-components:
-  ROCm: 7.1.0
-  PyTorch: 2.10.0.dev20251112+rocm7.1
-  Python: "3.10"
-  Transformer Engine: 2.4.0.dev0+32e2d1d4
-  Flash Attention: 2.8.3
-  hipBLASLt: 1.2.0-09ab7153e2
-  Triton: 3.4.0
-  RCCL: 2.27.7
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
-      - model: Llama 3.1 8B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
-      - model: Llama 3.1 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
-      - model: Llama 2 7B
-        mad_tag: pyt_megatron_lm_train_llama-2-7b
-      - model: Llama 2 70B
-        mad_tag: pyt_megatron_lm_train_llama-2-70b
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
-      - model: DeepSeek-V2-Lite
-        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-72b

@@ -1,58 +0,0 @@
-docker:
-  pull_tag: rocm/primus:v25.11
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3
-components:
-  ROCm: 7.1.0
-  PyTorch: 2.10.0.dev20251112+rocm7.1
-  Python: "3.10"
-  Transformer Engine: 2.4.0.dev0+32e2d1d4
-  Flash Attention: 2.8.3
-  hipBLASLt: 1.2.0-09ab7153e2
-  Triton: 3.4.0
-  RCCL: 2.27.7
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
-        config_name: llama3.3_70B-pretrain.yaml
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
-        config_name: llama3.1_70B-pretrain.yaml
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
-        config_name: llama3.1_8B-pretrain.yaml
-      - model: Llama 2 7B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
-        config_name: llama2_7B-pretrain.yaml
-      - model: Llama 2 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
-        config_name: llama2_70B-pretrain.yaml
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
-        config_name: deepseek_v3-pretrain.yaml
-      - model: DeepSeek-V2-Lite
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
-        config_name: deepseek_v2_lite-pretrain.yaml
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
-        config_name: mixtral_8x7B_v0.1-pretrain.yaml
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
-        config_name: mixtral_8x22B_v0.1-pretrain.yaml
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
-        config_name: primus_qwen2.5_7B-pretrain.yaml
-      - model: Qwen 2.5 72B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
-        config_name: qwen2.5_72B-pretrain.yaml

@@ -1,32 +0,0 @@
-docker:
-  pull_tag: rocm/primus:v25.11
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3
-components:
-  ROCm: 7.1.0
-  PyTorch: 2.10.0.dev20251112+rocm7.1
-  Python: "3.10"
-  Transformer Engine: 2.4.0.dev0+32e2d1d4
-  Flash Attention: 2.8.3
-  hipBLASLt: 1.2.0-09ab7153e2
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: BF16
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B
-        precision: BF16
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek V3 16B
-        mad_tag: primus_pyt_train_deepseek-v3-16b
-        model_repo: DeepSeek-V3
-        url: https://huggingface.co/deepseek-ai/DeepSeek-V3
-        precision: BF16

@@ -1,195 +0,0 @@
-docker:
-  pull_tag: rocm/primus:v25.11
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3
-components:
-  ROCm: 7.1.0
-  PyTorch: 2.10.0.dev20251112+rocm7.1
-  Python: "3.10"
-  Transformer Engine: 2.4.0.dev0+32e2d1d4
-  Flash Attention: 2.8.3
-  hipBLASLt: 1.2.0-09ab7153e2
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 4 Scout 17B-16E
-        mad_tag: pyt_train_llama-4-scout-17b-16e
-        model_repo: Llama-4-17B_16E
-        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 3.3 70B
-        mad_tag: pyt_train_llama-3.3-70b
-        model_repo: Llama-3.3-70B
-        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-      - model: Llama 3.2 1B
-        mad_tag: pyt_train_llama-3.2-1b
-        model_repo: Llama-3.2-1B
-        url: https://huggingface.co/meta-llama/Llama-3.2-1B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 3.2 3B
-        mad_tag: pyt_train_llama-3.2-3b
-        model_repo: Llama-3.2-3B
-        url: https://huggingface.co/meta-llama/Llama-3.2-3B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 3.2 Vision 11B
-        mad_tag: pyt_train_llama-3.2-vision-11b
-        model_repo: Llama-3.2-Vision-11B
-        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
-        precision: BF16
-        training_modes: [finetune_fw]
-      - model: Llama 3.2 Vision 90B
-        mad_tag: pyt_train_llama-3.2-vision-90b
-        model_repo: Llama-3.2-Vision-90B
-        url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
-        precision: BF16
-        training_modes: [finetune_fw]
-      - model: Llama 3.1 8B
-        mad_tag: pyt_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: BF16
-        training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
-      - model: Llama 3.1 70B
-        mad_tag: pyt_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-        precision: BF16
-        training_modes: [pretrain, finetune_fw, finetune_lora]
-      - model: Llama 3.1 405B
-        mad_tag: pyt_train_llama-3.1-405b
-        model_repo: Llama-3.1-405B
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B
-        precision: BF16
-        training_modes: [finetune_qlora]
-      - model: Llama 3 8B
-        mad_tag: pyt_train_llama-3-8b
-        model_repo: Llama-3-8B
-        url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 3 70B
-        mad_tag: pyt_train_llama-3-70b
-        model_repo: Llama-3-70B
-        url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 2 7B
-        mad_tag: pyt_train_llama-2-7b
-        model_repo: Llama-2-7B
-        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-      - model: Llama 2 13B
-        mad_tag: pyt_train_llama-2-13b
-        model_repo: Llama-2-13B
-        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Llama 2 70B
-        mad_tag: pyt_train_llama-2-70b
-        model_repo: Llama-2-70B
-        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-        precision: BF16
-        training_modes: [finetune_lora, finetune_qlora]
-  - group: OpenAI
-    tag: openai
-    models:
-      - model: GPT OSS 20B
-        mad_tag: pyt_train_gpt_oss_20b
-        model_repo: GPT-OSS-20B
-        url: https://huggingface.co/openai/gpt-oss-20b
-        precision: BF16
-        training_modes: [HF_finetune_lora]
-      - model: GPT OSS 120B
-        mad_tag: pyt_train_gpt_oss_120b
-        model_repo: GPT-OSS-120B
-        url: https://huggingface.co/openai/gpt-oss-120b
-        precision: BF16
-        training_modes: [HF_finetune_lora]
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek V2 16B
-        mad_tag: primus_pyt_train_deepseek-v2
-        model_repo: DeepSeek-V2
-        url: https://huggingface.co/deepseek-ai/DeepSeek-V2
-        precision: BF16
-        training_modes: [pretrain]
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 3 8B
-        mad_tag: pyt_train_qwen3-8b
-        model_repo: Qwen3-8B
-        url: https://huggingface.co/Qwen/Qwen3-8B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Qwen 3 32B
-        mad_tag: pyt_train_qwen3-32b
-        model_repo: Qwen3-32
-        url: https://huggingface.co/Qwen/Qwen3-32B
-        precision: BF16
-        training_modes: [finetune_lora]
-      - model: Qwen 2.5 32B
-        mad_tag: pyt_train_qwen2.5-32b
-        model_repo: Qwen2.5-32B
-        url: https://huggingface.co/Qwen/Qwen2.5-32B
-        precision: BF16
-        training_modes: [finetune_lora]
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_train_qwen2.5-72b
-        model_repo: Qwen2.5-72B
-        url: https://huggingface.co/Qwen/Qwen2.5-72B
-        precision: BF16
-        training_modes: [finetune_lora]
-      - model: Qwen 2 1.5B
-        mad_tag: pyt_train_qwen2-1.5b
-        model_repo: Qwen2-1.5B
-        url: https://huggingface.co/Qwen/Qwen2-1.5B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-      - model: Qwen 2 7B
-        mad_tag: pyt_train_qwen2-7b
-        model_repo: Qwen2-7B
-        url: https://huggingface.co/Qwen/Qwen2-7B
-        precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
-  - group: Stable Diffusion
-    tag: sd
-    models:
-      - model: Stable Diffusion XL
-        mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
-        model_repo: SDXL
-        url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
-        precision: BF16
-        training_modes: [posttrain]
-  - group: Flux
-    tag: flux
-    models:
-      - model: FLUX.1-dev
-        mad_tag: pyt_train_flux
-        model_repo: Flux
-        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-        precision: BF16
-        training_modes: [posttrain]
-  - group: NCF
-    tag: ncf
-    models:
-      - model: NCF
-        mad_tag: pyt_ncf_training
-        model_repo:
-        url: https://github.com/ROCm/FluxBenchmark
-        precision: FP32
-  - group: DLRM
-    tag: dlrm
-    models:
-      - model: DLRM v2
-        mad_tag: pyt_train_dlrm
-        model_repo: DLRM
-        url: https://github.com/AMD-AGI/DLRMBenchmark
-        training_modes: [pretrain]

@@ -1,13 +1,13 @@
 docker:
-  pull_tag: rocm/primus:v26.1
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
+  pull_tag: rocm/primus:v25.11
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
 components:
   ROCm: 7.1.0
   PyTorch: 2.10.0.dev20251112+rocm7.1
   Python: "3.10"
-  Transformer Engine: 2.6.0.dev0+f141f34b
+  Transformer Engine: 2.4.0.dev0+32e2d1d4
   Flash Attention: 2.8.3
-  hipBLASLt: 34459f66ea
+  hipBLASLt: 1.2.0-09ab7153e2
   Triton: 3.4.0
   RCCL: 2.27.7
 model_groups:

@@ -1,13 +1,13 @@
 docker:
-  pull_tag: rocm/primus:v26.1
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
+  pull_tag: rocm/primus:v25.11
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
 components:
   ROCm: 7.1.0
   PyTorch: 2.10.0.dev20251112+rocm7.1
   Python: "3.10"
-  Transformer Engine: 2.6.0.dev0+f141f34b
+  Transformer Engine: 2.4.0.dev0+32e2d1d4
   Flash Attention: 2.8.3
-  hipBLASLt: 34459f66ea
+  hipBLASLt: 1.2.0-09ab7153e2
 model_groups:
   - group: Meta Llama
     tag: llama

@@ -1,13 +1,15 @@
 docker:
-  pull_tag: rocm/primus:v26.1
-  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
 components:
   ROCm: 7.1.0
+  Primus: 0.3.0
+  Primus Turbo: 0.1.1
   PyTorch: 2.10.0.dev20251112+rocm7.1
   Python: "3.10"
-  Transformer Engine: 2.6.0.dev0+f141f34b
+  Transformer Engine: 2.4.0.dev0+32e2d1d4
   Flash Attention: 2.8.3
-  hipBLASLt: 34459f66ea
+  hipBLASLt: 1.2.0-09ab7153e2
 model_groups:
   - group: Meta Llama
     tag: llama

@@ -52,7 +52,7 @@ accelerate training workloads:
              - {{ component_version }}
            {% endfor %}

-.. _amd-megatron-lm-model-support-v26.01:
+.. _amd-megatron-lm-model-support-v25.11:

 Supported models
 ================

@@ -97,7 +97,7 @@ accelerate training workloads:
    Some models, such as Llama, require an external license agreement through
    a third party (for example, Meta).

-.. _amd-megatron-lm-performance-measurements-v26.01:
+.. _amd-megatron-lm-performance-measurements-v25.11:

 Performance measurements
 ========================

@@ -129,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.

-.. _mi300x-amd-megatron-lm-training-v26.01:
+.. _mi300x-amd-megatron-lm-training-v25.11:

 Environment setup
 =================

@@ -138,7 +138,7 @@ Use the following instructions to set up the environment, configure the script t
 reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
 image.

-.. _amd-megatron-lm-requirements-v26.01:
+.. _amd-megatron-lm-requirements-v25.11:

 Download the Docker image
 -------------------------

@@ -190,7 +190,7 @@ Download the Docker image
    The Docker container hosts a verified commit of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.

-.. _amd-megatron-lm-environment-setup-v26.01:
+.. _amd-megatron-lm-environment-setup-v25.11:

 Configuration
 =============

@@ -200,39 +200,39 @@ Configuration
    Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b

    Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy

    Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b

    Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy

    Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v26.01>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. note::

-   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v26.01>` for more information on configuration options.
+   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v25.11>` for more information on configuration options.

 Multi-node configuration
 ------------------------

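As the context lines above note, options for these `train_*.sh` scripts can be passed on the command line. A minimal sketch of that invocation style, using the two variables that appear verbatim later in this diff (`RECOMPUTE_ACTIVATIONS`, `CKPT_FORMAT`); any other knobs are listed under the guide's Key options section:

```shell
# Override selected options per run instead of editing the script.
RECOMPUTE_ACTIVATIONS=full \
CKPT_FORMAT=torch_dist \
bash examples/llama/train_llama3.sh
```
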
@@ -240,7 +240,7 @@ Multi-node configuration
 Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
 training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.

-.. _amd-megatron-lm-tokenizer-v26.01:
+.. _amd-megatron-lm-tokenizer-v25.11:

 Tokenizer
 ---------

@@ -377,7 +377,7 @@ Download the dataset

    ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
    Remember to either pre-download the tokenizer or setup Hugging Face access
-   otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v26.01>` section.
+   otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v25.11>` section.

 .. note::

@@ -479,13 +479,13 @@ Download the dataset

    Ensure that the files are accessible inside the Docker container.

-.. _amd-megatron-lm-run-training-v26.01:
+.. _amd-megatron-lm-run-training-v25.11:

 Run training
 ============

 Use the following example commands to set up the environment, configure
-:ref:`key options <amd-megatron-lm-benchmark-test-vars-v26.01>`, and run training on
+:ref:`key options <amd-megatron-lm-benchmark-test-vars-v25.11>`, and run training on
 MI300X Series GPUs with the AMD Megatron-LM environment.

 Before starting training, export the following environment variables.

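The exact export list referenced by "export the following environment variables" was not captured in this diff view. As one hedged example only, gated Hugging Face models (see the Tokenizer hunk above) typically need an access token exported before launch:

```shell
# Assumption: HF_TOKEN is one such variable; consult the guide for the full list.
export HF_TOKEN="<your Hugging Face access token>"
```
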
@@ -920,7 +920,7 @@ Single node training
          RECOMPUTE_ACTIVATIONS=full \
          CKPT_FORMAT=torch_dist

-.. _amd-megatron-lm-multi-node-examples-v26.01:
+.. _amd-megatron-lm-multi-node-examples-v25.11:

 Multi-node training examples
 ----------------------------

@@ -971,7 +971,7 @@ training on 16 nodes, try the following command:

    sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh

-.. _amd-megatron-lm-benchmark-test-vars-v26.01:
+.. _amd-megatron-lm-benchmark-test-vars-v25.11:

 Key options
 -----------

@@ -16,23 +16,14 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
      - Components
      - Resources

-   * - v26.1 (latest)
+   * - v25.11
      -
        * ROCm 7.1.0
        * PyTorch 2.10.0.dev20251112+rocm7.1
      -
        * :doc:`Primus Megatron documentation <../primus-megatron>`
        * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d>`__
+       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__

-   * - v25.11
-     -
-       * ROCm 7.1.0
-       * PyTorch 2.10.0.dev20251112+rocm7.1
-     -
-       * :doc:`Primus Megatron documentation <primus-megatron-v25.11>`
-       * :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.10>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3>`__
-
    * - v25.10
      -

@@ -37,7 +37,7 @@ GPUs containing essential components, including PyTorch, ROCm libraries, and
 Megatron-LM utilities. It contains the following software components to
 accelerate training workloads:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml

    .. tab-set::

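Both sides of this hunk point `datatemplate:yaml` at a data file whose structure matches the YAML hunks earlier in this commit (`docker` and `model_groups` keys). A hedged sketch for sanity-checking the new-side data file from a checkout; the `docs/` prefix is an assumption about where the `/data/...` path is rooted, and PyYAML must be installed:

```shell
# Print what the datatemplate directive will see (path prefix is an assumption).
python3 -c '
import yaml
d = yaml.safe_load(open("docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml"))
print("image:", d["docker"]["pull_tag"])
for g in d["model_groups"]:
    print(g["group"], "->", [m["model"] for m in g["models"]])
'
```
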
@@ -146,7 +146,7 @@ image.
 Download the Docker image
 -------------------------

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml

    {% set docker = data.docker %}
    1. Use the following command to pull the Docker image from Docker Hub.

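The rendered step 1 resolves `{{ docker.pull_tag }}` from the data file named in the hunk. Assuming this data file is the one whose new side sets `pull_tag: rocm/primus:v25.11` earlier in this commit, the pull command would be:

```shell
# Tag taken from the new side of a YAML hunk above; verify against the data file.
docker pull rocm/primus:v25.11
```
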
@@ -811,7 +811,7 @@ Single node training
       Note that DeepSeek-V2-Lite is experiencing instability due to GPU memory access fault
       for large iterations.
       For stability, it's recommended to use Primus for this workload.
-      See :doc:`../primus-megatron`.
+      See :doc:`primus-megatron`.

 .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b

(File diff suppressed because it is too large.)

@@ -25,10 +25,10 @@ model training. Performance acceleration is powered by `Primus Turbo
    <https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
    deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
    The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
-   including Megatron-LM and :doc:`torchtitan <../primus-pytorch>`.
+   including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.

 Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM
-training <../megatron-lm>` workflow. To learn how to migrate workloads from
+training <megatron-lm>` workflow. To learn how to migrate workloads from
 Megatron-LM to Primus with Megatron, see
 :doc:`megatron-lm-primus-migration-guide`.

@@ -36,7 +36,7 @@ AMD provides a ready-to-use Docker images for MI355X, MI350X,
 MI325X, and MI300X GPUs containing essential components for Primus, ROCm, and
 Megatron-LM.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    .. tab-set::

@@ -63,7 +63,7 @@ The following models are pre-optimized for performance on AMD Instinct GPUs.
 Some instructions, commands, and training examples in this documentation
 might vary by model -- select one to get started.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    {% set model_groups = data.model_groups %}
    .. raw:: html

@@ -120,7 +120,7 @@ system's configuration.
 Environment setup
 =================

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    Use the following instructions to set up the environment, configure the script to train models, and
    reproduce the benchmark results on AMD Instinct GPUs.

@@ -129,7 +129,7 @@ Environment setup

 Pull the Docker image

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    {% set docker = data.docker %}

@@ -175,7 +175,7 @@ Configuration
 Primus defines a training configuration in YAML for each model in
 `examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    {% set model_groups = data.model_groups %}
    {% for model_group in model_groups %}

@@ -805,7 +805,7 @@ To run training on multiple nodes, you can use the
 `run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/main/examples/run_slurm_pretrain.sh>`__
 to launch the multi-node workload. Use the following steps to setup your environment:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

    {% set docker = data.docker %}
    .. code-block:: shell

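A hedged sketch of the multi-node launch this hunk describes. Using `EXP` to select a config from `examples/megatron/configs` follows Primus convention and the config names appear elsewhere in this commit, but check `run_slurm_pretrain.sh` for the variables it actually reads:

```shell
# Hypothetical invocation; EXP and the config path are assumptions.
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
sbatch examples/run_slurm_pretrain.sh
```
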
(File diff suppressed because it is too large.)

@@ -24,17 +24,17 @@ Primus now supports the PyTorch torchtitan backend.
    <https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
    deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
    The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
-   including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
+   including torchtitan and :doc:`Megatron-LM <primus-megatron>`.

 Primus with the PyTorch torchtitan backend is designed to replace the
-:doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
-:doc:`../pytorch-training` to see steps to run workloads without Primus.
+:doc:`ROCm PyTorch training <pytorch-training>` workflow. See
+:doc:`pytorch-training` to see steps to run workloads without Primus.

 AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
 MI300X GPUs containing essential components for Primus and PyTorch training
 with Primus Turbo optimizations.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml

    .. tab-set::

@@ -61,7 +61,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
 Some instructions, commands, and training recommendations in this documentation might
 vary by model -- select one to get started.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml

    {% set model_groups = data.model_groups %}
    .. raw:: html

@@ -96,7 +96,7 @@ vary by model -- select one to get started.
 .. seealso::

    For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
-   see the documentation :doc:`../pytorch-training` (without Primus)
+   see the documentation :doc:`pytorch-training` (without Primus)

 .. _amd-primus-pytorch-performance-measurements-v2510:

@@ -122,7 +122,7 @@ doesn’t test configurations and run conditions outside those described.
 Pull the Docker image
 =====================

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml

    Use the following command to pull the Docker image from Docker Hub.

@@ -134,11 +134,11 @@ Run training
 ============

 Once the setup is complete, choose between the following two workflows to start benchmarking training.
-For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
+For fine-tuning workloads and multi-node training examples, see :doc:`pytorch-training` (without Primus).
 For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
 tweak some configurations (such as batch sizes).

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml

    {% set docker = data.docker %}
    {% set model_groups = data.model_groups %}

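The deleted page below spells out both workflows in full. Condensed from its MAD-integrated tab, the end-to-end benchmark flow looks like this; the `primus_pyt_train_llama-3.1-8b` tag comes from the deleted v25.11 data file earlier in this commit:

```shell
# Condensed from the MAD-integrated workflow in the deleted page below.
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
export MAD_SECRETS_HFTOKEN="<your Hugging Face token for gated models>"
madengine run \
  --tags primus_pyt_train_llama-3.1-8b \
  --keep-model-dir \
  --live-output \
  --timeout 28800
```
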
@@ -1,422 +0,0 @@
|
|||||||
:orphan:
|
|
||||||
|
|
||||||
.. meta::
|
|
||||||
:description: How to train a model using PyTorch for ROCm.
|
|
||||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
|
||||||
|
|
||||||
****************************************
|
|
||||||
Training a model with Primus and PyTorch
|
|
||||||
****************************************
|
|
||||||
|
|
||||||
.. caution::
|
|
||||||
|
|
||||||
This documentation does not reflect the latest version of ROCm Primus PyTorch training
|
|
||||||
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
|
|
||||||
|
|
||||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
|
||||||
LLM training framework designed to streamline training. It streamlines LLM
|
|
||||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
|
||||||
Primus now supports the PyTorch torchtitan backend.
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
|
||||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
|
||||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
|
||||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
|
||||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
|
||||||
|
|
||||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
|
||||||
:doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
|
|
||||||
:doc:`../pytorch-training` to see steps to run workloads without Primus.
|
|
||||||
|
|
||||||
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
|
|
||||||
MI300X GPUs containing essential components for Primus and PyTorch training
|
|
||||||
with Primus Turbo optimizations.
|
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.11-benchmark-models.yaml
|
|
||||||
|
|
||||||
.. tab-set::
|
|
||||||
|
|
||||||
.. tab-item:: {{ data.docker.pull_tag }}
|
|
||||||
:sync: {{ data.docker.pull_tag }}
|
|
||||||
|
|
||||||
.. list-table::
|
|
||||||
:header-rows: 1
|
|
||||||
|
|
||||||
* - Software component
|
|
||||||
- Version
|
|
||||||
|
|
||||||
{% for component_name, component_version in data.docker.components.items() %}
|
|
||||||
* - {{ component_name }}
|
|
||||||
- {{ component_version }}
|
|
||||||
{% endfor %}
|
|
||||||
|
|
||||||
.. _amd-primus-pytorch-model-support-v25.11:
|
|
||||||
|
|
||||||
Supported models
|
|
||||||
================
|
|
||||||
|
|
||||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
|
||||||
Some instructions, commands, and training recommendations in this documentation might
|
|
||||||
vary by model -- select one to get started.
|
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.11-benchmark-models.yaml
|
|
||||||
|
|
||||||
{% set model_groups = data.model_groups %}
|
|
||||||
.. raw:: html
|
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
|
||||||
<div class="row gx-0">
|
|
||||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
|
||||||
<div class="row col-10 pe-0">
|
|
||||||
{% for model_group in model_groups %}
|
|
||||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="row gx-0 pt-1">
|
|
||||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
|
||||||
<div class="row col-10 pe-0">
|
|
||||||
{% for model_group in model_groups %}
|
|
||||||
{% set models = model_group.models %}
|
|
||||||
{% for model in models %}
|
|
||||||
{% if models|length % 3 == 0 %}
|
|
||||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% else %}
|
|
||||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
.. seealso::
|
|
||||||
|
|
||||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
|
||||||
see the documentation :doc:`../pytorch-training` (without Primus)
|
|
||||||
|
|
||||||
.. _amd-primus-pytorch-performance-measurements-v25.11:

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t test configurations and run conditions outside those described.

Pull the Docker image
=====================

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.11-benchmark-models.yaml

Use the following command to pull the Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ data.docker.pull_tag }}

Run training
============

Once the setup is complete, choose between the following two workflows to start benchmarking training.
For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
tweak some configurations (such as batch sizes).

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.11-benchmark-models.yaml

{% set docker = data.docker %}
{% set model_groups = data.model_groups %}

.. tab-set::

.. tab-item:: MAD-integrated benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following run command is tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt

2. Use this command to run the performance benchmark test on the {{ model.model }} model
using one node with the {{ model.precision }} data type on the host machine.

.. code-block:: shell

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800

MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
model are collected in ``~/MAD/perf.csv``.
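
As a quick sanity check, you can print the most recent rows of that CSV on the host. This is a minimal sketch; it only assumes the default MAD output path noted above.

.. code-block:: shell

# Show the last few benchmark result rows collected by MAD
tail -n 5 ~/MAD/perf.csv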

{% endfor %}
{% endfor %}

.. tab-item:: Primus benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following run commands are tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

.. rubric:: Download the Docker image and required packages

1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ docker.pull_tag }}

2. Run the Docker container.

.. code-block:: shell

docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}

Use these commands if you exit the ``training_env`` container and need to return to it.

.. code-block:: shell

docker start training_env
docker exec -it training_env bash

The Docker container hosts verified commit ``c4c083de`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.

.. rubric:: Prepare training datasets and dependencies

The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.

.. code-block:: shell

export HF_TOKEN=$your_personal_hugging_face_access_token
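
Before launching a long run, you can optionally confirm the token is valid. This assumes the Hugging Face CLI is available in the container; skip the check otherwise.

.. code-block:: shell

# Optional sanity check: prints the account associated with HF_TOKEN
huggingface-cli whoami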

.. rubric:: Pretraining

To get started, navigate to the ``Primus`` directory in your container.

.. code-block:: shell

cd /workspace/Primus

Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
included with Primus with the appropriate options.
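
The general pattern is sketched below. The config path and the override flag are illustrative placeholders; the validated, copy-paste commands follow in the benchmarking examples.

.. code-block:: shell

# Illustrative pattern only: pick a config for your GPU and optionally
# append overrides (such as a batch-size change) after the script name
EXP=examples/torchtitan/configs/<GPU>/<model>-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 6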

.. rubric:: Benchmarking examples

.. container:: model-doc primus_pyt_train_llama-3.1-8b

Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 6

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

To train Llama 3.1 8B with FP8 precision, use the following command.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 7

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. container:: model-doc primus_pyt_train_llama-3.1-70b

Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 6

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

To train Llama 3.1 70B with FP8 precision, use the following command.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 5

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. container:: model-doc primus_pyt_train_deepseek-v3-16b

Use the following command to train DeepSeek V3 16B with BF16 precision using Primus torchtitan.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 10

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh
{% endfor %}
{% endfor %}

Further reading
===============

- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
@@ -16,30 +16,21 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
- Components
- Resources

* - v26.1 (latest)
-
* ROCm 7.1.0
* PyTorch 2.10.0.dev20251112+rocm7.1
-
* :doc:`Primus PyTorch training documentation <../primus-megatron>`
* :doc:`PyTorch training (legacy) documentation <../megatron-lm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v26.1/images/sha256-4fc8808bdb14117c6af7f38d79c809056e6fdbfd530c1fabbb61d097ddaf820d>`__

* - v25.11
-
* ROCm 7.1.0
* PyTorch 2.10.0.dev20251112+rocm7.1
-
* :doc:`Primus PyTorch training documentation <primus-pytorch-v25.11>`
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.11>`
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.11/images/sha256-71aa65a9bfc8e9dd18bce5b68c81caff864f223e9afa75dc1b719671a1f4a3c3>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__

* - v25.10
-
* ROCm 7.1.0
* PyTorch 2.10.0.dev20251112+rocm7.1
-
* :doc:`Primus PyTorch training documentation <primus-pytorch-v25.10>`
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.10>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.10>`
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__

@@ -49,7 +40,7 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
* Primus 0.3.0
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-
* :doc:`Primus PyTorch training documentation <primus-pytorch-v25.9>`
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.9>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.9>`
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
@@ -59,7 +50,7 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
* ROCm 6.4.3
* PyTorch 2.8.0a0+gitd06a406
-
* :doc:`Primus PyTorch training documentation <primus-pytorch-v25.8>`
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.8>`
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.8>`
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5>`__

@@ -30,7 +30,7 @@ environment for fine-tuning and pretraining a model on AMD Instinct MI325X
and MI300X GPUs. It includes the following software components to accelerate
training workloads:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

.. tab-set::

@@ -58,7 +58,7 @@ MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
training recommendations in this documentation might vary by model -- select
one to get started.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

{% set model_groups = data.model_groups %}
.. raw:: html
@@ -94,7 +94,7 @@ one to get started.

The following table lists supported training modes per model.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

{% set model_groups = data.model_groups %}
.. dropdown:: Supported training modes
@@ -164,7 +164,7 @@ doesn’t test configurations and run conditions outside those described.
Run training
============

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

{% set docker = data.docker %}
{% set model_groups = data.model_groups %}
@@ -1,669 +0,0 @@
:orphan:

.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

**************************************
Training a model with PyTorch on ROCm
**************************************

.. caution::

This documentation is not the latest version of the ROCm PyTorch training
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.

.. note::

For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.

See :doc:`../primus-pytorch` for details.

PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.
The PyTorch for ROCm training Docker image provides a prebuilt optimized
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
and MI300X GPUs. It includes the following software components to accelerate
training workloads:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.11-benchmark-models.yaml

.. tab-set::

.. tab-item:: {{ data.docker.pull_tag }}
:sync: {{ data.docker.pull_tag }}

.. list-table::
:header-rows: 1

* - Software component
- Version

{% for component_name, component_version in data.docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}

.. _amd-pytorch-training-model-support-v25.11:

Supported models
================

The following models are pre-optimized for performance on the AMD Instinct
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
training recommendations in this documentation might vary by model -- select
one to get started.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.11-benchmark-models.yaml

{% set model_groups = data.model_groups %}
.. raw:: html

<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row gx-0">
<div class="col-2 me-1 px-2 model-param-head">Model</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>

<div class="row gx-0 pt-1">
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
<div class="row col-10 pe-0">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>

.. _amd-pytorch-training-supported-training-modes-v25.11:

The following table lists supported training modes per model.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.11-benchmark-models.yaml

{% set model_groups = data.model_groups %}
.. dropdown:: Supported training modes

.. list-table::
:header-rows: 1

* - Model
- Supported training modes

{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if model.training_modes %}
* - {{ model.model }}
- ``{{ model.training_modes | join('``, ``') }}``

{% endif %}
{% endfor %}
{% endfor %}

.. note::

Some model and fine-tuning combinations are not listed. This is
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
doesn't provide default YAML configurations for them.
For advanced usage, you can create a custom configuration to enable
unlisted fine-tuning methods by using an existing file in the
``/workspace/torchtune/recipes/configs`` directory as a template.
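
For example, a custom recipe can start from a copy of a shipped config. The file names here are illustrative; pick whichever existing recipe is closest to your use case.

.. code-block:: shell

# Copy an existing torchtune recipe as the starting point for a custom config
cp /workspace/torchtune/recipes/configs/llama3_1/8B_lora.yaml my_custom_recipe.yaml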

.. _amd-pytorch-training-performance-measurements-v25.11:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X GPUs or ROCm software.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t test configurations and run conditions outside those described.

Run training
============

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.11-benchmark-models.yaml

{% set docker = data.docker %}
{% set model_groups = data.model_groups %}

Once the setup is complete, choose between two options to start benchmarking training:

.. tab-set::

.. tab-item:: MAD-integrated benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following run command is tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt

2. Use this command to run the performance benchmark test on the {{ model.model }} model
using one node with the {{ model.precision }} data type on the host machine.

.. code-block:: shell

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{ model.mad_tag }} \
--keep-model-dir \
--live-output \
--timeout 28800

MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
model are collected in ``~/MAD/perf.csv``.

{% endfor %}
{% endfor %}

.. tab-item:: Standalone benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following commands are tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

{% endfor %}
{% endfor %}

.. rubric:: Download the Docker image and required packages

1. Use the following command to pull the Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ docker.pull_tag }}

2. Launch the Docker container.

.. code-block:: shell

docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}

Use these commands if you exit the ``training_env`` container and need to return to it.

.. code-block:: shell

docker start training_env
docker exec -it training_env bash

3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train

.. rubric:: Prepare training datasets and dependencies

1. The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.

.. code-block:: shell

export HF_TOKEN=$your_personal_hugging_face_access_token
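
Alternatively, if you prefer the CLI-managed credential store, an equivalent (illustrative) approach is to log in once with the same token:

.. code-block:: shell

# Stores the token so later downloads don't rely on the environment variable
huggingface-cli login --token $HF_TOKEN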

2. Run the setup script to install libraries and datasets needed for benchmarking.

.. code-block:: shell

./pytorch_benchmark_setup.sh

.. container:: model-doc pyt_train_llama-3.1-8b

``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

.. list-table::
:header-rows: 1

* - Library
- Reference

* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

.. container:: model-doc pyt_train_llama-3.1-70b

``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

.. list-table::
:header-rows: 1

* - Library
- Reference

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

* - ``torchdata``
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__

* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`__

* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`__

* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`__

* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`__

* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`__

* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0

* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0

.. container:: model-doc pyt_train_flux

``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

.. list-table::
:header-rows: 1

* - Library
- Reference

* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0

* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0

* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0

* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1

* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2

* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0

* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44

* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84

* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0

* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2

* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4

* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1

* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2

* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0

``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__

{% for model_group in model_groups %}
{% for model in model_group.models %}
{% set training_modes = model.training_modes %}
{% set training_mode_descs = {
"pretrain": "Benchmark pre-training.",
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
} %}
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
{% if available_modes %}

.. container:: model-doc {{ model.mad_tag }}

.. rubric:: Pretraining

To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.

{% if model.mad_tag == "pyt_train_dlrm" %}

1. Go to the DLRM directory.

.. code-block:: shell

cd /workspace/DLRMBenchmark

2. To run the single node training benchmark for DLRM-v2 with TF32 precision,
run the following script.

.. code-block:: shell

./launch_training_single_node.sh

To run with MAD within the Docker container, use the following command.

.. code-block:: shell

./pytorch_benchmark_report.sh -t pretrain -m DLRM

{% else %}

.. code-block:: shell

./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length

{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}

.. note::

Currently, FLUX models are not supported out of the box on this Docker image.
To use FLUX, refer to the ``rocm/pytorch-training`` Docker image documented in :doc:`pytorch-training-v25.6`.

Occasionally, downloading the Flux dataset might fail. If this happens,
manually download the dataset from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
{% endif %}

.. list-table::
:header-rows: 1

* - Name
- Options
- Description

{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}

* - ``$datatype``
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
- Only Llama 3.1 8B supports FP8 precision.

* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
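
As an illustration, a concrete single-node pre-training invocation with these options filled in might look like the following. The model name is a placeholder; substitute the repository name of your selected model.

.. code-block:: shell

# Illustrative values: BF16 pre-training at the default sequence length
./pytorch_benchmark_report.sh -t pretrain -m Llama-3.1-8B -p BF16 -s 8192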
{% endif %}
{% endif %}

{% set training_modes = model.training_modes %}
{% set training_mode_descs = {
"posttrain": "Benchmark post-training.",
} %}
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
{% if available_modes %}

.. container:: model-doc {{ model.mad_tag }}

.. rubric:: Post-training

To start the post-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.

.. code-block:: shell

./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length

.. list-table::
:header-rows: 1

* - Name
- Options
- Description

{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}

* - ``$datatype``
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
- Only Llama 3.1 8B supports FP8 precision.

* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
{% endif %}

{% set training_mode_descs = {
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
} %}
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
{% if available_modes %}
.. container:: model-doc {{ model.mad_tag }}

.. rubric:: Fine-tuning

To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v25.11>`.

.. code-block:: shell

./pytorch_benchmark_report.sh -t $training_mode \
-m {{ model.model_repo }} \
-p $datatype \
-s $sequence_length

.. list-table::
:header-rows: 1

* - Name
- Options
- Description

{% for mode in available_modes %}
* - {% if loop.first %}``$training_mode``{% endif %}
- ``{{ mode }}``
- {{ training_mode_descs[mode] }}
{% endfor %}

* - ``$datatype``
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}

* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
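
As an illustration, a LoRA fine-tuning run with these options filled in might look like this. The model name is a placeholder; substitute the repository name of your selected model.

.. code-block:: shell

# Illustrative values: LoRA fine-tuning in BF16 with a 4096-token sequence length
./pytorch_benchmark_report.sh -t finetune_lora -m Llama-3.1-8B -p BF16 -s 4096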

{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
.. note::

For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
use the following torchtune commit for compatibility:

.. code-block:: shell

git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e

{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
.. note::

You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
input tensor should be smaller than max_seq_len (4096)``.
This error indicates that an input sequence is longer than the model's maximum context window.

Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
tokens in this case). You can resolve this by truncating the input or splitting
it into smaller chunks before passing it to the model.
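
With this benchmark script, the simplest guard is to pass a sequence length at or below the context window, for example (the model name is a placeholder for your selected Llama 2 variant):

.. code-block:: shell

# Keep Llama 2 inputs within the 4096-token context window
./pytorch_benchmark_report.sh -t finetune_lora -m Llama-2-7b -p BF16 -s 4096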

Note on reproducibility: The results in this guide are based on
commit ``b4c98ac`` from the upstream
`<https://github.com/pytorch/torchtune>`__ repository. For the
latest updates, you can use the main branch.

{% endif %}
{% endif %}
{% endfor %}
{% endfor %}

.. rubric:: Benchmarking examples

For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

.. _amd-pytorch-training-multinode-examples-v25.11:

Multi-node training
-------------------

Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.

Pre-training
~~~~~~~~~~~~

Multi-node training with torchtitan is supported. The provided Slurm script is pre-configured for Llama 3 70B.

To launch the training job on a Slurm cluster for Llama 3 70B, run the following commands from the MAD repository.

.. code-block:: shell

# In the MAD repository
cd scripts/pytorch_train
sbatch run_slurm_train.sh
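
Standard Slurm tooling applies once the job is submitted. For example, you can confirm the job is queued or running with:

.. code-block:: shell

# Check the status of your submitted training job
squeue -u $USER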

Fine-tuning
~~~~~~~~~~~

Multi-node training with torchtune is supported. The provided Slurm script is pre-configured for Llama 3.3 70B.

To launch the training job on a Slurm cluster for Llama 3.3 70B, run the following commands from the MAD repository.

.. code-block:: shell

huggingface-cli login # Get access to HF Llama model space
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
# In the MAD repository
cd scripts/pytorch_train
sbatch Torchtune_Multinode.sh

.. note::

Information regarding benchmark setup:

* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
* You can adjust the torchtune `YAML configuration file
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
if you're using a different model.
* The number of nodes and other parameters can be tuned in the Slurm script ``Torchtune_Multinode.sh``.
* Set the ``mounting_paths`` inside the Slurm script.

Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
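
For example, to see the newest logs first after a run completes (a minimal sketch, assuming the default output layout):

.. code-block:: shell

# List the fine-tuning log files, newest first
ls -lt result_torchtune/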

Further reading
===============

- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

Previous versions
=================

See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
@@ -47,7 +47,7 @@ Megatron-LM.
- {{ component_version }}
{% endfor %}

.. _amd-primus-megatron-lm-model-support-v26.01:
.. _amd-primus-megatron-lm-model-support-v25.11:

Supported models
================
@@ -108,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

.. _mi300x-amd-primus-megatron-lm-training-v26.01:
.. _mi300x-amd-primus-megatron-lm-training-v25.11:

Environment setup
=================
@@ -118,7 +118,7 @@ Environment setup
Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on AMD Instinct GPUs.

.. _amd-primus-megatron-lm-requirements-v26.01:
.. _amd-primus-megatron-lm-requirements-v25.11:

Pull the Docker image

@@ -157,16 +157,16 @@ Pull the Docker image
docker start primus_training_env
docker exec -it primus_training_env bash

The Docker container hosts verified commit ``9c529cd4`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/9c529cd4a934a68a880ede036c3e97b792e38167>`__ repository.
The Docker container hosts verified commit ``c4c083de`` of the `Primus
<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.

.. _amd-primus-megatron-lm-environment-setup-v26.01:
.. _amd-primus-megatron-lm-environment-setup-v25.11:

Configuration
=============

Primus defines a training configuration in YAML for each model in
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/9c529cd4a934a68a880ede036c3e97b792e38167/examples/megatron/configs>`__.
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/examples/megatron/configs>`__.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

@@ -207,7 +207,7 @@ You can use either mock data or real data for training.

Ensure that the files are accessible inside the Docker container.

.. _amd-primus-megatron-lm-tokenizer-v26.01:
.. _amd-primus-megatron-lm-tokenizer-v25.11:

Tokenizer
---------
@@ -220,7 +220,15 @@ right permissions to access the tokenizer for each model.
# Export your HF_TOKEN in the workspace
export HF_TOKEN=<your_hftoken>

.. _amd-primus-megatron-lm-run-training-v26.01:
.. note::

In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
``tokenizer_type: Llama3Tokenizer``, as defined in the `llama3.1-8B model
<https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
definition.
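
If you want to confirm that your token can reach a gated tokenizer before launching training, one optional, illustrative check is to fetch only the tokenizer files:

.. code-block:: shell

# Optional: download just the tokenizer files for the Llama 3.1 8B repository
huggingface-cli download meta-llama/Llama-3.1-8B --include "tokenizer*" --local-dir /tmp/llama3.1-tokenizer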
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-run-training-v25.11:
|
||||||
|
|
||||||
Run training
|
Run training
|
||||||
============
|
============
|
||||||
@@ -244,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
|
|
||||||
Once setup is complete, run the appropriate training command.
|
Once setup is complete, run the appropriate training command.
|
||||||
The following run commands are tailored to Llama 3.3 70B.
|
The following run commands are tailored to Llama 3.3 70B.
|
||||||
See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
|
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||||
|
|
||||||
To run pre-training for Llama 3.3 70B BF16, run:
|
To run pre-training for Llama 3.3 70B BF16, run:
|
||||||
|
|
||||||
@@ -255,10 +263,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
bash runner/primus-cli direct \
|
EXP=examples/megatron/configs/MI355X/llama3.3_70B-BF16-pretrain.yaml \
|
||||||
--log_file /tmp/primus_llama3.3_70B.log \
|
bash ./examples/run_pretrain.sh
|
||||||
-- train pretrain \
|
|
||||||
--config examples/megatron/configs/MI355X/llama3.3_70B-BF16-pretrain.yaml
|
|
||||||
|
|
||||||
.. tab-item:: MI300X
|
.. tab-item:: MI300X
|
||||||
:sync: MI325X and MI300X
|
:sync: MI325X and MI300X
|
||||||
@@ -270,16 +276,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||||
|
|
||||||
bash runner/primus-cli direct \
|
EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \
|
||||||
--log_file /tmp/primus_llama3.3_70B.log \
|
bash ./examples/run_pretrain.sh
|
||||||
-- train pretrain \
|
|
||||||
--config examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml
|
|
||||||
|
|
||||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
Once setup is complete, run the appropriate training command.
|
Once setup is complete, run the appropriate training command.
|
||||||
The following run commands are tailored to Llama 3.1 8B.
|
The following run commands are tailored to Llama 3.1 8B.
|
||||||
See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
|
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||||
|
|
||||||
To run pre-training for Llama 3.1 8B FP8, run:
|
To run pre-training for Llama 3.1 8B FP8, run:
|
||||||
|
|
||||||
@@ -290,10 +294,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
bash runner/primus-cli direct \
|
EXP=examples/megatron/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
|
||||||
--log_file /tmp/primus_llama3.1_8B_fp8.log \
|
bash ./examples/run_pretrain.sh
|
||||||
-- train pretrain \
|
|
||||||
--config examples/megatron/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml
|
|
||||||
|
|
||||||
.. tab-item:: MI300X
|
.. tab-item:: MI300X
|
||||||
:sync: MI325X and MI300X
|
:sync: MI325X and MI300X
|
||||||
@@ -305,10 +307,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
|||||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||||
|
|
||||||
bash runner/primus-cli direct \
|
EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||||
--log_file /tmp/primus_llama3.1_8B_fp8.log \
|
bash ./examples/run_pretrain.sh
|
||||||
-- train pretrain \
|
|
||||||
--config examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml
|
|
||||||
|
|
||||||
For Llama 3.1 8B BF16, use the following command:
|
For Llama 3.1 8B BF16, use the following command:
|
||||||
|
|
||||||
@@ -319,10 +319,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama3.1_BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -334,16 +332,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 3.1 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run pre-training for Llama 3.1 70B BF16, run:

@@ -354,10 +350,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -369,10 +363,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 To run the training on a single node for Llama 3.1 70B FP8, use the following command.

@@ -389,10 +381,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -404,10 +394,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+bash ./examples/run_pretrain.sh \
 --train_iters 50 \
 --num_layers 40 \
 --fp8 hybrid \
@@ -417,7 +405,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 2 7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run pre-training for Llama 2 7B FP8, run:

@@ -428,10 +416,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_7B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama2_7B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama2_7B-FP8-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -443,10 +429,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_7B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 To run pre-training for Llama 2 7B BF16, run:

@@ -457,10 +441,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_7B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama2_7B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama2_7B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -472,16 +454,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_7B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama2_7B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama2_7B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 2 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run pre-training for Llama 2 70B BF16, run:

@@ -492,10 +472,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_70B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/llama2_70B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/llama2_70B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -507,16 +485,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama2_70B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \
+bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to DeepSeek-V3.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
 use the following command:

@@ -528,10 +504,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml \
+EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh \
 --num_layers 3 \
 --moe_layer_freq 1 \
 --train_iters 50 \
@@ -548,21 +522,17 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/deepseek_v3-BF16-pretrain.yaml \
+EXP=examples/megatron/configs/MI300X/deepseek_v3-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh \
 --num_layers 3 \
 --moe_layer_freq 1 \
---micro_batch_size 3 \
---global_batch_size 192 \
 --train_iters 50

 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to DeepSeek-V2-Lite.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
 use the following command:

@@ -574,10 +544,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v2_lite.log \
--- train pretrain \
---config examples/megatron/configs//MI355X/deepseek_v2_lite-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -589,16 +557,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v2_lite.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Mixtral 8x7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
 use the following command:

@@ -610,10 +576,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_mixtral_8x7B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -625,16 +589,15 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_mixtral_8x7B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh \
+--train_iters 50

 .. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Mixtral 8x22B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
 use the following command:

@@ -646,10 +609,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_mixtral_8x22B_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X
@@ -661,21 +622,19 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_mixtral_8x22B_proxy.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
+EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh \
+--train_iters 50 \
 --num_layers 4 \
 --pipeline_model_parallel_size 1 \
 --micro_batch_size 1 \
---global_batch_size 16 \
---train_iters 50
+--global_batch_size 16

 .. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Qwen 2.5 7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run training on a single node for Qwen 2.5 7B BF16, use the following
 command:

@@ -687,10 +646,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_7B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/qwen2.5_7B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/qwen2.5_7B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -702,10 +659,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_7B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/qwen2.5_7B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/qwen2.5_7B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 For FP8, use the following command.

@@ -716,10 +671,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_7B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/qwen2.5_7B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/qwen2.5_7B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI300X
 :sync: MI325X and MI300X

@@ -731,16 +684,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_7B_fp8.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/qwen2.5_7B-FP8-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/qwen2.5_7B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Qwen 2.5 72B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To run the training on a single node for Qwen 2.5 72B BF16, use the following command.

@@ -751,10 +702,11 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_72B.log \
--- train pretrain \
---config examples/megatron/configs/MI355X/qwen2.5_72B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \
+bash examples/run_pretrain.sh \
+--train_iters 50 \
+--micro_batch_size 16 \
+--global_batch_size 256

 .. tab-item:: MI300X
 :sync: MI325X and MI300X
@@ -766,12 +718,10 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
 export NVTE_CK_IS_V3_ATOMIC_FP32=1

-bash runner/primus-cli direct \
---log_file /tmp/primus_qwen2.5_72B.log \
--- train pretrain \
---config examples/megatron/configs/MI300X/qwen2.5_72B-BF16-pretrain.yaml
+EXP=examples/megatron/configs/MI300X/qwen2.5_72B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

-.. _amd-primus-megatron-multi-node-examples-v26.01:
+.. _amd-primus-megatron-multi-node-examples-v25.11:

 Multi-node training examples
 ----------------------------

@@ -780,7 +730,7 @@ Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure y
 training.

 To run training on multiple nodes, you can use the
-`run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/9c529cd4a934a68a880ede036c3e97b792e38167/examples/run_slurm_pretrain.sh>`__
+`run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/main/examples/run_slurm_pretrain.sh>`__
 to launch the multi-node workload. Use the following steps to set up your environment:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
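(Editor's note: the docs don't spell out the launch invocation here. A minimal sketch, assuming ``run_slurm_pretrain.sh`` is submitted through Slurm's ``sbatch`` and that the node count is set with standard ``sbatch`` flags -- neither of which this diff confirms, so check the script header for its real parameters.)

.. code-block:: shell

   # Assumption: the script is sbatch-compatible; --nodes is a standard
   # Slurm flag, not something this diff confirms the script reads.
   sbatch --nodes=8 examples/run_slurm_pretrain.sh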
@@ -813,13 +763,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
 * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, since NICs vary across clusters, you are encouraged to explicitly export the NCCL parameters for your cluster (see the sketch after this hunk).
 * To find your network interface, you can use ``ip a``.
 * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
-* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v26.01`) as appropriate.
+* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v25.11`) as appropriate.

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 3.1 8B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 3.1 8B FP8 on 8 nodes, run:

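(Editor's note: a minimal sketch of pinning the NICs explicitly, per the bullets above. The interface and device names are placeholders -- substitute whatever ``ip a`` and ``ibv_devices`` report on your nodes.)

.. code-block:: shell

   ip a           # list socket interfaces, e.g. for NCCL_SOCKET_IFNAME
   ibv_devices    # list RDMA/IB devices, e.g. for NCCL_IB_HCA

   # Placeholder values -- replace with your cluster's actual devices.
   export NCCL_SOCKET_IFNAME=ens50f0
   export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7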
@@ -836,7 +786,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 2 7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 2 7B FP8 on 8 nodes, run:

@@ -853,7 +803,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 3.1 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 3.1 70B FP8 on 8 nodes, run:

@@ -883,7 +833,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 2 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 2 70B FP8 on 8 nodes, run:

@@ -913,7 +863,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Llama 3.3 70B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Llama 3.3 70B FP8 on 8 nodes, run:

@@ -943,7 +893,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Mixtral 8x7B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Mixtral 8x7B BF16 on 8 nodes, run:

@@ -961,7 +911,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

 Once setup is complete, run the appropriate training command.
 The following run commands are tailored to Qwen 2.5 72B.
-See :ref:`amd-primus-megatron-lm-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

 To train Qwen2.5 72B FP8 on 8 nodes, run:

@@ -976,7 +926,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
 --global_batch_size 512 \
 --recompute_num_layers 80 \

-.. _amd-primus-megatron-lm-benchmark-test-vars-v26.01:
+.. _amd-primus-megatron-lm-benchmark-test-vars-v25.11:

 Key options
 -----------
@@ -45,7 +45,7 @@ with Primus Turbo optimizations.
 - {{ component_version }}
 {% endfor %}

-.. _amd-primus-pytorch-model-support-v26.01:
+.. _amd-primus-pytorch-model-support-v25.11:

 Supported models
 ================

@@ -91,7 +91,7 @@ vary by model -- select one to get started.
 For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
 see the documentation :doc:`pytorch-training` (without Primus).

-.. _amd-primus-pytorch-performance-measurements-v26.01:
+.. _amd-primus-pytorch-performance-measurements-v25.11:

 System validation
 =================

@@ -146,7 +146,7 @@ tweak some configurations (such as batch sizes).
 .. container:: model-doc {{ model.mad_tag }}

 The following run command is tailored to {{ model.model }}.
-See :ref:`amd-primus-pytorch-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

 1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
 directory and install the required packages on the host machine (see the sketch below).
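(Editor's note: step 1 above elides the exact commands. A minimal sketch, assuming the usual layout of the MAD repository -- the repository URL comes from the docs, but the requirements file name is an assumption this diff doesn't confirm.)

.. code-block:: shell

   # Repository URL comes from the docs; requirements.txt is assumed.
   git clone https://github.com/ROCm/MAD.git
   cd MAD
   pip install -r requirements.txt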
@@ -184,7 +184,7 @@ tweak some configurations (such as batch sizes).
 .. container:: model-doc {{ model.mad_tag }}

 The following run commands are tailored to {{ model.model }}.
-See :ref:`amd-primus-pytorch-model-support-v26.01` to switch to another available model.
+See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

 .. rubric:: Download the Docker image and required packages

@@ -220,8 +220,8 @@ tweak some configurations (such as batch sizes).
 docker start training_env
 docker exec -it training_env bash

-The Docker container hosts verified commit ``9c529cd4`` of the `Primus
-<https://github.com/AMD-AGI/Primus/tree/9c529cd4a934a68a880ede036c3e97b792e38167/>`__ repository.
+The Docker container hosts verified commit ``c4c083de`` of the `Primus
+<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.

 .. rubric:: Prepare training datasets and dependencies

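(Editor's note: since this hunk bumps the pinned Primus commit from ``9c529cd4`` to ``c4c083de``, a quick way to confirm which commit your container actually carries -- assuming the checkout lives at ``/workspace/Primus``, as the Megatron guide's hunk headers suggest; adjust the path if your image differs.)

.. code-block:: shell

   # Path is an assumption; adjust if your image checks Primus out elsewhere.
   docker exec training_env bash -c 'git -C /workspace/Primus rev-parse --short HEAD'
   # Expect c4c083de for images matching the new docs.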
@@ -257,31 +257,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
---training.local_batch_size 6
+EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 6

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 To train Llama 3.1 8B with FP8 precision, use the following command.

@@ -292,31 +285,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
---training.local_batch_size 7
+EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 7

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_8B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_train_llama-3.1-70b

@@ -329,31 +315,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
---training.local_batch_size 6
+EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 6

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+bash examples/run_pretrain.sh

 To train Llama 3.1 70B with FP8 precision, use the following command.

@@ -364,31 +343,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
---training.local_batch_size 5
+EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 5

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_llama3.1_70B_fp8.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_train_deepseek-v3-16b

@@ -401,31 +373,24 @@ tweak some configurations (such as batch sizes).

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_16b.log \
--- train pretrain \
---config examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml
+EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
+bash examples/run_pretrain.sh

 .. tab-item:: MI325X
 :sync: MI325X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_16b.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
---training.local_batch_size 10
+EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+bash examples/run_pretrain.sh --training.local_batch_size 10

 .. tab-item:: MI300X
 :sync: MI300X

 .. code-block:: shell

-bash runner/primus-cli direct \
---log_file /tmp/primus_deepseek_v3_16b.log \
--- train pretrain \
---config examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml
+EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+bash examples/run_pretrain.sh

 {% endfor %}
 {% endfor %}

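(Editor's note: unlike the Megatron hunks earlier, the torchtitan replacements above pass overrides in torchtitan's dotted form after the script, e.g. ``--training.local_batch_size``. A minimal sketch combining two values taken verbatim from ``+`` lines above.)

.. code-block:: shell

   # Both the config path and the dotted override appear on "+" lines above.
   EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
   bash examples/run_pretrain.sh --training.local_batch_size 6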
@@ -43,7 +43,7 @@ training workloads:
 - {{ component_version }}
 {% endfor %}

-.. _amd-pytorch-training-model-support-v26.01:
+.. _amd-pytorch-training-model-support-v25.11:

 Supported models
 ================

@@ -85,7 +85,7 @@ one to get started.
 </div>
 </div>

-.. _amd-pytorch-training-supported-training-modes-v26.01:
+.. _amd-pytorch-training-supported-training-modes-v25.11:

 The following table lists supported training modes per model.

@@ -120,7 +120,7 @@ The following table lists supported training modes per model.
 unlisted fine-tuning methods by using an existing file in the
 ``/workspace/torchtune/recipes/configs`` directory as a template.

-.. _amd-pytorch-training-performance-measurements-v26.01:
+.. _amd-pytorch-training-performance-measurements-v25.11:

 Performance measurements
 ========================

@@ -176,7 +176,7 @@ Run training
 .. container:: model-doc {{ model.mad_tag }}

 The following run command is tailored to {{ model.model }}.
-See :ref:`amd-pytorch-training-model-support-v26.01` to switch to another available model.
+See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

 1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
 directory and install the required packages on the host machine.

@@ -214,7 +214,7 @@ Run training
 .. container:: model-doc {{ model.mad_tag }}

 The following commands are tailored to {{ model.model }}.
-See :ref:`amd-pytorch-training-model-support-v26.01` to switch to another available model.
+See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

 {% endfor %}
 {% endfor %}
@@ -409,10 +409,6 @@ Run training

 {% if model.mad_tag == "pyt_train_dlrm" %}

-.. note::
-
-   DLRM is supported on MI300X, MI325X, MI350X, and MI355X GPUs.
-
 1. Go to the DLRM directory.

 .. code-block:: shell
@@ -536,7 +532,7 @@ Run training

 To start the fine-tuning benchmark, use the following command with the
 appropriate options. See the following list of options and their descriptions.
-See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v26.01>`.
+See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v25.11>`.

 .. code-block:: shell

@@ -601,7 +597,7 @@ Run training

 For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

-.. _amd-pytorch-training-multinode-examples-v26.01:
+.. _amd-pytorch-training-multinode-examples-v25.11:

 Multi-node training
 -------------------
@@ -18,6 +18,7 @@
 (artificial-intelligence-apis)=

 * {doc}`Composable Kernel <composable_kernel:index>`
+* {doc}`hipDNN <hipdnn:index>`
 * {doc}`MIGraphX <amdmigraphx:index>`
 * {doc}`MIOpen <miopen:index>`
 * {doc}`MIVisionX <mivisionx:index>`

@@ -36,6 +36,7 @@ Machine Learning & Computer Vision
 :header: "Component", "Description"

 ":doc:`Composable Kernel <composable_kernel:index>`", "Provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures"
+":doc:`hipDNN <hipdnn:index>`", "A graph-based deep learning library that enables multi-operation fusion for improved performance on AMD GPUs."
 ":doc:`MIGraphX <amdmigraphx:index>`", "Graph inference engine that accelerates machine learning model inference"
 ":doc:`MIOpen <miopen:index>`", "An open source deep-learning library"
 ":doc:`MIVisionX <mivisionx:index>`", "Set of comprehensive computer vision and machine learning libraries, utilities, and applications"
Block a user